From 2c35a150be2f37973b5a404ef8177fa544804356 Mon Sep 17 00:00:00 2001 From: hoang Date: Mon, 13 Apr 2026 19:27:41 +0000 Subject: [PATCH 01/23] Add audio analysis support to app --- apps/README.md | 40 +++ apps/analysis.py | 7 +- apps/audio_plots.py | 639 ++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 + uv.lock | 215 +++++++++++++++ 5 files changed, 903 insertions(+), 1 deletion(-) create mode 100644 apps/audio_plots.py diff --git a/apps/README.md b/apps/README.md index 2da58a34..d5474bc0 100644 --- a/apps/README.md +++ b/apps/README.md @@ -45,3 +45,43 @@ EVA_OUTPUT_DIR=path/to/results streamlit run apps/analysis.py 3. **Run Selection** — Pick a run (with metadata summary) 4. **Record Selection** — Pick a record within the selected run 5. **Trial Selection** — If a record has multiple trials, pick one + +--- + +## Audio Analysis Tab + +The **Audio Analysis** tab in the Record Detail view renders an interactive Plotly figure built from the audio files and timestamp logs of a single trial. It is implemented in `apps/audio_plots.py`. + +### Subplots + +| Row | Content | Always shown | +|-----|---------|--------------| +| 1 | Mixed audio waveform, colour-coded by speaker | Yes | +| 2 | Mixed audio spectrogram | Optional (checkbox) | +| 3 | ElevenLabs audio waveform, colour-coded by speaker | Yes | +| 4 | ElevenLabs audio spectrogram | Optional (checkbox) | +| 5 | Speaker Turn Timeline with per-turn durations and pause markers | Yes | + +Toggle spectrograms on or off using the checkboxes above the chart. Results are cached per trial so switching between records is fast after the first load. + +### Colour Coding + +| Colour | Meaning | +|--------|---------| +| Blue | User speaker turn | +| Orange | Assistant speaker turn | +| Gray (semi-transparent line) | Silence — audio not covered by any speaker turn | +| Gray shaded box | Pause — gap between consecutive speaker turns | + +Colours are chosen for visibility in both Streamlit light and dark mode. 
+ +### Hover Tooltips + +Hovering over any waveform sample shows the **transcript text** for the active speaker turn, along with the turn start/end time and duration. Hovering over a pause region shows the pause duration and the from/to speakers. The timeline row shows the same transcript text when hovering over each bar. + +### Silence vs. Pause + +- **Pause** — derived from speaker turn event logs. The gap between one speaker's audio end event and the next speaker's audio start event: `pause = turns[i+1].start − turns[i].end`. Only recorded when `> 0`. +- **Silence** — derived from the waveform timeline. Any portion of the audio not covered by a speaker turn event (including audio before the first turn or after the last turn). + +A Pause always coincides with a Silence region, but Silence can be wider (e.g. leading/trailing audio with no events). diff --git a/apps/analysis.py b/apps/analysis.py index af4b1460..4b55390c 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -24,6 +24,7 @@ from eva.metrics.registry import get_global_registry from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, RecordMetrics +from apps.audio_plots import render_audio_analysis_tab # ============================================================================ # Configuration @@ -1897,12 +1898,13 @@ def main(): st.divider() # Tabs - tab1, tab2, tab3, tab4 = st.tabs( + tab1, tab2, tab3, tab4, tab5 = st.tabs( [ "Conversation Trace", "Transcript", "Metrics Detail", "Processed Data", + "Audio Analysis", ] ) @@ -1950,6 +1952,9 @@ def main(): with tab4: render_processed_data_tab(metrics) + with tab5: + render_audio_analysis_tab(selected_record_dir) + if __name__ == "__main__": main() diff --git a/apps/audio_plots.py b/apps/audio_plots.py new file mode 100644 index 00000000..ba6928a3 --- /dev/null +++ b/apps/audio_plots.py @@ -0,0 +1,639 @@ +""" +Interactive audio visualization for the EVA Streamlit app. 
+ +Adapted from EVA-Bench/downloads/plot_script/plot_timestamp.py. +Renders a Plotly figure directly into a Streamlit tab without writing files. + +Layout (dynamic — spectrograms are optional): + Row 1 : audio_mixed waveform, colour-coded by speaker turn + Row 2 (opt) : audio_mixed spectrogram + Row 3 : ElevenLabs waveform, colour-coded by speaker turn + Row 4 (opt) : ElevenLabs spectrogram + Row 5 : Speaker Turn Timeline +""" + +import json +import warnings +from pathlib import Path + +import numpy as np +import streamlit as st +from pydub import AudioSegment +import librosa +import plotly.graph_objects as go +from plotly.subplots import make_subplots + + +# ============================================================================= +# Colours — visible in both Streamlit light and dark mode +# ============================================================================= + +USER_COLOR = "#4A90D9" # mid-blue — clear on white & dark +ASST_COLOR = "#E8724A" # orange-red — clear on white & dark +GAP_COLOR = "rgba(140,140,140,0.55)" # neutral gray for silence gaps +USER_FILL = "rgba(74,144,217,0.22)" +ASST_FILL = "rgba(232,114,74,0.22)" +PAUSE_FILL = "rgba(140,140,140,0.18)" + + +# ============================================================================= +# Parsing / loading helpers +# ============================================================================= + +def _parse_elevenlabs_events(events_file: Path) -> dict: + events = [] + with open(events_file) as f: + for line in f: + if line.strip(): + events.append(json.loads(line)) + + audio_events = [e for e in events if e.get("event_type") in ("audio_start", "audio_end")] + audio_events.sort(key=lambda x: x.get("audio_timestamp", 0)) + + active_turns: dict = {} + turns: list = [] + for event in audio_events: + user = event.get("user") + etype = event.get("event_type") + ts = event.get("audio_timestamp") + if etype == "audio_start": + if user not in active_turns or active_turns[user].get("end") is not None: + 
active_turns[user] = {"user": user, "start": ts, "end": None} + elif etype == "audio_end": + if user in active_turns and active_turns[user].get("end") is None: + active_turns[user]["end"] = ts + active_turns[user]["duration"] = ts - active_turns[user]["start"] + turns.append(active_turns[user].copy()) + + turns.sort(key=lambda x: x["start"]) + return {"turns": turns} + + +def _calculate_pauses(turns: list) -> list: + pauses = [] + for i in range(len(turns) - 1): + cur, nxt = turns[i], turns[i + 1] + if cur["end"] and nxt["start"]: + gap = nxt["start"] - cur["end"] + if gap > 0: + pauses.append({ + "from_speaker": cur["user"], + "to_speaker": nxt["user"], + "start": cur["end"], + "end": nxt["start"], + "duration_seconds": gap, + }) + return pauses + + +def _parse_transcript(transcript_file: Path) -> dict: + result: dict = {"user": [], "assistant": []} + if not transcript_file or not transcript_file.exists(): + return result + with open(transcript_file) as f: + for line in f: + if line.strip(): + entry = json.loads(line) + role = entry.get("type", "") + content = entry.get("content", "") + if role == "user": + result["user"].append(content) + elif role == "assistant": + result["assistant"].append(content) + return result + + +def _load_pydub(path: Path) -> tuple: + seg = AudioSegment.from_file(str(path)) + if seg.channels > 1: + seg = seg.set_channels(1) + sr = seg.frame_rate + y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 + return y, sr + + +def _load_librosa(path: Path) -> tuple: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="PySoundFile failed") + warnings.filterwarnings("ignore", category=FutureWarning, message=".*audioread.*") + return librosa.load(str(path), sr=None, mono=True) + + +def _downsample(y: np.ndarray, sr: float, target_rate: int = 100) -> tuple: + duration = len(y) / sr + target = max(2, int(duration * target_rate)) + if len(y) > target: + step = max(1, len(y) // target) + y_ds = y[::step] + 
sr_ds = sr * len(y_ds) / len(y) + else: + y_ds, sr_ds = y, sr + return y_ds, sr_ds + + +def _wrap(text: str, width: int = 80) -> str: + words = text.split() + lines, current, length = [], [], 0 + for word in words: + if length + len(word) + 1 > width and current: + lines.append(" ".join(current)) + current, length = [word], len(word) + else: + current.append(word) + length += len(word) + 1 + if current: + lines.append(" ".join(current)) + return "
".join(lines) + + +# ============================================================================= +# Data preparation +# ============================================================================= + +def _prepare_data(record_dir: Path) -> dict: + audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + audio_el = record_dir / "elevenlabs_audio_recording.mp3" + events_file = record_dir / "elevenlabs_events.jsonl" + transcript = record_dir / "transcript.jsonl" + + # Turns / pauses + if events_file.exists(): + turns = _parse_elevenlabs_events(events_file)["turns"] + else: + turns = [] + pauses = _calculate_pauses(turns) + + start_time = min((t["start"] for t in turns), default=0) + turns_rel = [{ + "user": t["user"], + "start": t["start"] - start_time, + "end": (t["end"] - start_time) if t["end"] else None, + "duration": t.get("duration", (t["end"] - t["start"]) if t["end"] else 0), + } for t in turns] + pauses_rel = [{ + "from_speaker": p["from_speaker"], + "to_speaker": p["to_speaker"], + "start": p["start"] - start_time, + "end": p["end"] - start_time, + "duration_seconds": p["duration_seconds"], + } for p in pauses] + + transcript_map = _parse_transcript(transcript) + + # Mixed audio + y_mixed, sr_mixed, duration, mixed_loaded = None, None, 0.0, False + if audio_mixed.exists(): + try: + y_mixed, sr_mixed = _load_pydub(audio_mixed) + duration = len(y_mixed) / sr_mixed + mixed_loaded = True + except Exception: + pass + + plot_xlim = [0, max(duration, 1.0)] + + if mixed_loaded: + y_ds, _ = _downsample(y_mixed, sr_mixed) + t_mixed = np.linspace(0, duration, len(y_ds)) + else: + y_ds = np.array([]) + t_mixed = np.array([]) + + # ElevenLabs audio + el_y_ds, el_t, el_sr_ds, el_loaded = np.array([]), np.array([]), 1.0, False + el_spec = None + if audio_el.exists(): + try: + _el_y, _el_sr = _load_librosa(audio_el) + el_y_ds, _ = _downsample(_el_y, _el_sr) + el_sr_ds = _el_sr * len(el_y_ds) / len(_el_y) + el_t = np.linspace(0, len(_el_y) 
/ _el_sr, len(el_y_ds)) + el_loaded = True + D = librosa.amplitude_to_db( + np.abs(librosa.stft(_el_y, hop_length=512, n_fft=2048)), ref=np.max) + freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), + sr=int(_el_sr), hop_length=512) + el_spec = (D, freqs, times) + except Exception: + pass + + # Mixed spectrogram + mixed_spec = None + if mixed_loaded and len(y_ds) > 0: + try: + sr_ds = sr_mixed * len(y_ds) / len(y_mixed) + D = librosa.amplitude_to_db( + np.abs(librosa.stft(y_ds, hop_length=512, n_fft=2048)), ref=np.max) + freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), + sr=int(sr_ds), hop_length=512) + mixed_spec = (D, freqs, times) + except Exception: + pass + + return { + "duration": duration, + "plot_xlim": plot_xlim, + "mixed_loaded": mixed_loaded, + "y_ds": y_ds, + "t_mixed": t_mixed, + "el_loaded": el_loaded, + "el_y_ds": el_y_ds, + "el_t": el_t, + "el_sr_ds": el_sr_ds, + "mixed_spec": mixed_spec, + "el_spec": el_spec, + "turns_rel": turns_rel, + "pauses_rel": pauses_rel, + "transcript_map": transcript_map, + } + + +# ============================================================================= +# Plotly figure builder +# ============================================================================= + +def _build_figure(data: dict, + show_mixed_spec: bool = False, + show_el_spec: bool = False, + title_suffix: str = "") -> go.Figure: + + turns_rel = data["turns_rel"] + pauses_rel = data["pauses_rel"] + transcript_map = data["transcript_map"] + plot_xlim = data["plot_xlim"] + + # ------------------------------------------------------------------ # + # Dynamic row layout + # ------------------------------------------------------------------ # + row_keys: list[str] = ["mixed_waveform"] + if show_mixed_spec and data["mixed_spec"]: + row_keys.append("mixed_spec") + row_keys.append("el_waveform") + if show_el_spec and data["el_spec"]: + 
row_keys.append("el_spec") + row_keys.append("timeline") + + _titles = { + "mixed_waveform": "Waveform \u2014 audio_mixed.wav", + "mixed_spec": "Spectrogram \u2014 audio_mixed.wav", + "el_waveform": "Waveform \u2014 elevenlabs_audio_recording.mp3", + "el_spec": "Spectrogram \u2014 elevenlabs_audio_recording.mp3", + "timeline": "Speaker Turn Timeline", + } + _heights = { + "mixed_waveform": 1.5, + "mixed_spec": 1.3, + "el_waveform": 1.5, + "el_spec": 1.3, + "timeline": 1.5, + } + + n_rows = len(row_keys) + row_of = {k: i + 1 for i, k in enumerate(row_keys)} + row_heights = [_heights[k] for k in row_keys] + + fig = make_subplots( + rows=n_rows, cols=1, + shared_xaxes=True, + subplot_titles=[_titles[k] for k in row_keys], + row_heights=row_heights, + vertical_spacing=0.05, + ) + + fig.update_layout( + title=dict( + text=f"Speaker Turn Analysis \u2014 Pause Detection{title_suffix}", + font=dict(size=15), + ), + height=max(500, 320 * n_rows), + hovermode="closest", + legend=dict( + orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, + bordercolor="rgba(128,128,128,0.4)", borderwidth=1, + ), + ) + + # ------------------------------------------------------------------ # + # Centralised legend — one dummy trace per category, added once. + # All real traces use showlegend=False + legendgroup for toggling. 
+ # ------------------------------------------------------------------ # + for _name, _color, _symbol in [ + ("User", USER_COLOR, "square"), + ("Assistant", ASST_COLOR, "square"), + ("Silence", "rgba(140,140,140,0.55)", "square"), + ("Pause", "rgba(140,140,140,0.40)", "square-open"), + ]: + fig.add_trace(go.Scatter( + x=[None], y=[None], mode="markers", + marker=dict(color=_color, size=12, symbol=_symbol, + line=dict(color=_color, width=2)), + name=_name, legendgroup=_name, showlegend=True, + ), row=1, col=1) + + # ------------------------------------------------------------------ # + # Hover text — per-sample transcript strings + # ------------------------------------------------------------------ # + def _hover_texts(time_array: np.ndarray) -> list: + if len(time_array) == 0: + return [] + texts = np.full(len(time_array), "", dtype=object) + tc: dict = {"user": 0, "assistant": 0} + for turn in turns_rel: + if not turn["end"]: + continue + is_asst = turn["user"] == "pipecat_agent" + speaker = "Assistant" if is_asst else "User" + key = "assistant" if is_asst else "user" + tx_list = transcript_map[key] + text = tx_list[tc[key]] if tc[key] < len(tx_list) else "(no transcript)" + tc[key] += 1 + hover = (f"{speaker}
" + f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " + f"({turn['duration']:.1f}s)

" + f"{_wrap(text)}") + mask = (time_array >= turn["start"]) & (time_array <= turn["end"]) + texts[mask] = hover + for pause in pauses_rel: + hover = (f"Pause
" + f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" + f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" + f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}") + mask = (time_array >= pause["start"]) & (time_array <= pause["end"]) + texts[mask] = hover + return texts.tolist() + + # ------------------------------------------------------------------ # + # Colour-coded waveform — one Scatter trace per speaker segment + # ------------------------------------------------------------------ # + def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, + y_range: list) -> None: + """Split waveform into per-speaker segments and colour each differently.""" + if len(y) == 0: + fig.add_annotation( + text="No file available", xref="x domain", yref="y domain", + x=0.5, y=0.5, showarrow=False, font=dict(color="gray", size=11), + row=row, col=1) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) + return + + # Build ordered segment list: (t_start, t_end, label) + turn_segs = sorted( + [(tr["start"], tr["end"], + "asst" if tr["user"] == "pipecat_agent" else "user") + for tr in turns_rel if tr["end"]], + key=lambda s: s[0], + ) + segments: list[tuple] = [] + prev_end = 0.0 + for seg_s, seg_e, spk in turn_segs: + if seg_s > prev_end + 1e-3: + segments.append((prev_end, seg_s, "gap")) + segments.append((seg_s, seg_e, spk)) + prev_end = seg_e + duration = float(t[-1]) if len(t) > 0 else 0.0 + if prev_end < duration - 1e-3: + segments.append((prev_end, duration, "gap")) + + _color_map = {"user": USER_COLOR, "asst": ASST_COLOR, "gap": GAP_COLOR} + _name_map = {"user": "User", "asst": "Assistant", "gap": "Silence"} + + for seg_s, seg_e, spk in segments: + mask = (t >= seg_s) & (t <= seg_e) + if not mask.any(): + continue + name = _name_map[spk] + + fig.add_trace(go.Scatter( + x=t[mask].tolist(), y=y[mask].tolist(), + mode="lines", + line=dict(width=1.0, color=_color_map[spk]), + opacity=0.85 if spk != "gap" else 0.45, + name=name, legendgroup=name, showlegend=False, + text=_hover_texts(t[mask]), + hovertemplate="%{text}", + ), row=row, col=1) + 
+ # Pause vrects (visual only) + for pause in pauses_rel: + fig.add_vrect(x0=pause["start"], x1=pause["end"], + fillcolor=PAUSE_FILL, line_width=0, layer="below", + row=row, col=1) + + fig.update_yaxes(title_text="Amplitude", range=y_range, row=row, col=1) + + # ------------------------------------------------------------------ # + # Spectrogram row — heatmap + invisible transcript strip + # ------------------------------------------------------------------ # + def _spec_row(row: int, spec: tuple, label: str) -> None: + D, freqs, times = spec + + fig.add_trace(go.Heatmap( + z=D, x=times, y=freqs, + colorscale="Viridis", zmin=-80, zmax=0, + colorbar=dict(title="dB", thickness=12, len=0.12, x=1.01), + hovertemplate=( + "t=%{x:.2f}s freq=%{y:.0f}Hz %{z:.1f}dB" + "" + label + "" + ), + showscale=True, + ), row=row, col=1) + + # Transcript strip at freq_max for hover + strip_t = np.asarray(times, dtype=float) + freq_max = float(freqs[-1]) + fig.add_trace(go.Scatter( + x=strip_t.tolist(), y=[freq_max] * len(strip_t), + mode="markers", marker=dict(opacity=0, size=6), + showlegend=False, name="", + text=_hover_texts(strip_t), + hovertemplate="%{text}Transcript", + ), row=row, col=1) + + # Turn boundary vrects + for turn in turns_rel: + if not turn["end"]: + continue + color = ASST_FILL if turn["user"] == "pipecat_agent" else USER_FILL + fig.add_vrect(x0=turn["start"], x1=turn["end"], + fillcolor=color, line_width=0, layer="below", + row=row, col=1) + for pause in pauses_rel: + fig.add_vrect(x0=pause["start"], x1=pause["end"], + fillcolor=PAUSE_FILL, line_width=0, layer="below", + row=row, col=1) + + fig.update_yaxes(title_text="Freq (Hz)", row=row, col=1) + + def _no_file(row: int) -> None: + fig.add_annotation( + text="No file available", xref="x domain", yref="y domain", + x=0.5, y=0.5, showarrow=False, font=dict(color="gray", size=11), + row=row, col=1) + + # ---- Mixed waveform ---- + if data["mixed_loaded"] and len(data["y_ds"]) > 0: + y_range = 
[float(data["y_ds"].min() * 1.1), float(data["y_ds"].max() * 1.1)] + _colored_waveform(row_of["mixed_waveform"], data["y_ds"], data["t_mixed"], y_range) + else: + _no_file(row_of["mixed_waveform"]) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], + row=row_of["mixed_waveform"], col=1) + + # ---- Mixed spectrogram (optional) ---- + if "mixed_spec" in row_of: + if data["mixed_spec"]: + _spec_row(row_of["mixed_spec"], data["mixed_spec"], "Mixed Spec") + else: + _no_file(row_of["mixed_spec"]) + fig.update_yaxes(title_text="Freq (Hz)", row=row_of["mixed_spec"], col=1) + + # ---- ElevenLabs waveform ---- + if data["el_loaded"] and len(data["el_y_ds"]) > 0: + el_range = [float(data["el_y_ds"].min() * 1.1), float(data["el_y_ds"].max() * 1.1)] + _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range) + else: + _no_file(row_of["el_waveform"]) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], + row=row_of["el_waveform"], col=1) + + # ---- ElevenLabs spectrogram (optional) ---- + if "el_spec" in row_of: + if data["el_spec"]: + _spec_row(row_of["el_spec"], data["el_spec"], "EL Spec") + else: + _no_file(row_of["el_spec"]) + fig.update_yaxes(title_text="Freq (Hz)", row=row_of["el_spec"], col=1) + + # ---- Timeline ---- + tl: dict = {"user": 0, "assistant": 0} + tl_row = row_of["timeline"] + + for turn in turns_rel: + if not turn["end"]: + continue + is_asst = turn["user"] == "pipecat_agent" + speaker = "Assistant" if is_asst else "User" + y_pos = 2.0 if is_asst else 1.0 + bar_fill = "rgba(232,114,74,0.80)" if is_asst else "rgba(74,144,217,0.80)" + bar_line = "rgba(180,70,30,1)" if is_asst else "rgba(30,90,170,1)" + key = "assistant" if is_asst else "user" + + texts = transcript_map[key] + text = texts[tl[key]] if tl[key] < len(texts) else "(no transcript)" + tl[key] += 1 + + hover = (f"{speaker}
" + f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " + f"({turn['duration']:.1f}s)

{_wrap(text)}") + + # Visual bar (hoverinfo='skip' — corners are too sparse) + fig.add_trace(go.Scatter( + x=[turn["start"], turn["end"], turn["end"], turn["start"], turn["start"]], + y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], + fill="toself", fillcolor=bar_fill, line=dict(color=bar_line, width=1), + mode="lines", hoverinfo="skip", + name=speaker, legendgroup=speaker, showlegend=False, + ), row=tl_row, col=1) + + # Dense hover strip at bar midline (~2 pts/sec, min 5) + n_pts = max(5, int(turn["duration"] * 2)) + x_strip = np.linspace(turn["start"], turn["end"], n_pts).tolist() + fig.add_trace(go.Scatter( + x=x_strip, y=[y_pos] * n_pts, + mode="markers", marker=dict(opacity=0, size=10), + hovertext=hover, hoverinfo="text", + showlegend=False, name="", + ), row=tl_row, col=1) + + fig.add_annotation( + x=turn["start"] + turn["duration"] / 2, y=y_pos, + text=f"{turn['duration']:.1f}s", + showarrow=False, font=dict(size=8, color="white"), + xref=f"x{tl_row}", yref=f"y{tl_row}", + ) + + for pause in pauses_rel: + hover = (f"Pause
" + f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" + f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" + f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}") + + fig.add_trace(go.Scatter( + x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], + y=[1.15, 1.15, 1.85, 1.85, 1.15], + fill="toself", fillcolor="rgba(140,140,140,0.40)", + line=dict(color="rgba(180,60,60,0.8)", width=1, dash="dash"), + mode="lines", hoverinfo="skip", + name="Pause", legendgroup="Pause", showlegend=False, + ), row=tl_row, col=1) + + n_pts = max(5, int(pause["duration_seconds"] * 2)) + x_strip = np.linspace(pause["start"], pause["end"], n_pts).tolist() + fig.add_trace(go.Scatter( + x=x_strip, y=[1.5] * n_pts, + mode="markers", marker=dict(opacity=0, size=10), + hovertext=hover, hoverinfo="text", + showlegend=False, name="", + ), row=tl_row, col=1) + + fig.add_annotation( + x=pause["start"] + pause["duration_seconds"] / 2, y=1.5, + text=f"{pause['duration_seconds'] * 1000:.0f}ms", + showarrow=False, font=dict(size=7, color="dimgray"), + bgcolor="rgba(255,255,255,0.7)", + xref=f"x{tl_row}", yref=f"y{tl_row}", + ) + + fig.update_yaxes( + tickvals=[1, 2], ticktext=["User", "Assistant"], range=[0.5, 2.5], + title_text="Speaker", row=tl_row, col=1, + ) + fig.update_xaxes(title_text="Time (seconds)", row=tl_row, col=1) + + # Shared x-range + grid for all rows + for r in range(1, n_rows + 1): + fig.update_xaxes(range=plot_xlim, showgrid=True, + gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + fig.update_yaxes(showgrid=True, + gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + + return fig + + +# ============================================================================= +# Streamlit tab renderer +# ============================================================================= + +def render_audio_analysis_tab(record_dir: Path) -> None: + """Render the Audio Analysis tab for a given record / trial directory.""" + st.markdown("### Audio Analysis") + + events_file = record_dir / "elevenlabs_events.jsonl" + audio_mixed = 
next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + + if not events_file.exists() and not audio_mixed.exists(): + st.info("No audio files found in this record directory.") + return + + # Spectrogram toggles + col1, col2 = st.columns(2) + with col1: + show_mixed_spec = st.checkbox("Show Mixed Audio Spectrogram", value=False) + with col2: + show_el_spec = st.checkbox("Show ElevenLabs Spectrogram", value=False) + + @st.cache_data(show_spinner="Loading audio and building interactive plot\u2026") + def _cached(path_str: str, mixed_spec: bool, el_spec: bool) -> go.Figure: + return _build_figure( + _prepare_data(Path(path_str)), + show_mixed_spec=mixed_spec, + show_el_spec=el_spec, + ) + + try: + fig = _cached(str(record_dir), show_mixed_spec, show_el_spec) + st.plotly_chart(fig, use_container_width=True, theme="streamlit") + except Exception as exc: + st.error(f"Could not render audio plot: {exc}") diff --git a/pyproject.toml b/pyproject.toml index 78cf1fa8..b52b74b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,9 @@ apps = [ "plotly>=5.0", "streamlit>=1.51.0", "streamlit-diff-viewer>=0.0.2", + "librosa>=0.11", + "soundfile>=0.13", + "audioread>=3.1", ] [project.scripts] diff --git a/uv.lock b/uv.lock index a291ea54..f68a2ccc 100644 --- a/uv.lock +++ b/uv.lock @@ -290,6 +290,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/ca/baf2b9cc7e96c179bb4a54f30fcd83e6ecb340031bde68f486403f943768/audioop_lts-0.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:c174e322bb5783c099aaf87faeb240c8d210686b04bd61dfd05a8e5a83d88969", size = 24603, upload-time = "2025-08-05T16:42:57.571Z" }, ] +[[package]] +name = "audioread" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a1/4a/874ecf9b472f998130c2b5e145dcdb9f6131e84786111489103b66772143/audioread-3.1.0.tar.gz", hash = "sha256:1c4ab2f2972764c896a8ac61ac53e261c8d29f0c6ccd652f84e18f08a4cab190", size = 20082, upload-time = "2025-10-26T19:44:13.484Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/16/fbe8e1e185a45042f7cd3a282def5bb8d95bb69ab9e9ef6a5368aa17e426/audioread-3.1.0-py3-none-any.whl", hash = "sha256:b30d1df6c5d3de5dcef0fb0e256f6ea17bdcf5f979408df0297d8a408e2971b4", size = 23143, upload-time = "2025-10-26T19:44:12.016Z" }, +] + [[package]] name = "azure-cognitiveservices-speech" version = "1.48.2" @@ -670,6 +683,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686, upload-time = "2024-06-09T16:20:16.715Z" }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "deepgram-sdk" version = "3.11.0" @@ -790,8 +812,11 @@ dependencies = [ [package.optional-dependencies] apps = [ + { name = "audioread" }, + { name = "librosa" }, { name = "pandas" }, { name = "plotly" }, + { name = "soundfile" }, { name = "streamlit" }, { name = "streamlit-diff-viewer" }, ] 
@@ -810,6 +835,7 @@ requires-dist = [ { name = "aiofiles", specifier = ">=23.0" }, { name = "anthropic", specifier = ">=0.83.0" }, { name = "assemblyai", specifier = ">=0.17.0" }, + { name = "audioread", marker = "extra == 'apps'", specifier = ">=3.1" }, { name = "azure-cognitiveservices-speech", specifier = ">=1.31.0" }, { name = "cartesia", specifier = ">=1.0.0" }, { name = "deepgram-sdk", specifier = ">=3.5.0,<4.0.0" }, @@ -823,6 +849,7 @@ requires-dist = [ { name = "inflect", specifier = ">=7.0.0" }, { name = "jaconv", specifier = ">=0.3.0" }, { name = "jiwer", specifier = ">=3.0.0" }, + { name = "librosa", marker = "extra == 'apps'", specifier = ">=0.11" }, { name = "litellm", specifier = ">=1.30.0" }, { name = "more-itertools", specifier = ">=10.0.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.5" }, @@ -846,6 +873,7 @@ requires-dist = [ { name = "regex", specifier = ">=2023.0.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "setuptools", specifier = ">=65.0.0" }, + { name = "soundfile", marker = "extra == 'apps'", specifier = ">=0.13" }, { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.51.0" }, { name = "streamlit-diff-viewer", marker = "extra == 'apps'", specifier = ">=0.0.2" }, { name = "structlog", specifier = ">=23.0" }, @@ -1623,6 +1651,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "lazy-loader" +version = "0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = 
"sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, +] + +[[package]] +name = "librosa" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioread" }, + { name = "decorator" }, + { name = "joblib" }, + { name = "lazy-loader" }, + { name = "msgpack" }, + { name = "numba" }, + { name = "numpy" }, + { name = "pooch" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "soundfile" }, + { name = "soxr" }, + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/36/360b5aafa0238e29758729e9486c6ed92a6f37fa403b7875e06c115cdf4a/librosa-0.11.0.tar.gz", hash = "sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908", size = 327001, upload-time = "2025-03-11T15:09:54.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/ba/c63c5786dfee4c3417094c4b00966e61e4a63efecee22cb7b4c0387dda83/librosa-0.11.0-py3-none-any.whl", hash = "sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1", size = 260749, upload-time = "2025-03-11T15:09:52.982Z" }, +] + [[package]] name = "librt" version = "0.8.1" @@ -1841,6 +1907,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" 
}, ] +[[package]] +name = "msgpack" +version = "1.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/97/560d11202bcd537abca693fd85d81cebe2107ba17301de42b01ac1677b69/msgpack-1.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2e86a607e558d22985d856948c12a3fa7b42efad264dca8a3ebbcfa2735d786c", size = 82271, upload-time = "2025-10-08T09:14:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/83/04/28a41024ccbd67467380b6fb440ae916c1e4f25e2cd4c63abe6835ac566e/msgpack-1.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:283ae72fc89da59aa004ba147e8fc2f766647b1251500182fac0350d8af299c0", size = 84914, upload-time = "2025-10-08T09:14:50.958Z" }, + { url = "https://files.pythonhosted.org/packages/71/46/b817349db6886d79e57a966346cf0902a426375aadc1e8e7a86a75e22f19/msgpack-1.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:61c8aa3bd513d87c72ed0b37b53dd5c5a0f58f2ff9f26e1555d3bd7948fb7296", size = 416962, upload-time = "2025-10-08T09:14:51.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/e0/6cc2e852837cd6086fe7d8406af4294e66827a60a4cf60b86575a4a65ca8/msgpack-1.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:454e29e186285d2ebe65be34629fa0e8605202c60fbc7c4c650ccd41870896ef", size = 426183, upload-time = "2025-10-08T09:14:53.477Z" }, + { url = "https://files.pythonhosted.org/packages/25/98/6a19f030b3d2ea906696cedd1eb251708e50a5891d0978b012cb6107234c/msgpack-1.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7bc8813f88417599564fafa59fd6f95be417179f76b40325b500b3c98409757c", size = 
411454, upload-time = "2025-10-08T09:14:54.648Z" }, + { url = "https://files.pythonhosted.org/packages/b7/cd/9098fcb6adb32187a70b7ecaabf6339da50553351558f37600e53a4a2a23/msgpack-1.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bafca952dc13907bdfdedfc6a5f579bf4f292bdd506fadb38389afa3ac5b208e", size = 422341, upload-time = "2025-10-08T09:14:56.328Z" }, + { url = "https://files.pythonhosted.org/packages/e6/ae/270cecbcf36c1dc85ec086b33a51a4d7d08fc4f404bdbc15b582255d05ff/msgpack-1.1.2-cp311-cp311-win32.whl", hash = "sha256:602b6740e95ffc55bfb078172d279de3773d7b7db1f703b2f1323566b878b90e", size = 64747, upload-time = "2025-10-08T09:14:57.882Z" }, + { url = "https://files.pythonhosted.org/packages/2a/79/309d0e637f6f37e83c711f547308b91af02b72d2326ddd860b966080ef29/msgpack-1.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d198d275222dc54244bf3327eb8cbe00307d220241d9cec4d306d49a44e85f68", size = 71633, upload-time = "2025-10-08T09:14:59.177Z" }, + { url = "https://files.pythonhosted.org/packages/73/4d/7c4e2b3d9b1106cd0aa6cb56cc57c6267f59fa8bfab7d91df5adc802c847/msgpack-1.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:86f8136dfa5c116365a8a651a7d7484b65b13339731dd6faebb9a0242151c406", size = 64755, upload-time = "2025-10-08T09:15:00.48Z" }, + { url = "https://files.pythonhosted.org/packages/ad/bd/8b0d01c756203fbab65d265859749860682ccd2a59594609aeec3a144efa/msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa", size = 81939, upload-time = "2025-10-08T09:15:01.472Z" }, + { url = "https://files.pythonhosted.org/packages/34/68/ba4f155f793a74c1483d4bdef136e1023f7bcba557f0db4ef3db3c665cf1/msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb", size = 85064, upload-time = "2025-10-08T09:15:03.764Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" }, + { url = "https://files.pythonhosted.org/packages/65/92/a5100f7185a800a5d29f8d14041f61475b9de465ffcc0f3b9fba606e4505/msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42", size = 427556, upload-time = "2025-10-08T09:15:06.837Z" }, + { url = "https://files.pythonhosted.org/packages/f5/87/ffe21d1bf7d9991354ad93949286f643b2bb6ddbeab66373922b44c3b8cc/msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9", size = 404920, upload-time = "2025-10-08T09:15:08.179Z" }, + { url = "https://files.pythonhosted.org/packages/ff/41/8543ed2b8604f7c0d89ce066f42007faac1eaa7d79a81555f206a5cdb889/msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620", size = 415013, upload-time = "2025-10-08T09:15:09.83Z" }, + { url = "https://files.pythonhosted.org/packages/41/0d/2ddfaa8b7e1cee6c490d46cb0a39742b19e2481600a7a0e96537e9c22f43/msgpack-1.1.2-cp312-cp312-win32.whl", hash = "sha256:1fff3d825d7859ac888b0fbda39a42d59193543920eda9d9bea44d958a878029", size = 65096, upload-time = "2025-10-08T09:15:11.11Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ec/d431eb7941fb55a31dd6ca3404d41fbb52d99172df2e7707754488390910/msgpack-1.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1de460f0403172cff81169a30b9a92b260cb809c4cb7e2fc79ae8d0510c78b6b", size = 72708, upload-time = "2025-10-08T09:15:12.554Z" }, + { url = 
"https://files.pythonhosted.org/packages/c5/31/5b1a1f70eb0e87d1678e9624908f86317787b536060641d6798e3cf70ace/msgpack-1.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:be5980f3ee0e6bd44f3a9e9dea01054f175b50c3e6cdb692bc9424c0bbb8bf69", size = 64119, upload-time = "2025-10-08T09:15:13.589Z" }, + { url = "https://files.pythonhosted.org/packages/6b/31/b46518ecc604d7edf3a4f94cb3bf021fc62aa301f0cb849936968164ef23/msgpack-1.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4efd7b5979ccb539c221a4c4e16aac1a533efc97f3b759bb5a5ac9f6d10383bf", size = 81212, upload-time = "2025-10-08T09:15:14.552Z" }, + { url = "https://files.pythonhosted.org/packages/92/dc/c385f38f2c2433333345a82926c6bfa5ecfff3ef787201614317b58dd8be/msgpack-1.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42eefe2c3e2af97ed470eec850facbe1b5ad1d6eacdbadc42ec98e7dcf68b4b7", size = 84315, upload-time = "2025-10-08T09:15:15.543Z" }, + { url = "https://files.pythonhosted.org/packages/d3/68/93180dce57f684a61a88a45ed13047558ded2be46f03acb8dec6d7c513af/msgpack-1.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fdf7d83102bf09e7ce3357de96c59b627395352a4024f6e2458501f158bf999", size = 412721, upload-time = "2025-10-08T09:15:16.567Z" }, + { url = "https://files.pythonhosted.org/packages/5d/ba/459f18c16f2b3fc1a1ca871f72f07d70c07bf768ad0a507a698b8052ac58/msgpack-1.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fac4be746328f90caa3cd4bc67e6fe36ca2bf61d5c6eb6d895b6527e3f05071e", size = 424657, upload-time = "2025-10-08T09:15:17.825Z" }, + { url = "https://files.pythonhosted.org/packages/38/f8/4398c46863b093252fe67368b44edc6c13b17f4e6b0e4929dbf0bdb13f23/msgpack-1.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fffee09044073e69f2bad787071aeec727183e7580443dfeb8556cbf1978d162", size = 402668, upload-time = "2025-10-08T09:15:19.003Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/ce/698c1eff75626e4124b4d78e21cca0b4cc90043afb80a507626ea354ab52/msgpack-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5928604de9b032bc17f5099496417f113c45bc6bc21b5c6920caf34b3c428794", size = 419040, upload-time = "2025-10-08T09:15:20.183Z" }, + { url = "https://files.pythonhosted.org/packages/67/32/f3cd1667028424fa7001d82e10ee35386eea1408b93d399b09fb0aa7875f/msgpack-1.1.2-cp313-cp313-win32.whl", hash = "sha256:a7787d353595c7c7e145e2331abf8b7ff1e6673a6b974ded96e6d4ec09f00c8c", size = 65037, upload-time = "2025-10-08T09:15:21.416Z" }, + { url = "https://files.pythonhosted.org/packages/74/07/1ed8277f8653c40ebc65985180b007879f6a836c525b3885dcc6448ae6cb/msgpack-1.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:a465f0dceb8e13a487e54c07d04ae3ba131c7c5b95e2612596eafde1dccf64a9", size = 72631, upload-time = "2025-10-08T09:15:22.431Z" }, + { url = "https://files.pythonhosted.org/packages/e5/db/0314e4e2db56ebcf450f277904ffd84a7988b9e5da8d0d61ab2d057df2b6/msgpack-1.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:e69b39f8c0aa5ec24b57737ebee40be647035158f14ed4b40e6f150077e21a84", size = 64118, upload-time = "2025-10-08T09:15:23.402Z" }, +] + [[package]] name = "multidict" version = "6.7.1" @@ -2320,6 +2421,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "pooch" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "platformdirs" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/43/85ef45e8b36c6a48546af7b266592dc32d7f67837a6514d111bced6d7d75/pooch-1.9.0.tar.gz", hash = 
"sha256:de46729579b9857ffd3e741987a2f6d5e0e03219892c167c6578c0091fb511ed", size = 61788, upload-time = "2026-01-30T19:15:09.649Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl", hash = "sha256:f265597baa9f760d25ceb29d0beb8186c243d6607b0f60b83ecf14078dbc703b", size = 67175, upload-time = "2026-01-30T19:15:08.36Z" }, +] + [[package]] name = "pre-commit" version = "4.5.1" @@ -3115,6 +3230,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/92/53ea2181da8ac6bf27170191028aee7251f8f841f8d3edbfdcaf2008fde9/scikit_learn-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:146b4d36f800c013d267b29168813f7a03a43ecd2895d04861f1240b564421da", size = 8595835, upload-time = "2025-12-10T07:07:39.385Z" }, + { url = "https://files.pythonhosted.org/packages/01/18/d154dc1638803adf987910cdd07097d9c526663a55666a97c124d09fb96a/scikit_learn-1.8.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:f984ca4b14914e6b4094c5d52a32ea16b49832c03bd17a110f004db3c223e8e1", size = 8080381, upload-time = "2025-12-10T07:07:41.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/44/226142fcb7b7101e64fdee5f49dbe6288d4c7af8abf593237b70fca080a4/scikit_learn-1.8.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e30adb87f0cc81c7690a84f7932dd66be5bac57cfe16b91cb9151683a4a2d3b", size = 8799632, upload-time = "2025-12-10T07:07:43.899Z" }, + { url = "https://files.pythonhosted.org/packages/36/4d/4a67f30778a45d542bbea5db2dbfa1e9e100bf9ba64aefe34215ba9f11f6/scikit_learn-1.8.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ada8121bcb4dac28d930febc791a69f7cb1673c8495e5eee274190b73a4559c1", size = 9103788, upload-time = "2025-12-10T07:07:45.982Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/45c352094cfa60050bcbb967b1faf246b22e93cb459f2f907b600f2ceda5/scikit_learn-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:c57b1b610bd1f40ba43970e11ce62821c2e6569e4d74023db19c6b26f246cb3b", size = 8081706, upload-time = "2025-12-10T07:07:48.111Z" }, + { url = "https://files.pythonhosted.org/packages/3d/46/5416595bb395757f754feb20c3d776553a386b661658fb21b7c814e89efe/scikit_learn-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:2838551e011a64e3053ad7618dda9310175f7515f1742fa2d756f7c874c05961", size = 7688451, upload-time = "2025-12-10T07:07:49.873Z" }, + { url = "https://files.pythonhosted.org/packages/90/74/e6a7cc4b820e95cc38cf36cd74d5aa2b42e8ffc2d21fe5a9a9c45c1c7630/scikit_learn-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e", size = 8548242, upload-time = "2025-12-10T07:07:51.568Z" }, + { url = "https://files.pythonhosted.org/packages/49/d8/9be608c6024d021041c7f0b3928d4749a706f4e2c3832bbede4fb4f58c95/scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76", size = 8079075, upload-time = "2025-12-10T07:07:53.697Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/47/f187b4636ff80cc63f21cd40b7b2d177134acaa10f6bb73746130ee8c2e5/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4", size = 8660492, upload-time = "2025-12-10T07:07:55.574Z" }, + { url = "https://files.pythonhosted.org/packages/97/74/b7a304feb2b49df9fafa9382d4d09061a96ee9a9449a7cbea7988dda0828/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a", size = 8931904, upload-time = "2025-12-10T07:07:57.666Z" }, + { url = "https://files.pythonhosted.org/packages/9f/c4/0ab22726a04ede56f689476b760f98f8f46607caecff993017ac1b64aa5d/scikit_learn-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809", size = 8019359, upload-time = "2025-12-10T07:07:59.838Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/344a67811cfd561d7335c1b96ca21455e7e472d281c3c279c4d3f2300236/scikit_learn-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb", size = 7641898, upload-time = "2025-12-10T07:08:01.36Z" }, + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, +] + [[package]] name = "scipy" version = "1.17.1" @@ -3211,6 +3364,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "soundfile" +version = "0.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = 
"sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" }, + { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" }, + { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" }, + { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" }, + { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" }, + { url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = 
"2025-01-25T09:16:56.663Z" }, + { url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" }, +] + [[package]] name = "soxr" version = "0.5.0.post1" @@ -3232,6 +3404,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/10/440f1ba3d4955e0dc740bbe4ce8968c254a3d644d013eb75eea729becdb8/soxr-0.5.0.post1-cp312-abi3-win_amd64.whl", hash = "sha256:b1be9fee90afb38546bdbd7bde714d1d9a8c5a45137f97478a83b65e7f3146f6", size = 164937, upload-time = "2024-08-31T03:43:23.671Z" }, ] +[[package]] +name = "standard-aifc" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "standard-chunk", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/53/6050dc3dde1671eb3db592c13b55a8005e5040131f7509cef0215212cb84/standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43", size = 15240, upload-time = "2024-10-30T16:01:31.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/52/5fbb203394cc852334d1575cc020f6bcec768d2265355984dfd361968f36/standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66", size = 10492, upload-time = "2024-10-30T16:01:07.071Z" }, +] + +[[package]] +name = "standard-chunk" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/06/ce1bb165c1f111c7d23a1ad17204d67224baa69725bb6857a264db61beaf/standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654", size = 4672, upload-time = 
"2024-10-30T16:18:28.326Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/90/a5c1084d87767d787a6caba615aa50dc587229646308d9420c960cb5e4c0/standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c", size = 4944, upload-time = "2024-10-30T16:18:26.694Z" }, +] + +[[package]] +name = "standard-sunau" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/e3/ce8d38cb2d70e05ffeddc28bb09bad77cfef979eb0a299c9117f7ed4e6a9/standard_sunau-3.13.0.tar.gz", hash = "sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908", size = 9368, upload-time = "2024-10-30T16:01:41.626Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/ae/e3707f6c1bc6f7aa0df600ba8075bfb8a19252140cd595335be60e25f9ee/standard_sunau-3.13.0-py3-none-any.whl", hash = "sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622", size = 7364, upload-time = "2024-10-30T16:01:28.003Z" }, +] + [[package]] name = "starlette" version = "0.52.1" @@ -3316,6 +3522,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, ] +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + [[package]] name = "tiktoken" version = "0.12.0" From 63b694649fb43f27fd009594f6f62c8a085ad185 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Fri, 10 Apr 2026 16:45:11 -0400 Subject: [PATCH 02/23] Update Streamlit to 1.56 --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b52b74b1..be7c6d9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dev = [ apps = [ "pandas>=2.0", "plotly>=5.0", - "streamlit>=1.51.0", + "streamlit>=1.56.0", "streamlit-diff-viewer>=0.0.2", "librosa>=0.11", "soundfile>=0.13", diff --git a/uv.lock b/uv.lock index f68a2ccc..d6351fb3 100644 --- a/uv.lock +++ b/uv.lock @@ -874,7 +874,7 @@ requires-dist = [ { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "setuptools", specifier = ">=65.0.0" }, { name = "soundfile", marker = "extra == 'apps'", specifier = ">=0.13" }, - { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.51.0" }, + { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.56.0" }, { name = "streamlit-diff-viewer", marker = "extra == 'apps'", specifier = ">=0.0.2" }, { name = "structlog", specifier = ">=23.0" }, { name = "tqdm", specifier = ">=4.65" }, @@ -3453,7 +3453,7 @@ wheels = [ [[package]] name = "streamlit" -version = "1.54.0" +version = "1.56.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "altair" }, @@ -3475,9 +3475,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "watchdog", marker = "sys_platform != 'darwin'" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/be/66/d887ee80ea85f035baee607c60af024994e17ae9b921277fca9675e76ecf/streamlit-1.54.0.tar.gz", hash = "sha256:09965e6ae7eb0357091725de1ce2a3f7e4be155c2464c505c40a3da77ab69dd8", size = 8662292, upload-time = "2026-02-04T16:37:54.734Z" } +sdist = { url = "https://files.pythonhosted.org/packages/03/85/7c669b3a1336d34ef39fa9760fbd343185f3b15db2ad0838fd78423d1c7f/streamlit-1.56.0.tar.gz", hash = "sha256:1176acfa89ae1318b79078e8efe689a9d02e8d58e325c00fc0e55fa2f3fe8d6a", size = 8559239, upload-time = "2026-03-31T22:29:38.59Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/48/1d/40de1819374b4f0507411a60f4d2de0d620a9b10c817de5925799132b6c9/streamlit-1.54.0-py3-none-any.whl", hash = "sha256:a7b67d6293a9f5f6b4d4c7acdbc4980d7d9f049e78e404125022ecb1712f79fc", size = 9119730, upload-time = "2026-02-04T16:37:52.199Z" }, + { url = "https://files.pythonhosted.org/packages/e4/91/cb6f13a89e376ef179309d74f37a70ea0041d5e4b5ba5c4836dbf6e020ad/streamlit-1.56.0-py3-none-any.whl", hash = "sha256:8677a335734a30a51bc57ad0ec910e365d95f7c456fc02c60032927cd0729dc5", size = 9052089, upload-time = "2026-03-31T22:29:36.342Z" }, ] [[package]] From 0bb5ab70b6d5de3ca76b9cc5425e7b6d1db73570 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Fri, 10 Apr 2026 17:58:43 -0400 Subject: [PATCH 03/23] Bind query params with the new bind feature instead of manually --- apps/analysis.py | 97 ++++++++---------------------------------------- 1 file changed, 16 insertions(+), 81 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 4b55390c..afb846ea 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -876,45 +876,12 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): all_providers.update(_extract_providers(cfg)) all_types.add(_classify_pipeline_type(cfg)) - # Read defaults from query params - qp = st.query_params - default_models = [m for m in qp.get_all("model") if m in all_models] or sorted(all_models) 
- default_providers = [p for p in qp.get_all("provider") if p in all_providers] or sorted(all_providers) - default_types = [t for t in qp.get_all("type") if t in all_types] or sorted(all_types) - # Render filters - sel_types = st.multiselect("Pipeline Type", sorted(all_types), default=default_types) - sel_providers = st.multiselect("Provider", sorted(all_providers), default=default_providers) - sel_models = st.multiselect("Model", sorted(all_models), default=default_models) - - # Update query params only when they differ to avoid rerun loops - new_params: dict[str, str | list[str]] = { - "output_dir": output_dir_str or (str(run_dirs[0].parent) if run_dirs else ""), - "view": "Cross-Run Comparison", - } - if sel_models and set(sel_models) != all_models: - new_params["model"] = sel_models - if sel_providers and set(sel_providers) != all_providers: - new_params["provider"] = sel_providers - if sel_types and set(sel_types) != all_types: - new_params["type"] = sel_types - - # Compare with current params to avoid unnecessary rerun - current = dict(st.query_params) - needs_update = False - for k, v in new_params.items(): - cur_val = current.get(k) - if isinstance(v, list): - if sorted(v) != sorted(st.query_params.get_all(k)): - needs_update = True - break - elif cur_val != v: - needs_update = True - break - if set(current.keys()) != set(new_params.keys()): - needs_update = True - if needs_update: - st.query_params.from_dict(new_params) + sel_types = st.multiselect("Pipeline Type", sorted(all_types), default=all_types, key="type", bind="query-params") + sel_providers = st.multiselect( + "Provider", sorted(all_providers), default=all_providers, key="provider", bind="query-params" + ) + sel_models = st.multiselect("Model", sorted(all_models), default=all_models, key="model", bind="query-params") # Apply filters filtered_dirs = [ @@ -1750,12 +1717,9 @@ def main(): st.set_page_config(page_title="EVA Results Analysis", layout="wide") st.title("EVA Results Analysis") - 
query_params = st.query_params - - # Sidebar: output directory selection - st.sidebar.header("Output Directory") - default_output = query_params.get("output_dir", _DEFAULT_OUTPUT_DIR) - output_dir = Path(st.sidebar.text_input("Path to output directory", value=default_output)) + output_dir = Path( + st.sidebar.text_input("Output directory", value=_DEFAULT_OUTPUT_DIR, key="output_dir", bind="query-params") + ) run_dirs = get_run_directories(output_dir) @@ -1765,10 +1729,8 @@ def main(): # View mode st.sidebar.header("View") - default_view = query_params.get("view", "Cross-Run Comparison") view_options = ["Cross-Run Comparison", "Run Overview", "Record Detail"] - default_view_idx = view_options.index(default_view) if default_view in view_options else 0 - view_mode = st.sidebar.radio("View", view_options, index=default_view_idx, label_visibility="collapsed") + view_mode = st.sidebar.radio("View", view_options, key="view", label_visibility="collapsed", bind="query-params") if view_mode == "Cross-Run Comparison": render_cross_run_comparison([output_dir / d.name for d in run_dirs], str(output_dir)) @@ -1776,27 +1738,17 @@ def main(): # Sidebar: run selection st.sidebar.header("Run Selection") - run_dir_names = [d.name for d in run_dirs] - default_run_idx = 0 - if "run" in query_params and query_params["run"] in run_dir_names: - default_run_idx = run_dir_names.index(query_params["run"]) - selected_run_name = st.sidebar.selectbox("Select Run", run_dir_names, index=default_run_idx) - selected_run_dir = output_dir / selected_run_name + selected_run_dir = st.sidebar.selectbox( + "Select Run", run_dirs, format_func=lambda d: d.name, key="run", bind="query-params" + ) run_config = _load_run_config(selected_run_dir) if run_config: - _render_sidebar_run_metadata(selected_run_name, run_config) + _render_sidebar_run_metadata(selected_run_dir.name, run_config) else: - st.sidebar.info(f"**Run:** {selected_run_name}") + st.sidebar.info(f"**Run:** {selected_run_dir.name}") if view_mode 
== "Run Overview": - st.query_params.from_dict( - { - "output_dir": str(output_dir), - "view": "Run Overview", - "run": selected_run_name, - } - ) render_run_overview(selected_run_dir) return @@ -1810,10 +1762,7 @@ def main(): # Sidebar: record selection st.sidebar.header("Record Selection") record_names = [d.name for d in record_dirs] - default_record_idx = 0 - if "record" in query_params and query_params["record"] in record_names: - default_record_idx = record_names.index(query_params["record"]) - selected_record_name = st.sidebar.selectbox("Select Record", record_names, index=default_record_idx) + selected_record_name = st.sidebar.selectbox("Select Record", record_names, key="record", bind="query-params") selected_record_dir = selected_run_dir / "records" / selected_record_name # Detect trial subdirectories @@ -1833,23 +1782,9 @@ def main(): selected_trial = None if trial_dirs: trial_names = [d.name for d in trial_dirs] - default_trial_idx = 0 - if "trial" in query_params and query_params["trial"] in trial_names: - default_trial_idx = trial_names.index(query_params["trial"]) - selected_trial = st.sidebar.selectbox("Select Trial", trial_names, index=default_trial_idx) + selected_trial = st.sidebar.selectbox("Select Trial", trial_names, key="trial", bind="query-params") selected_record_dir = selected_record_dir / selected_trial - # Update query params for deep linking - new_params = { - "output_dir": str(output_dir), - "view": "Record Detail", - "run": selected_run_name, - "record": selected_record_name, - } - if selected_trial: - new_params["trial"] = selected_trial - st.query_params.from_dict(new_params) - # Load data result = load_record_result(selected_record_dir) metrics = load_record_metrics(selected_record_dir) From 573b5bac626519c0a7e576a7a7579fab6c740282 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Fri, 10 Apr 2026 18:50:42 -0400 Subject: [PATCH 04/23] Use pages instead of radio buttons --- apps/analysis.py | 67 
+++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index afb846ea..b198c05b 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -859,7 +859,7 @@ def _render_eva_scatter_plot(scatter_data: list[dict]): # ============================================================================ -def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): +def render_cross_run_comparison(run_dirs: list[Path]): """Render a comparison view across multiple runs.""" st.markdown("### Cross-Run Comparison") st.caption("Compare aggregate metrics across all runs that have metrics data.") @@ -1002,12 +1002,7 @@ def render_cross_run_comparison(run_dirs: list[Path], output_dir_str: str = ""): display_df = summary_df[display_cols].copy() # Add link column to navigate to Run Overview - output_dir_str = str(run_dirs[0].parent) if run_dirs else "" - display_df.insert( - 0, - "link", - summary_df["run"].apply(lambda r: f"?output_dir={output_dir_str}&view=Run+Overview&run={r}"), - ) + display_df.insert(0, "link", f"/run_overview?output_dir={run_dirs[0].parent}&run=" + summary_df["run"]) composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites} display_df = display_df.rename(columns={"label": "Run", "records": "# Records", **composite_rename, **col_rename}) @@ -1708,15 +1703,8 @@ def _render_sidebar_run_metadata(run_name: str, run_config: dict): st.sidebar.info("\n\n".join(metadata_parts)) -# ============================================================================ -# Main App -# ============================================================================ - - -def main(): - st.set_page_config(page_title="EVA Results Analysis", layout="wide") - st.title("EVA Results Analysis") - +def _get_run_dirs(): + """Get run directories, showing an error if none found.""" output_dir = Path( st.sidebar.text_input("Output directory", value=_DEFAULT_OUTPUT_DIR, 
key="output_dir", bind="query-params") ) @@ -1725,18 +1713,12 @@ def main(): if not run_dirs: st.error(f"No run directories found in {output_dir}") - return + st.stop() - # View mode - st.sidebar.header("View") - view_options = ["Cross-Run Comparison", "Run Overview", "Record Detail"] - view_mode = st.sidebar.radio("View", view_options, key="view", label_visibility="collapsed", bind="query-params") + return run_dirs - if view_mode == "Cross-Run Comparison": - render_cross_run_comparison([output_dir / d.name for d in run_dirs], str(output_dir)) - return - # Sidebar: run selection +def _select_run(run_dirs: list[Path]): st.sidebar.header("Run Selection") selected_run_dir = st.sidebar.selectbox( "Select Run", run_dirs, format_func=lambda d: d.name, key="run", bind="query-params" @@ -1748,11 +1730,10 @@ def main(): else: st.sidebar.info(f"**Run:** {selected_run_dir.name}") - if view_mode == "Run Overview": - render_run_overview(selected_run_dir) - return + return selected_run_dir - # Record detail view + +def render_record_detail(selected_run_dir: Path): record_dirs = get_record_directories(selected_run_dir) if not record_dirs: @@ -1891,5 +1872,33 @@ def main(): render_audio_analysis_tab(selected_record_dir) +# ============================================================================ +# Main App +# ============================================================================ + + +def cross_run_comparison(): + render_cross_run_comparison(_get_run_dirs()) + + +def run_overview(): + render_run_overview(_select_run(_get_run_dirs())) + + +def record_detail(): + render_record_detail(_select_run(_get_run_dirs())) + + +def main(): + st.set_page_config(page_title="EVA Results Analysis", layout="wide") + + pages = ( + st.Page(cross_run_comparison, title="Cross-Run Comparison", icon=":material/compare_arrows:"), + st.Page(run_overview, title="Run Overview", icon=":material/summarize:"), + st.Page(record_detail, title="Record Detail", icon=":material/article:"), + ) + 
st.navigation(pages).run() + + if __name__ == "__main__": main() From 70c3b88d0d05e4129465f9cbabde1258b7b585ef Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Fri, 10 Apr 2026 18:51:31 -0400 Subject: [PATCH 05/23] Add EVA favicon --- apps/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/analysis.py b/apps/analysis.py index b198c05b..fa34eced 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1890,7 +1890,7 @@ def record_detail(): def main(): - st.set_page_config(page_title="EVA Results Analysis", layout="wide") + st.set_page_config(page_title="EVA Results Analysis", layout="wide", page_icon="website/public/favicon.svg") pages = ( st.Page(cross_run_comparison, title="Cross-Run Comparison", icon=":material/compare_arrows:"), From 92b79c8ea9535c31173baa79b989dab7b779d3bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabrielle=20Gauthier=20Melanc=CC=A7on?= Date: Tue, 7 Apr 2026 17:44:07 -0400 Subject: [PATCH 06/23] Fix user turn not advancing after interruption --- src/eva/metrics/processor.py | 19 +++++--- tests/fixtures/processor_histories.json | 63 +++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py index 38fd6803..660e7ce9 100644 --- a/src/eva/metrics/processor.py +++ b/src/eva/metrics/processor.py @@ -89,16 +89,23 @@ class _TurnExtractionState: # user_speech lands at the same turn. rollback_advance_consumed_by_user: bool = False - def advance_turn_if_needed(self) -> None: + def advance_turn_if_needed(self, from_audio_start: bool = False) -> None: """Advance turn if the assistant responded since the last user event. Called on audio_start(elevenlabs_user) and audit_log/user events. - After an interruption, hold_turn consumes one advance without incrementing. 
+ After an interruption, hold_turn suppresses one advance from audit_log/user + (late STT from the interrupted session) but never blocks audio_start + (the user speaking again always starts a new turn). """ if self.hold_turn: - self.hold_turn = False - self.assistant_spoke_in_turn = False - return + if from_audio_start: + # New user speech — clear hold_turn but still advance + self.hold_turn = False + else: + # Late STT chunk from interrupted session — consume without advancing + self.hold_turn = False + self.assistant_spoke_in_turn = False + return if self.assistant_spoke_in_turn: self.turn_num += 1 self.assistant_spoke_in_turn = False @@ -326,7 +333,7 @@ def _handle_audio_start( state.assistant_spoke_in_turn = True state.pending_advance_after_rollback = False state.rollback_advance_consumed_by_user = False - state.advance_turn_if_needed() + state.advance_turn_if_needed(from_audio_start=True) # Mark the NEW turn (after advance) as a user-interrupted turn — the user's interrupting speech # lands here, symmetric with assistant_interrupted_turns. if state.pending_user_interrupts_label: diff --git a/tests/fixtures/processor_histories.json b/tests/fixtures/processor_histories.json index 10afc5cb..21133a90 100644 --- a/tests/fixtures/processor_histories.json +++ b/tests/fixtures/processor_histories.json @@ -980,5 +980,68 @@ "assistant_interrupted_turns": [], "user_interrupted_turns": [2] } + }, + { + "id": "assistant_interrupts_then_new_user_turn", + "description": "Based on record 5.1.3: Assistant audio_start overlaps with user audio by ~50ms (timing artifact, not a real barge-in). This triggers assistant_interrupted_turns and hold_turn. The user then speaks again in a NEW audio session. Tests that hold_turn does NOT prevent the new user audio_start from advancing the turn — user speaking again must always start a new turn.", + "history": [ + {"timestamp_ms": 1000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "Hello! 
How can I help you today?"}}, + {"timestamp_ms": 1100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, + {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you today?"}}}, + {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, + + {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.0}}, + {"timestamp_ms": 2100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "My flight got canceled and I want a full refund."}}}, + {"timestamp_ms": 2200, "source": "audit_log", "event_type": "user", "data": "My flight got cancelled and I want a full refund."}, + {"timestamp_ms": 2350, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 2.35}}, + {"timestamp_ms": 2400, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.4}}, + {"timestamp_ms": 2500, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I'm sorry to hear that. Could you provide your confirmation number?"}}, + {"timestamp_ms": 2550, "source": "audit_log", "event_type": "assistant", "data": "I'm sorry to hear that. Could you provide your confirmation number?"}, + {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.0}}, + {"timestamp_ms": 3100, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I'm sorry to hear that. 
Could you provide your confirmation number?"}}}, + + {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, + {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Confirmation code is Z5OROH and last name is White."}}}, + {"timestamp_ms": 4200, "source": "audit_log", "event_type": "user", "data": "Confirmation code is Z5OROH."}, + {"timestamp_ms": 4300, "source": "audit_log", "event_type": "user", "data": "And last name is White."}, + {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.5}}, + {"timestamp_ms": 4600, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "get_reservation", "parameters": {"confirmation_number": "Z5OROH", "last_name": "White"}}}, + {"timestamp_ms": 4700, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "get_reservation", "response": {"status": "success", "reservation": {"confirmation_number": "Z5OROH"}}}}, + {"timestamp_ms": 5000, "source": "pipecat", "event_type": "tts_text", "data": {"frame": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}}, + {"timestamp_ms": 5050, "source": "audit_log", "event_type": "assistant", "data": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}, + {"timestamp_ms": 5100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.1}}, + {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.5}}, + {"timestamp_ms": 5600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. 
Shall I proceed?"}}}, + + {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.0}}, + {"timestamp_ms": 6100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Yes, go ahead."}}}, + {"timestamp_ms": 6200, "source": "audit_log", "event_type": "user", "data": "Yes, go ahead."}, + {"timestamp_ms": 6500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.5}}, + {"timestamp_ms": 7000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} + ], + "expected": { + "transcribed_assistant_turns": {"0": "Hello, how can I help you today?", "1": "[assistant interrupts] I'm sorry to hear that. Could you provide your confirmation number?", "2": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}, + "transcribed_user_turns": {"1": "My flight got cancelled and I want a full refund.", "2": "Confirmation code is Z5OROH. And last name is White.", "3": "Yes, go ahead."}, + "intended_assistant_turns": {"0": "Hello! How can I help you today?", "1": "[assistant interrupts] I'm sorry to hear that. Could you provide your confirmation number?", "2": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}, + "intended_user_turns": {"1": "My flight got canceled and I want a full refund.", "2": "Confirmation code is Z5OROH and last name is White.", "3": "Yes, go ahead."}, + "audio_timestamps_assistant_turns": {"0": [[1.0, 1.8]], "1": [[2.35, 3.0]], "2": [[5.1, 5.5]]}, + "audio_timestamps_user_turns": {"1": [[2.0, 2.4]], "2": [[4.0, 4.5]], "3": [[6.0, 6.5]]}, + "num_assistant_turns": 3, + "num_user_turns": 3, + "num_tool_calls": 1, + "tool_called": ["get_reservation"], + "conversation_trace": [ + {"role": "assistant", "content": "Hello! 
How can I help you today?", "type": "intended", "turn_id": 0}, + {"role": "user", "content": "My flight got cancelled and I want a full refund. [likely cut off by assistant]", "type": "transcribed", "turn_id": 1}, + {"role": "assistant", "content": "[assistant interrupts] I'm sorry to hear that. Could you provide your confirmation number?", "type": "intended", "turn_id": 1}, + {"role": "user", "content": "Confirmation code is Z5OROH. And last name is White.", "type": "transcribed", "turn_id": 2}, + {"tool_name": "get_reservation", "parameters": {"confirmation_number": "Z5OROH", "last_name": "White"}, "type": "tool_call", "turn_id": 2}, + {"tool_name": "get_reservation", "tool_response": {"status": "success", "reservation": {"confirmation_number": "Z5OROH"}}, "type": "tool_response", "turn_id": 2}, + {"role": "assistant", "content": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?", "type": "intended", "turn_id": 2}, + {"role": "user", "content": "Yes, go ahead.", "type": "transcribed", "turn_id": 3} + ], + "assistant_interrupted_turns": [1], + "user_interrupted_turns": [] + } } ] From 615a52362140c949f656c59053a7fc4906fa11bb Mon Sep 17 00:00:00 2001 From: hoang Date: Mon, 13 Apr 2026 19:27:41 +0000 Subject: [PATCH 07/23] Add audio analysis support to app --- apps/README.md | 40 +++ apps/analysis.py | 7 +- apps/audio_plots.py | 639 ++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 + uv.lock | 217 ++++++++++++++- 5 files changed, 904 insertions(+), 2 deletions(-) create mode 100644 apps/audio_plots.py diff --git a/apps/README.md b/apps/README.md index 2da58a34..d5474bc0 100644 --- a/apps/README.md +++ b/apps/README.md @@ -45,3 +45,43 @@ EVA_OUTPUT_DIR=path/to/results streamlit run apps/analysis.py 3. **Run Selection** — Pick a run (with metadata summary) 4. **Record Selection** — Pick a record within the selected run 5. 
**Trial Selection** — If a record has multiple trials, pick one + +--- + +## Audio Analysis Tab + +The **Audio Analysis** tab in the Record Detail view renders an interactive Plotly figure built from the audio files and timestamp logs of a single trial. It is implemented in `apps/audio_plots.py`. + +### Subplots + +| Row | Content | Always shown | +|-----|---------|--------------| +| 1 | Mixed audio waveform, colour-coded by speaker | Yes | +| 2 | Mixed audio spectrogram | Optional (checkbox) | +| 3 | ElevenLabs audio waveform, colour-coded by speaker | Yes | +| 4 | ElevenLabs audio spectrogram | Optional (checkbox) | +| 5 | Speaker Turn Timeline with per-turn durations and pause markers | Yes | + +Toggle spectrograms on or off using the checkboxes above the chart. Results are cached per trial so switching between records is fast after the first load. + +### Colour Coding + +| Colour | Meaning | +|--------|---------| +| Blue | User speaker turn | +| Orange | Assistant speaker turn | +| Gray (semi-transparent line) | Silence — audio not covered by any speaker turn | +| Gray shaded box | Pause — gap between consecutive speaker turns | + +Colours are chosen for visibility in both Streamlit light and dark mode. + +### Hover Tooltips + +Hovering over any waveform sample shows the **transcript text** for the active speaker turn, along with the turn start/end time and duration. Hovering over a pause region shows the pause duration and the from/to speakers. The timeline row shows the same transcript text when hovering over each bar. + +### Silence vs. Pause + +- **Pause** — derived from speaker turn event logs. The gap between one speaker's audio end event and the next speaker's audio start event: `pause = turns[i+1].start − turns[i].end`. Only recorded when `> 0`. +- **Silence** — derived from the waveform timeline. Any portion of the audio not covered by a speaker turn event (including audio before the first turn or after the last turn). 
+ +A Pause always coincides with a Silence region, but Silence can be wider (e.g. leading/trailing audio with no events). diff --git a/apps/analysis.py b/apps/analysis.py index 4e651752..fa34eced 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -24,6 +24,7 @@ from eva.metrics.registry import get_global_registry from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, RecordMetrics +from apps.audio_plots import render_audio_analysis_tab # ============================================================================ # Configuration @@ -1813,12 +1814,13 @@ def render_record_detail(selected_run_dir: Path): st.divider() # Tabs - tab1, tab2, tab3, tab4 = st.tabs( + tab1, tab2, tab3, tab4, tab5 = st.tabs( [ "Conversation Trace", "Transcript", "Metrics Detail", "Processed Data", + "Audio Analysis", ] ) @@ -1866,6 +1868,9 @@ def render_record_detail(selected_run_dir: Path): with tab4: render_processed_data_tab(metrics) + with tab5: + render_audio_analysis_tab(selected_record_dir) + # ============================================================================ # Main App diff --git a/apps/audio_plots.py b/apps/audio_plots.py new file mode 100644 index 00000000..ba6928a3 --- /dev/null +++ b/apps/audio_plots.py @@ -0,0 +1,639 @@ +""" +Interactive audio visualization for the EVA Streamlit app. + +Adapted from EVA-Bench/downloads/plot_script/plot_timestamp.py. +Renders a Plotly figure directly into a Streamlit tab without writing files. 
+ +Layout (dynamic — spectrograms are optional): + Row 1 : audio_mixed waveform, colour-coded by speaker turn + Row 2 (opt) : audio_mixed spectrogram + Row 3 : ElevenLabs waveform, colour-coded by speaker turn + Row 4 (opt) : ElevenLabs spectrogram + Row 5 : Speaker Turn Timeline +""" + +import json +import warnings +from pathlib import Path + +import numpy as np +import streamlit as st +from pydub import AudioSegment +import librosa +import plotly.graph_objects as go +from plotly.subplots import make_subplots + + +# ============================================================================= +# Colours — visible in both Streamlit light and dark mode +# ============================================================================= + +USER_COLOR = "#4A90D9" # mid-blue — clear on white & dark +ASST_COLOR = "#E8724A" # orange-red — clear on white & dark +GAP_COLOR = "rgba(140,140,140,0.55)" # neutral gray for silence gaps +USER_FILL = "rgba(74,144,217,0.22)" +ASST_FILL = "rgba(232,114,74,0.22)" +PAUSE_FILL = "rgba(140,140,140,0.18)" + + +# ============================================================================= +# Parsing / loading helpers +# ============================================================================= + +def _parse_elevenlabs_events(events_file: Path) -> dict: + events = [] + with open(events_file) as f: + for line in f: + if line.strip(): + events.append(json.loads(line)) + + audio_events = [e for e in events if e.get("event_type") in ("audio_start", "audio_end")] + audio_events.sort(key=lambda x: x.get("audio_timestamp", 0)) + + active_turns: dict = {} + turns: list = [] + for event in audio_events: + user = event.get("user") + etype = event.get("event_type") + ts = event.get("audio_timestamp") + if etype == "audio_start": + if user not in active_turns or active_turns[user].get("end") is not None: + active_turns[user] = {"user": user, "start": ts, "end": None} + elif etype == "audio_end": + if user in active_turns and 
active_turns[user].get("end") is None: + active_turns[user]["end"] = ts + active_turns[user]["duration"] = ts - active_turns[user]["start"] + turns.append(active_turns[user].copy()) + + turns.sort(key=lambda x: x["start"]) + return {"turns": turns} + + +def _calculate_pauses(turns: list) -> list: + pauses = [] + for i in range(len(turns) - 1): + cur, nxt = turns[i], turns[i + 1] + if cur["end"] and nxt["start"]: + gap = nxt["start"] - cur["end"] + if gap > 0: + pauses.append({ + "from_speaker": cur["user"], + "to_speaker": nxt["user"], + "start": cur["end"], + "end": nxt["start"], + "duration_seconds": gap, + }) + return pauses + + +def _parse_transcript(transcript_file: Path) -> dict: + result: dict = {"user": [], "assistant": []} + if not transcript_file or not transcript_file.exists(): + return result + with open(transcript_file) as f: + for line in f: + if line.strip(): + entry = json.loads(line) + role = entry.get("type", "") + content = entry.get("content", "") + if role == "user": + result["user"].append(content) + elif role == "assistant": + result["assistant"].append(content) + return result + + +def _load_pydub(path: Path) -> tuple: + seg = AudioSegment.from_file(str(path)) + if seg.channels > 1: + seg = seg.set_channels(1) + sr = seg.frame_rate + y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 + return y, sr + + +def _load_librosa(path: Path) -> tuple: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="PySoundFile failed") + warnings.filterwarnings("ignore", category=FutureWarning, message=".*audioread.*") + return librosa.load(str(path), sr=None, mono=True) + + +def _downsample(y: np.ndarray, sr: float, target_rate: int = 100) -> tuple: + duration = len(y) / sr + target = max(2, int(duration * target_rate)) + if len(y) > target: + step = max(1, len(y) // target) + y_ds = y[::step] + sr_ds = sr * len(y_ds) / len(y) + else: + y_ds, sr_ds = y, sr + return y_ds, sr_ds + + +def _wrap(text: str, width: int = 
80) -> str: + words = text.split() + lines, current, length = [], [], 0 + for word in words: + if length + len(word) + 1 > width and current: + lines.append(" ".join(current)) + current, length = [word], len(word) + else: + current.append(word) + length += len(word) + 1 + if current: + lines.append(" ".join(current)) + return "
".join(lines) + + +# ============================================================================= +# Data preparation +# ============================================================================= + +def _prepare_data(record_dir: Path) -> dict: + audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + audio_el = record_dir / "elevenlabs_audio_recording.mp3" + events_file = record_dir / "elevenlabs_events.jsonl" + transcript = record_dir / "transcript.jsonl" + + # Turns / pauses + if events_file.exists(): + turns = _parse_elevenlabs_events(events_file)["turns"] + else: + turns = [] + pauses = _calculate_pauses(turns) + + start_time = min((t["start"] for t in turns), default=0) + turns_rel = [{ + "user": t["user"], + "start": t["start"] - start_time, + "end": (t["end"] - start_time) if t["end"] else None, + "duration": t.get("duration", (t["end"] - t["start"]) if t["end"] else 0), + } for t in turns] + pauses_rel = [{ + "from_speaker": p["from_speaker"], + "to_speaker": p["to_speaker"], + "start": p["start"] - start_time, + "end": p["end"] - start_time, + "duration_seconds": p["duration_seconds"], + } for p in pauses] + + transcript_map = _parse_transcript(transcript) + + # Mixed audio + y_mixed, sr_mixed, duration, mixed_loaded = None, None, 0.0, False + if audio_mixed.exists(): + try: + y_mixed, sr_mixed = _load_pydub(audio_mixed) + duration = len(y_mixed) / sr_mixed + mixed_loaded = True + except Exception: + pass + + plot_xlim = [0, max(duration, 1.0)] + + if mixed_loaded: + y_ds, _ = _downsample(y_mixed, sr_mixed) + t_mixed = np.linspace(0, duration, len(y_ds)) + else: + y_ds = np.array([]) + t_mixed = np.array([]) + + # ElevenLabs audio + el_y_ds, el_t, el_sr_ds, el_loaded = np.array([]), np.array([]), 1.0, False + el_spec = None + if audio_el.exists(): + try: + _el_y, _el_sr = _load_librosa(audio_el) + el_y_ds, _ = _downsample(_el_y, _el_sr) + el_sr_ds = _el_sr * len(el_y_ds) / len(_el_y) + el_t = np.linspace(0, len(_el_y) 
/ _el_sr, len(el_y_ds)) + el_loaded = True + D = librosa.amplitude_to_db( + np.abs(librosa.stft(_el_y, hop_length=512, n_fft=2048)), ref=np.max) + freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), + sr=int(_el_sr), hop_length=512) + el_spec = (D, freqs, times) + except Exception: + pass + + # Mixed spectrogram + mixed_spec = None + if mixed_loaded and len(y_ds) > 0: + try: + sr_ds = sr_mixed * len(y_ds) / len(y_mixed) + D = librosa.amplitude_to_db( + np.abs(librosa.stft(y_ds, hop_length=512, n_fft=2048)), ref=np.max) + freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), + sr=int(sr_ds), hop_length=512) + mixed_spec = (D, freqs, times) + except Exception: + pass + + return { + "duration": duration, + "plot_xlim": plot_xlim, + "mixed_loaded": mixed_loaded, + "y_ds": y_ds, + "t_mixed": t_mixed, + "el_loaded": el_loaded, + "el_y_ds": el_y_ds, + "el_t": el_t, + "el_sr_ds": el_sr_ds, + "mixed_spec": mixed_spec, + "el_spec": el_spec, + "turns_rel": turns_rel, + "pauses_rel": pauses_rel, + "transcript_map": transcript_map, + } + + +# ============================================================================= +# Plotly figure builder +# ============================================================================= + +def _build_figure(data: dict, + show_mixed_spec: bool = False, + show_el_spec: bool = False, + title_suffix: str = "") -> go.Figure: + + turns_rel = data["turns_rel"] + pauses_rel = data["pauses_rel"] + transcript_map = data["transcript_map"] + plot_xlim = data["plot_xlim"] + + # ------------------------------------------------------------------ # + # Dynamic row layout + # ------------------------------------------------------------------ # + row_keys: list[str] = ["mixed_waveform"] + if show_mixed_spec and data["mixed_spec"]: + row_keys.append("mixed_spec") + row_keys.append("el_waveform") + if show_el_spec and data["el_spec"]: + 
row_keys.append("el_spec") + row_keys.append("timeline") + + _titles = { + "mixed_waveform": "Waveform \u2014 audio_mixed.wav", + "mixed_spec": "Spectrogram \u2014 audio_mixed.wav", + "el_waveform": "Waveform \u2014 elevenlabs_audio_recording.mp3", + "el_spec": "Spectrogram \u2014 elevenlabs_audio_recording.mp3", + "timeline": "Speaker Turn Timeline", + } + _heights = { + "mixed_waveform": 1.5, + "mixed_spec": 1.3, + "el_waveform": 1.5, + "el_spec": 1.3, + "timeline": 1.5, + } + + n_rows = len(row_keys) + row_of = {k: i + 1 for i, k in enumerate(row_keys)} + row_heights = [_heights[k] for k in row_keys] + + fig = make_subplots( + rows=n_rows, cols=1, + shared_xaxes=True, + subplot_titles=[_titles[k] for k in row_keys], + row_heights=row_heights, + vertical_spacing=0.05, + ) + + fig.update_layout( + title=dict( + text=f"Speaker Turn Analysis \u2014 Pause Detection{title_suffix}", + font=dict(size=15), + ), + height=max(500, 320 * n_rows), + hovermode="closest", + legend=dict( + orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, + bordercolor="rgba(128,128,128,0.4)", borderwidth=1, + ), + ) + + # ------------------------------------------------------------------ # + # Centralised legend — one dummy trace per category, added once. + # All real traces use showlegend=False + legendgroup for toggling. 
+ # ------------------------------------------------------------------ # + for _name, _color, _symbol in [ + ("User", USER_COLOR, "square"), + ("Assistant", ASST_COLOR, "square"), + ("Silence", "rgba(140,140,140,0.55)", "square"), + ("Pause", "rgba(140,140,140,0.40)", "square-open"), + ]: + fig.add_trace(go.Scatter( + x=[None], y=[None], mode="markers", + marker=dict(color=_color, size=12, symbol=_symbol, + line=dict(color=_color, width=2)), + name=_name, legendgroup=_name, showlegend=True, + ), row=1, col=1) + + # ------------------------------------------------------------------ # + # Hover text — per-sample transcript strings + # ------------------------------------------------------------------ # + def _hover_texts(time_array: np.ndarray) -> list: + if len(time_array) == 0: + return [] + texts = np.full(len(time_array), "", dtype=object) + tc: dict = {"user": 0, "assistant": 0} + for turn in turns_rel: + if not turn["end"]: + continue + is_asst = turn["user"] == "pipecat_agent" + speaker = "Assistant" if is_asst else "User" + key = "assistant" if is_asst else "user" + tx_list = transcript_map[key] + text = tx_list[tc[key]] if tc[key] < len(tx_list) else "(no transcript)" + tc[key] += 1 + hover = (f"{speaker}
" + f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " + f"({turn['duration']:.1f}s)

" + f"{_wrap(text)}") + mask = (time_array >= turn["start"]) & (time_array <= turn["end"]) + texts[mask] = hover + for pause in pauses_rel: + hover = (f"Pause
" + f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" + f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" + f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}") + mask = (time_array >= pause["start"]) & (time_array <= pause["end"]) + texts[mask] = hover + return texts.tolist() + + # ------------------------------------------------------------------ # + # Colour-coded waveform — one Scatter trace per speaker segment + # ------------------------------------------------------------------ # + def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, + y_range: list) -> None: + """Split waveform into per-speaker segments and colour each differently.""" + if len(y) == 0: + fig.add_annotation( + text="No file available", xref="x domain", yref="y domain", + x=0.5, y=0.5, showarrow=False, font=dict(color="gray", size=11), + row=row, col=1) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) + return + + # Build ordered segment list: (t_start, t_end, label) + turn_segs = sorted( + [(tr["start"], tr["end"], + "asst" if tr["user"] == "pipecat_agent" else "user") + for tr in turns_rel if tr["end"]], + key=lambda s: s[0], + ) + segments: list[tuple] = [] + prev_end = 0.0 + for seg_s, seg_e, spk in turn_segs: + if seg_s > prev_end + 1e-3: + segments.append((prev_end, seg_s, "gap")) + segments.append((seg_s, seg_e, spk)) + prev_end = seg_e + duration = float(t[-1]) if len(t) > 0 else 0.0 + if prev_end < duration - 1e-3: + segments.append((prev_end, duration, "gap")) + + _color_map = {"user": USER_COLOR, "asst": ASST_COLOR, "gap": GAP_COLOR} + _name_map = {"user": "User", "asst": "Assistant", "gap": "Silence"} + + for seg_s, seg_e, spk in segments: + mask = (t >= seg_s) & (t <= seg_e) + if not mask.any(): + continue + name = _name_map[spk] + + fig.add_trace(go.Scatter( + x=t[mask].tolist(), y=y[mask].tolist(), + mode="lines", + line=dict(width=1.0, color=_color_map[spk]), + opacity=0.85 if spk != "gap" else 0.45, + name=name, legendgroup=name, showlegend=False, + text=_hover_texts(t[mask]), + hovertemplate="%{text}", + ), row=row, col=1) + 
+ # Pause vrects (visual only) + for pause in pauses_rel: + fig.add_vrect(x0=pause["start"], x1=pause["end"], + fillcolor=PAUSE_FILL, line_width=0, layer="below", + row=row, col=1) + + fig.update_yaxes(title_text="Amplitude", range=y_range, row=row, col=1) + + # ------------------------------------------------------------------ # + # Spectrogram row — heatmap + invisible transcript strip + # ------------------------------------------------------------------ # + def _spec_row(row: int, spec: tuple, label: str) -> None: + D, freqs, times = spec + + fig.add_trace(go.Heatmap( + z=D, x=times, y=freqs, + colorscale="Viridis", zmin=-80, zmax=0, + colorbar=dict(title="dB", thickness=12, len=0.12, x=1.01), + hovertemplate=( + "t=%{x:.2f}s freq=%{y:.0f}Hz %{z:.1f}dB" + "" + label + "" + ), + showscale=True, + ), row=row, col=1) + + # Transcript strip at freq_max for hover + strip_t = np.asarray(times, dtype=float) + freq_max = float(freqs[-1]) + fig.add_trace(go.Scatter( + x=strip_t.tolist(), y=[freq_max] * len(strip_t), + mode="markers", marker=dict(opacity=0, size=6), + showlegend=False, name="", + text=_hover_texts(strip_t), + hovertemplate="%{text}Transcript", + ), row=row, col=1) + + # Turn boundary vrects + for turn in turns_rel: + if not turn["end"]: + continue + color = ASST_FILL if turn["user"] == "pipecat_agent" else USER_FILL + fig.add_vrect(x0=turn["start"], x1=turn["end"], + fillcolor=color, line_width=0, layer="below", + row=row, col=1) + for pause in pauses_rel: + fig.add_vrect(x0=pause["start"], x1=pause["end"], + fillcolor=PAUSE_FILL, line_width=0, layer="below", + row=row, col=1) + + fig.update_yaxes(title_text="Freq (Hz)", row=row, col=1) + + def _no_file(row: int) -> None: + fig.add_annotation( + text="No file available", xref="x domain", yref="y domain", + x=0.5, y=0.5, showarrow=False, font=dict(color="gray", size=11), + row=row, col=1) + + # ---- Mixed waveform ---- + if data["mixed_loaded"] and len(data["y_ds"]) > 0: + y_range = 
[float(data["y_ds"].min() * 1.1), float(data["y_ds"].max() * 1.1)] + _colored_waveform(row_of["mixed_waveform"], data["y_ds"], data["t_mixed"], y_range) + else: + _no_file(row_of["mixed_waveform"]) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], + row=row_of["mixed_waveform"], col=1) + + # ---- Mixed spectrogram (optional) ---- + if "mixed_spec" in row_of: + if data["mixed_spec"]: + _spec_row(row_of["mixed_spec"], data["mixed_spec"], "Mixed Spec") + else: + _no_file(row_of["mixed_spec"]) + fig.update_yaxes(title_text="Freq (Hz)", row=row_of["mixed_spec"], col=1) + + # ---- ElevenLabs waveform ---- + if data["el_loaded"] and len(data["el_y_ds"]) > 0: + el_range = [float(data["el_y_ds"].min() * 1.1), float(data["el_y_ds"].max() * 1.1)] + _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range) + else: + _no_file(row_of["el_waveform"]) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], + row=row_of["el_waveform"], col=1) + + # ---- ElevenLabs spectrogram (optional) ---- + if "el_spec" in row_of: + if data["el_spec"]: + _spec_row(row_of["el_spec"], data["el_spec"], "EL Spec") + else: + _no_file(row_of["el_spec"]) + fig.update_yaxes(title_text="Freq (Hz)", row=row_of["el_spec"], col=1) + + # ---- Timeline ---- + tl: dict = {"user": 0, "assistant": 0} + tl_row = row_of["timeline"] + + for turn in turns_rel: + if not turn["end"]: + continue + is_asst = turn["user"] == "pipecat_agent" + speaker = "Assistant" if is_asst else "User" + y_pos = 2.0 if is_asst else 1.0 + bar_fill = "rgba(232,114,74,0.80)" if is_asst else "rgba(74,144,217,0.80)" + bar_line = "rgba(180,70,30,1)" if is_asst else "rgba(30,90,170,1)" + key = "assistant" if is_asst else "user" + + texts = transcript_map[key] + text = texts[tl[key]] if tl[key] < len(texts) else "(no transcript)" + tl[key] += 1 + + hover = (f"{speaker}
" + f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " + f"({turn['duration']:.1f}s)

{_wrap(text)}") + + # Visual bar (hoverinfo='skip' — corners are too sparse) + fig.add_trace(go.Scatter( + x=[turn["start"], turn["end"], turn["end"], turn["start"], turn["start"]], + y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], + fill="toself", fillcolor=bar_fill, line=dict(color=bar_line, width=1), + mode="lines", hoverinfo="skip", + name=speaker, legendgroup=speaker, showlegend=False, + ), row=tl_row, col=1) + + # Dense hover strip at bar midline (~2 pts/sec, min 5) + n_pts = max(5, int(turn["duration"] * 2)) + x_strip = np.linspace(turn["start"], turn["end"], n_pts).tolist() + fig.add_trace(go.Scatter( + x=x_strip, y=[y_pos] * n_pts, + mode="markers", marker=dict(opacity=0, size=10), + hovertext=hover, hoverinfo="text", + showlegend=False, name="", + ), row=tl_row, col=1) + + fig.add_annotation( + x=turn["start"] + turn["duration"] / 2, y=y_pos, + text=f"{turn['duration']:.1f}s", + showarrow=False, font=dict(size=8, color="white"), + xref=f"x{tl_row}", yref=f"y{tl_row}", + ) + + for pause in pauses_rel: + hover = (f"Pause
" + f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" + f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" + f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}") + + fig.add_trace(go.Scatter( + x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], + y=[1.15, 1.15, 1.85, 1.85, 1.15], + fill="toself", fillcolor="rgba(140,140,140,0.40)", + line=dict(color="rgba(180,60,60,0.8)", width=1, dash="dash"), + mode="lines", hoverinfo="skip", + name="Pause", legendgroup="Pause", showlegend=False, + ), row=tl_row, col=1) + + n_pts = max(5, int(pause["duration_seconds"] * 2)) + x_strip = np.linspace(pause["start"], pause["end"], n_pts).tolist() + fig.add_trace(go.Scatter( + x=x_strip, y=[1.5] * n_pts, + mode="markers", marker=dict(opacity=0, size=10), + hovertext=hover, hoverinfo="text", + showlegend=False, name="", + ), row=tl_row, col=1) + + fig.add_annotation( + x=pause["start"] + pause["duration_seconds"] / 2, y=1.5, + text=f"{pause['duration_seconds'] * 1000:.0f}ms", + showarrow=False, font=dict(size=7, color="dimgray"), + bgcolor="rgba(255,255,255,0.7)", + xref=f"x{tl_row}", yref=f"y{tl_row}", + ) + + fig.update_yaxes( + tickvals=[1, 2], ticktext=["User", "Assistant"], range=[0.5, 2.5], + title_text="Speaker", row=tl_row, col=1, + ) + fig.update_xaxes(title_text="Time (seconds)", row=tl_row, col=1) + + # Shared x-range + grid for all rows + for r in range(1, n_rows + 1): + fig.update_xaxes(range=plot_xlim, showgrid=True, + gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + fig.update_yaxes(showgrid=True, + gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + + return fig + + +# ============================================================================= +# Streamlit tab renderer +# ============================================================================= + +def render_audio_analysis_tab(record_dir: Path) -> None: + """Render the Audio Analysis tab for a given record / trial directory.""" + st.markdown("### Audio Analysis") + + events_file = record_dir / "elevenlabs_events.jsonl" + audio_mixed = 
next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + + if not events_file.exists() and not audio_mixed.exists(): + st.info("No audio files found in this record directory.") + return + + # Spectrogram toggles + col1, col2 = st.columns(2) + with col1: + show_mixed_spec = st.checkbox("Show Mixed Audio Spectrogram", value=False) + with col2: + show_el_spec = st.checkbox("Show ElevenLabs Spectrogram", value=False) + + @st.cache_data(show_spinner="Loading audio and building interactive plot\u2026") + def _cached(path_str: str, mixed_spec: bool, el_spec: bool) -> go.Figure: + return _build_figure( + _prepare_data(Path(path_str)), + show_mixed_spec=mixed_spec, + show_el_spec=el_spec, + ) + + try: + fig = _cached(str(record_dir), show_mixed_spec, show_el_spec) + st.plotly_chart(fig, use_container_width=True, theme="streamlit") + except Exception as exc: + st.error(f"Could not render audio plot: {exc}") diff --git a/pyproject.toml b/pyproject.toml index 9f59b76c..be7c6d9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,9 @@ apps = [ "plotly>=5.0", "streamlit>=1.56.0", "streamlit-diff-viewer>=0.0.2", + "librosa>=0.11", + "soundfile>=0.13", + "audioread>=3.1", ] [project.scripts] diff --git a/uv.lock b/uv.lock index fafa81dc..43787fb1 100644 --- a/uv.lock +++ b/uv.lock @@ -290,6 +290,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/ca/baf2b9cc7e96c179bb4a54f30fcd83e6ecb340031bde68f486403f943768/audioop_lts-0.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:c174e322bb5783c099aaf87faeb240c8d210686b04bd61dfd05a8e5a83d88969", size = 24603, upload-time = "2025-08-05T16:42:57.571Z" }, ] +[[package]] +name = "audioread" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a1/4a/874ecf9b472f998130c2b5e145dcdb9f6131e84786111489103b66772143/audioread-3.1.0.tar.gz", hash = "sha256:1c4ab2f2972764c896a8ac61ac53e261c8d29f0c6ccd652f84e18f08a4cab190", size = 20082, upload-time = "2025-10-26T19:44:13.484Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/16/fbe8e1e185a45042f7cd3a282def5bb8d95bb69ab9e9ef6a5368aa17e426/audioread-3.1.0-py3-none-any.whl", hash = "sha256:b30d1df6c5d3de5dcef0fb0e256f6ea17bdcf5f979408df0297d8a408e2971b4", size = 23143, upload-time = "2025-10-26T19:44:12.016Z" }, +] + [[package]] name = "azure-cognitiveservices-speech" version = "1.48.2" @@ -670,6 +683,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686, upload-time = "2024-06-09T16:20:16.715Z" }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "deepgram-sdk" version = "3.11.0" @@ -790,8 +812,11 @@ dependencies = [ [package.optional-dependencies] apps = [ + { name = "audioread" }, + { name = "librosa" }, { name = "pandas" }, { name = "plotly" }, + { name = "soundfile" }, { name = "streamlit" }, { name = "streamlit-diff-viewer" }, ] 
@@ -810,6 +835,7 @@ requires-dist = [
     { name = "aiofiles", specifier = ">=23.0" },
     { name = "anthropic", specifier = ">=0.83.0" },
     { name = "assemblyai", specifier = ">=0.17.0" },
+    { name = "audioread", marker = "extra == 'apps'", specifier = ">=3.1" },
     { name = "azure-cognitiveservices-speech", specifier = ">=1.31.0" },
     { name = "cartesia", specifier = ">=1.0.0" },
     { name = "deepgram-sdk", specifier = ">=3.5.0,<4.0.0" },
@@ -823,6 +849,7 @@ requires-dist = [
     { name = "inflect", specifier = ">=7.0.0" },
     { name = "jaconv", specifier = ">=0.3.0" },
     { name = "jiwer", specifier = ">=3.0.0" },
+    { name = "librosa", marker = "extra == 'apps'", specifier = ">=0.11" },
     { name = "litellm", specifier = ">=1.30.0" },
     { name = "more-itertools", specifier = ">=10.0.0" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.5" },
@@ -846,7 +873,8 @@ requires-dist = [
     { name = "regex", specifier = ">=2023.0.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
     { name = "setuptools", specifier = ">=65.0.0" },
-    { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.56.0" },
+    { name = "soundfile", marker = "extra == 'apps'", specifier = ">=0.13" },
+    { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.56.0" },
     { name = "streamlit-diff-viewer", marker = "extra == 'apps'", specifier = ">=0.0.2" },
     { name = "structlog", specifier = ">=23.0" },
     { name = "tqdm", specifier = ">=4.65" },
@@ -1623,6 +1651,44 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
 ]
 
+[[package]]
+name = "lazy-loader"
+version = "0.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "packaging" },
+]
+sdist = { url = 
"https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, +] + +[[package]] +name = "librosa" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioread" }, + { name = "decorator" }, + { name = "joblib" }, + { name = "lazy-loader" }, + { name = "msgpack" }, + { name = "numba" }, + { name = "numpy" }, + { name = "pooch" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "soundfile" }, + { name = "soxr" }, + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/36/360b5aafa0238e29758729e9486c6ed92a6f37fa403b7875e06c115cdf4a/librosa-0.11.0.tar.gz", hash = "sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908", size = 327001, upload-time = "2025-03-11T15:09:54.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/ba/c63c5786dfee4c3417094c4b00966e61e4a63efecee22cb7b4c0387dda83/librosa-0.11.0-py3-none-any.whl", hash = "sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1", size = 260749, upload-time = "2025-03-11T15:09:52.982Z" }, +] + [[package]] name = "librt" version = "0.8.1" @@ -1841,6 +1907,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", 
hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, ] +[[package]] +name = "msgpack" +version = "1.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/97/560d11202bcd537abca693fd85d81cebe2107ba17301de42b01ac1677b69/msgpack-1.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2e86a607e558d22985d856948c12a3fa7b42efad264dca8a3ebbcfa2735d786c", size = 82271, upload-time = "2025-10-08T09:14:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/83/04/28a41024ccbd67467380b6fb440ae916c1e4f25e2cd4c63abe6835ac566e/msgpack-1.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:283ae72fc89da59aa004ba147e8fc2f766647b1251500182fac0350d8af299c0", size = 84914, upload-time = "2025-10-08T09:14:50.958Z" }, + { url = "https://files.pythonhosted.org/packages/71/46/b817349db6886d79e57a966346cf0902a426375aadc1e8e7a86a75e22f19/msgpack-1.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:61c8aa3bd513d87c72ed0b37b53dd5c5a0f58f2ff9f26e1555d3bd7948fb7296", size = 416962, upload-time = "2025-10-08T09:14:51.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/e0/6cc2e852837cd6086fe7d8406af4294e66827a60a4cf60b86575a4a65ca8/msgpack-1.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:454e29e186285d2ebe65be34629fa0e8605202c60fbc7c4c650ccd41870896ef", size = 426183, upload-time = "2025-10-08T09:14:53.477Z" }, + { url = 
"https://files.pythonhosted.org/packages/25/98/6a19f030b3d2ea906696cedd1eb251708e50a5891d0978b012cb6107234c/msgpack-1.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7bc8813f88417599564fafa59fd6f95be417179f76b40325b500b3c98409757c", size = 411454, upload-time = "2025-10-08T09:14:54.648Z" }, + { url = "https://files.pythonhosted.org/packages/b7/cd/9098fcb6adb32187a70b7ecaabf6339da50553351558f37600e53a4a2a23/msgpack-1.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bafca952dc13907bdfdedfc6a5f579bf4f292bdd506fadb38389afa3ac5b208e", size = 422341, upload-time = "2025-10-08T09:14:56.328Z" }, + { url = "https://files.pythonhosted.org/packages/e6/ae/270cecbcf36c1dc85ec086b33a51a4d7d08fc4f404bdbc15b582255d05ff/msgpack-1.1.2-cp311-cp311-win32.whl", hash = "sha256:602b6740e95ffc55bfb078172d279de3773d7b7db1f703b2f1323566b878b90e", size = 64747, upload-time = "2025-10-08T09:14:57.882Z" }, + { url = "https://files.pythonhosted.org/packages/2a/79/309d0e637f6f37e83c711f547308b91af02b72d2326ddd860b966080ef29/msgpack-1.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d198d275222dc54244bf3327eb8cbe00307d220241d9cec4d306d49a44e85f68", size = 71633, upload-time = "2025-10-08T09:14:59.177Z" }, + { url = "https://files.pythonhosted.org/packages/73/4d/7c4e2b3d9b1106cd0aa6cb56cc57c6267f59fa8bfab7d91df5adc802c847/msgpack-1.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:86f8136dfa5c116365a8a651a7d7484b65b13339731dd6faebb9a0242151c406", size = 64755, upload-time = "2025-10-08T09:15:00.48Z" }, + { url = "https://files.pythonhosted.org/packages/ad/bd/8b0d01c756203fbab65d265859749860682ccd2a59594609aeec3a144efa/msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa", size = 81939, upload-time = "2025-10-08T09:15:01.472Z" }, + { url = "https://files.pythonhosted.org/packages/34/68/ba4f155f793a74c1483d4bdef136e1023f7bcba557f0db4ef3db3c665cf1/msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb", size = 85064, upload-time = "2025-10-08T09:15:03.764Z" }, + { url = "https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" }, + { url = "https://files.pythonhosted.org/packages/65/92/a5100f7185a800a5d29f8d14041f61475b9de465ffcc0f3b9fba606e4505/msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42", size = 427556, upload-time = "2025-10-08T09:15:06.837Z" }, + { url = "https://files.pythonhosted.org/packages/f5/87/ffe21d1bf7d9991354ad93949286f643b2bb6ddbeab66373922b44c3b8cc/msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9", size = 404920, upload-time = "2025-10-08T09:15:08.179Z" }, + { url = "https://files.pythonhosted.org/packages/ff/41/8543ed2b8604f7c0d89ce066f42007faac1eaa7d79a81555f206a5cdb889/msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620", size = 415013, upload-time = "2025-10-08T09:15:09.83Z" }, + { url = "https://files.pythonhosted.org/packages/41/0d/2ddfaa8b7e1cee6c490d46cb0a39742b19e2481600a7a0e96537e9c22f43/msgpack-1.1.2-cp312-cp312-win32.whl", hash = "sha256:1fff3d825d7859ac888b0fbda39a42d59193543920eda9d9bea44d958a878029", size = 65096, upload-time = "2025-10-08T09:15:11.11Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ec/d431eb7941fb55a31dd6ca3404d41fbb52d99172df2e7707754488390910/msgpack-1.1.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:1de460f0403172cff81169a30b9a92b260cb809c4cb7e2fc79ae8d0510c78b6b", size = 72708, upload-time = "2025-10-08T09:15:12.554Z" }, + { url = "https://files.pythonhosted.org/packages/c5/31/5b1a1f70eb0e87d1678e9624908f86317787b536060641d6798e3cf70ace/msgpack-1.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:be5980f3ee0e6bd44f3a9e9dea01054f175b50c3e6cdb692bc9424c0bbb8bf69", size = 64119, upload-time = "2025-10-08T09:15:13.589Z" }, + { url = "https://files.pythonhosted.org/packages/6b/31/b46518ecc604d7edf3a4f94cb3bf021fc62aa301f0cb849936968164ef23/msgpack-1.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4efd7b5979ccb539c221a4c4e16aac1a533efc97f3b759bb5a5ac9f6d10383bf", size = 81212, upload-time = "2025-10-08T09:15:14.552Z" }, + { url = "https://files.pythonhosted.org/packages/92/dc/c385f38f2c2433333345a82926c6bfa5ecfff3ef787201614317b58dd8be/msgpack-1.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42eefe2c3e2af97ed470eec850facbe1b5ad1d6eacdbadc42ec98e7dcf68b4b7", size = 84315, upload-time = "2025-10-08T09:15:15.543Z" }, + { url = "https://files.pythonhosted.org/packages/d3/68/93180dce57f684a61a88a45ed13047558ded2be46f03acb8dec6d7c513af/msgpack-1.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fdf7d83102bf09e7ce3357de96c59b627395352a4024f6e2458501f158bf999", size = 412721, upload-time = "2025-10-08T09:15:16.567Z" }, + { url = "https://files.pythonhosted.org/packages/5d/ba/459f18c16f2b3fc1a1ca871f72f07d70c07bf768ad0a507a698b8052ac58/msgpack-1.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fac4be746328f90caa3cd4bc67e6fe36ca2bf61d5c6eb6d895b6527e3f05071e", size = 424657, upload-time = "2025-10-08T09:15:17.825Z" }, + { url = "https://files.pythonhosted.org/packages/38/f8/4398c46863b093252fe67368b44edc6c13b17f4e6b0e4929dbf0bdb13f23/msgpack-1.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:fffee09044073e69f2bad787071aeec727183e7580443dfeb8556cbf1978d162", size = 402668, upload-time = "2025-10-08T09:15:19.003Z" }, + { url = "https://files.pythonhosted.org/packages/28/ce/698c1eff75626e4124b4d78e21cca0b4cc90043afb80a507626ea354ab52/msgpack-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5928604de9b032bc17f5099496417f113c45bc6bc21b5c6920caf34b3c428794", size = 419040, upload-time = "2025-10-08T09:15:20.183Z" }, + { url = "https://files.pythonhosted.org/packages/67/32/f3cd1667028424fa7001d82e10ee35386eea1408b93d399b09fb0aa7875f/msgpack-1.1.2-cp313-cp313-win32.whl", hash = "sha256:a7787d353595c7c7e145e2331abf8b7ff1e6673a6b974ded96e6d4ec09f00c8c", size = 65037, upload-time = "2025-10-08T09:15:21.416Z" }, + { url = "https://files.pythonhosted.org/packages/74/07/1ed8277f8653c40ebc65985180b007879f6a836c525b3885dcc6448ae6cb/msgpack-1.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:a465f0dceb8e13a487e54c07d04ae3ba131c7c5b95e2612596eafde1dccf64a9", size = 72631, upload-time = "2025-10-08T09:15:22.431Z" }, + { url = "https://files.pythonhosted.org/packages/e5/db/0314e4e2db56ebcf450f277904ffd84a7988b9e5da8d0d61ab2d057df2b6/msgpack-1.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:e69b39f8c0aa5ec24b57737ebee40be647035158f14ed4b40e6f150077e21a84", size = 64118, upload-time = "2025-10-08T09:15:23.402Z" }, +] + [[package]] name = "multidict" version = "6.7.1" @@ -2320,6 +2421,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "pooch" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "platformdirs" }, + { name = "requests" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/83/43/85ef45e8b36c6a48546af7b266592dc32d7f67837a6514d111bced6d7d75/pooch-1.9.0.tar.gz", hash = "sha256:de46729579b9857ffd3e741987a2f6d5e0e03219892c167c6578c0091fb511ed", size = 61788, upload-time = "2026-01-30T19:15:09.649Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/2d/d4bf65e47cea8ff2c794a600c4fd1273a7902f268757c531e0ee9f18aa58/pooch-1.9.0-py3-none-any.whl", hash = "sha256:f265597baa9f760d25ceb29d0beb8186c243d6607b0f60b83ecf14078dbc703b", size = 67175, upload-time = "2026-01-30T19:15:08.36Z" }, +] + [[package]] name = "pre-commit" version = "4.5.1" @@ -3115,6 +3230,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/92/53ea2181da8ac6bf27170191028aee7251f8f841f8d3edbfdcaf2008fde9/scikit_learn-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:146b4d36f800c013d267b29168813f7a03a43ecd2895d04861f1240b564421da", size = 8595835, upload-time = "2025-12-10T07:07:39.385Z" }, + { url = "https://files.pythonhosted.org/packages/01/18/d154dc1638803adf987910cdd07097d9c526663a55666a97c124d09fb96a/scikit_learn-1.8.0-cp311-cp311-macosx_12_0_arm64.whl", hash = 
"sha256:f984ca4b14914e6b4094c5d52a32ea16b49832c03bd17a110f004db3c223e8e1", size = 8080381, upload-time = "2025-12-10T07:07:41.93Z" }, + { url = "https://files.pythonhosted.org/packages/8a/44/226142fcb7b7101e64fdee5f49dbe6288d4c7af8abf593237b70fca080a4/scikit_learn-1.8.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e30adb87f0cc81c7690a84f7932dd66be5bac57cfe16b91cb9151683a4a2d3b", size = 8799632, upload-time = "2025-12-10T07:07:43.899Z" }, + { url = "https://files.pythonhosted.org/packages/36/4d/4a67f30778a45d542bbea5db2dbfa1e9e100bf9ba64aefe34215ba9f11f6/scikit_learn-1.8.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ada8121bcb4dac28d930febc791a69f7cb1673c8495e5eee274190b73a4559c1", size = 9103788, upload-time = "2025-12-10T07:07:45.982Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/45c352094cfa60050bcbb967b1faf246b22e93cb459f2f907b600f2ceda5/scikit_learn-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:c57b1b610bd1f40ba43970e11ce62821c2e6569e4d74023db19c6b26f246cb3b", size = 8081706, upload-time = "2025-12-10T07:07:48.111Z" }, + { url = "https://files.pythonhosted.org/packages/3d/46/5416595bb395757f754feb20c3d776553a386b661658fb21b7c814e89efe/scikit_learn-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:2838551e011a64e3053ad7618dda9310175f7515f1742fa2d756f7c874c05961", size = 7688451, upload-time = "2025-12-10T07:07:49.873Z" }, + { url = "https://files.pythonhosted.org/packages/90/74/e6a7cc4b820e95cc38cf36cd74d5aa2b42e8ffc2d21fe5a9a9c45c1c7630/scikit_learn-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e", size = 8548242, upload-time = "2025-12-10T07:07:51.568Z" }, + { url = "https://files.pythonhosted.org/packages/49/d8/9be608c6024d021041c7f0b3928d4749a706f4e2c3832bbede4fb4f58c95/scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = 
"sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76", size = 8079075, upload-time = "2025-12-10T07:07:53.697Z" }, + { url = "https://files.pythonhosted.org/packages/dd/47/f187b4636ff80cc63f21cd40b7b2d177134acaa10f6bb73746130ee8c2e5/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4", size = 8660492, upload-time = "2025-12-10T07:07:55.574Z" }, + { url = "https://files.pythonhosted.org/packages/97/74/b7a304feb2b49df9fafa9382d4d09061a96ee9a9449a7cbea7988dda0828/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a", size = 8931904, upload-time = "2025-12-10T07:07:57.666Z" }, + { url = "https://files.pythonhosted.org/packages/9f/c4/0ab22726a04ede56f689476b760f98f8f46607caecff993017ac1b64aa5d/scikit_learn-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809", size = 8019359, upload-time = "2025-12-10T07:07:59.838Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/344a67811cfd561d7335c1b96ca21455e7e472d281c3c279c4d3f2300236/scikit_learn-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb", size = 7641898, upload-time = "2025-12-10T07:08:01.36Z" }, + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = 
"sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = 
"sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, +] + [[package]] name = "scipy" version = "1.17.1" @@ -3211,6 +3364,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "soundfile" +version = "0.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, + { name = "numpy" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" }, + { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" }, + { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" }, + { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" }, + { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" }, + { url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" }, +] + [[package]] name = "soxr" version = "0.5.0.post1" @@ -3232,6 +3404,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/10/440f1ba3d4955e0dc740bbe4ce8968c254a3d644d013eb75eea729becdb8/soxr-0.5.0.post1-cp312-abi3-win_amd64.whl", hash = "sha256:b1be9fee90afb38546bdbd7bde714d1d9a8c5a45137f97478a83b65e7f3146f6", size = 164937, upload-time = "2024-08-31T03:43:23.671Z" }, ] +[[package]] +name = "standard-aifc" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "standard-chunk", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/53/6050dc3dde1671eb3db592c13b55a8005e5040131f7509cef0215212cb84/standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43", size = 15240, upload-time = "2024-10-30T16:01:31.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/52/5fbb203394cc852334d1575cc020f6bcec768d2265355984dfd361968f36/standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66", size = 10492, upload-time = "2024-10-30T16:01:07.071Z" }, +] + +[[package]] +name = "standard-chunk" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/43/06/ce1bb165c1f111c7d23a1ad17204d67224baa69725bb6857a264db61beaf/standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654", size = 4672, upload-time = "2024-10-30T16:18:28.326Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/90/a5c1084d87767d787a6caba615aa50dc587229646308d9420c960cb5e4c0/standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c", size = 4944, upload-time = "2024-10-30T16:18:26.694Z" }, +] + +[[package]] +name = "standard-sunau" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/e3/ce8d38cb2d70e05ffeddc28bb09bad77cfef979eb0a299c9117f7ed4e6a9/standard_sunau-3.13.0.tar.gz", hash = "sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908", size = 9368, upload-time = "2024-10-30T16:01:41.626Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/ae/e3707f6c1bc6f7aa0df600ba8075bfb8a19252140cd595335be60e25f9ee/standard_sunau-3.13.0-py3-none-any.whl", hash = "sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622", size = 7364, upload-time = "2024-10-30T16:01:28.003Z" }, +] + [[package]] name = "starlette" version = "0.52.1" @@ -3316,6 +3522,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, ] +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + [[package]] name = "tiktoken" version = "0.12.0" From cd2c4fa242d52b2d88d6ff23c0c429da910dc5ac Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Fri, 10 Apr 2026 16:45:11 -0400 Subject: [PATCH 08/23] Update Streamlit to 1.56 --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 43787fb1..d6351fb3 100644 --- a/uv.lock +++ b/uv.lock @@ -874,7 +874,7 @@ requires-dist = [ { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "setuptools", specifier = ">=65.0.0" }, { name = "soundfile", marker = "extra == 'apps'", specifier = ">=0.13" }, - { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.51.0" }, + { name = "streamlit", marker = "extra == 'apps'", specifier = ">=1.56.0" }, { name = "streamlit-diff-viewer", marker = "extra == 'apps'", specifier = ">=0.0.2" }, { name = "structlog", specifier = ">=23.0" }, { name = "tqdm", specifier = ">=4.65" }, From 2872fbf00ed712bd41e882c7104dba50aa654a73 Mon Sep 17 00:00:00 2001 From: "joseph.marinier" Date: Fri, 10 Apr 2026 18:50:42 -0400 Subject: [PATCH 09/23] Use pages instead of radio buttons --- apps/analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/analysis.py b/apps/analysis.py index fa34eced..b4a48c5e 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1003,6 +1003,7 @@ def render_cross_run_comparison(run_dirs: list[Path]): # Add link 
column to navigate to Run Overview display_df.insert(0, "link", f"/run_overview?output_dir={run_dirs[0].parent}&run=" + summary_df["run"]) + display_df.insert(0, "link", f"/run_overview?output_dir={run_dirs[0].parent}&run=" + summary_df["run"]) composite_rename = {c: f"[EVA] {_EVA_COMPOSITE_DISPLAY[c]}" for c in table_composites} display_df = display_df.rename(columns={"label": "Run", "records": "# Records", **composite_rename, **col_rename}) From 2f3a2c64ccf88bb1807559a0747c33c48be67c2b Mon Sep 17 00:00:00 2001 From: hoang Date: Tue, 14 Apr 2026 00:27:16 +0000 Subject: [PATCH 10/23] Updating timestamp extraction similar to turn-taking metric --- apps/audio_plots.py | 527 +++++++++++++++++++++++++++++--------------- 1 file changed, 347 insertions(+), 180 deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index ba6928a3..5b5d07a1 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -10,6 +10,16 @@ Row 3 : ElevenLabs waveform, colour-coded by speaker turn Row 4 (opt) : ElevenLabs spectrogram Row 5 : Speaker Turn Timeline + +Turn data is loaded from metrics.json (same source as the turn_taking metric): + context.audio_timestamps_user_turns / audio_timestamps_assistant_turns + → dict[turn_id → list[(abs_start, abs_end)]] — may have multiple segments per turn + context.transcribed_*_turns / intended_*_turns + → dict[turn_id → str] — keyed by the same turn IDs + metrics.turn_taking.details.per_turn_latency + → dict[turn_id → seconds] — user_last_seg_end → asst_first_seg_start + +Falls back to parsing elevenlabs_events.jsonl directly when metrics.json is absent. 
""" import json @@ -37,10 +47,100 @@ # ============================================================================= -# Parsing / loading helpers +# Turn data loading — metrics.json first, elevenlabs_events.jsonl fallback # ============================================================================= -def _parse_elevenlabs_events(events_file: Path) -> dict: +def _load_metrics_context(record_dir: Path) -> dict | None: + """Load metrics.json; return None if absent.""" + metrics_file = record_dir / "metrics.json" + if not metrics_file.exists(): + return None + with open(metrics_file) as f: + return json.load(f) + + +def _build_turns_from_metrics(metrics_data: dict) -> list[dict] | None: + """Build a turns list from metrics.json using the same timestamps the turn_taking metric uses. + + Each turn dict has: + turn_id, speaker ("user"|"assistant"), + segments [(rel_start, rel_end), ...], ← may be >1 for interrupted turns + start, end, duration, + transcript_heard, transcript_intended, + latency_s (user→assistant gap, user turns only), timing_label. 
+ """ + ctx = metrics_data.get("context") or {} + user_ts = ctx.get("audio_timestamps_user_turns") or {} + asst_ts = ctx.get("audio_timestamps_assistant_turns") or {} + if not user_ts and not asst_ts: + return None + + transcribed_user = ctx.get("transcribed_user_turns") or {} + transcribed_asst = ctx.get("transcribed_assistant_turns") or {} + intended_user = ctx.get("intended_user_turns") or {} + intended_asst = ctx.get("intended_assistant_turns") or {} + + # Per-turn latency / timing label from turn_taking metric (if already computed) + metrics = metrics_data.get("metrics") or {} + tt_details = (metrics.get("turn_taking") or {}).get("details") or {} + per_turn_latency = {int(k): v for k, v in (tt_details.get("per_turn_latency") or {}).items()} + per_turn_labels = {int(k): v for k, v in (tt_details.get("per_turn_judge_timing_ratings") or {}).items()} + + # Reference time: earliest timestamp across all turns + all_starts = [ + segs[0][0] + for segs in list(user_ts.values()) + list(asst_ts.values()) + if segs + ] + t0 = min(all_starts) if all_starts else 0.0 + + def _rel(segs: list) -> list[tuple[float, float]]: + return [(s - t0, e - t0) for s, e in segs] if segs else [] + + turns: list[dict] = [] + + for tid_str, segs in asst_ts.items(): + if not segs: + continue + tid = int(tid_str) + rel = _rel(segs) + turns.append({ + "turn_id": tid, + "speaker": "assistant", + "segments": rel, + "start": rel[0][0], + "end": rel[-1][1], + "duration": rel[-1][1] - rel[0][0], + "transcript_heard": transcribed_asst.get(tid_str, ""), + "transcript_intended": intended_asst.get(tid_str, ""), + "latency_s": None, + "timing_label": None, + }) + + for tid_str, segs in user_ts.items(): + if not segs: + continue + tid = int(tid_str) + rel = _rel(segs) + turns.append({ + "turn_id": tid, + "speaker": "user", + "segments": rel, + "start": rel[0][0], + "end": rel[-1][1], + "duration": rel[-1][1] - rel[0][0], + "transcript_heard": transcribed_user.get(tid_str, ""), + "transcript_intended": 
intended_user.get(tid_str, ""), + "latency_s": per_turn_latency.get(tid), + "timing_label": per_turn_labels.get(tid), + }) + + turns.sort(key=lambda t: t["start"]) + return turns + + +def _parse_elevenlabs_events(events_file: Path) -> list[dict]: + """Fallback: parse elevenlabs_events.jsonl into a flat turns list (no turn IDs).""" events = [] with open(events_file) as f: for line in f: @@ -50,65 +150,103 @@ def _parse_elevenlabs_events(events_file: Path) -> dict: audio_events = [e for e in events if e.get("event_type") in ("audio_start", "audio_end")] audio_events.sort(key=lambda x: x.get("audio_timestamp", 0)) - active_turns: dict = {} - turns: list = [] - for event in audio_events: - user = event.get("user") - etype = event.get("event_type") - ts = event.get("audio_timestamp") + active: dict = {} + raw: list = [] + for ev in audio_events: + user = ev.get("user") + etype = ev.get("event_type") + ts = ev.get("audio_timestamp") if etype == "audio_start": - if user not in active_turns or active_turns[user].get("end") is not None: - active_turns[user] = {"user": user, "start": ts, "end": None} + if user not in active or active[user].get("end") is not None: + active[user] = {"user": user, "start": ts, "end": None} elif etype == "audio_end": - if user in active_turns and active_turns[user].get("end") is None: - active_turns[user]["end"] = ts - active_turns[user]["duration"] = ts - active_turns[user]["start"] - turns.append(active_turns[user].copy()) - - turns.sort(key=lambda x: x["start"]) - return {"turns": turns} - - -def _calculate_pauses(turns: list) -> list: + if user in active and active[user].get("end") is None: + active[user]["end"] = ts + active[user]["duration"] = ts - active[user]["start"] + raw.append(active[user].copy()) + + raw.sort(key=lambda x: x["start"]) + t0 = min((t["start"] for t in raw), default=0.0) + + user_idx = asst_idx = 0 + turns: list[dict] = [] + for i, t in enumerate(raw): + is_asst = t["user"] == "pipecat_agent" + speaker = "assistant" if 
is_asst else "user" + s_rel = t["start"] - t0 + e_rel = t["end"] - t0 + turns.append({ + "turn_id": i, + "speaker": speaker, + "segments": [(s_rel, e_rel)], + "start": s_rel, + "end": e_rel, + "duration": t.get("duration", e_rel - s_rel), + "transcript_heard": "", + "transcript_intended": "", + "latency_s": None, + "timing_label": None, + "_seq_idx": asst_idx if is_asst else user_idx, + }) + if is_asst: + asst_idx += 1 + else: + user_idx += 1 + return turns + + +def _patch_fallback_transcripts(turns: list[dict], transcript_file: Path) -> None: + """Fill transcript fields in fallback turns from transcript.jsonl using sequential order.""" + tx: dict[str, list[str]] = {"user": [], "assistant": []} + if transcript_file.exists(): + with open(transcript_file) as f: + for line in f: + if line.strip(): + entry = json.loads(line) + role = entry.get("type", "") + content = entry.get("content", "") + if role in tx: + tx[role].append(content) + for turn in turns: + idx = turn.pop("_seq_idx", 0) + key = "assistant" if turn["speaker"] == "assistant" else "user" + text = tx[key][idx] if idx < len(tx[key]) else "" + turn["transcript_heard"] = text + turn["transcript_intended"] = text + + +def _calculate_pauses(turns_rel: list[dict]) -> list[dict]: + """Compute pause gaps between consecutive audio segments across all turns.""" + all_segs = sorted( + [(s, e, turn["speaker"]) for turn in turns_rel for s, e in turn["segments"]], + key=lambda x: x[0], + ) pauses = [] - for i in range(len(turns) - 1): - cur, nxt = turns[i], turns[i + 1] - if cur["end"] and nxt["start"]: - gap = nxt["start"] - cur["end"] - if gap > 0: - pauses.append({ - "from_speaker": cur["user"], - "to_speaker": nxt["user"], - "start": cur["end"], - "end": nxt["start"], - "duration_seconds": gap, - }) + for i in range(len(all_segs) - 1): + cur_end = all_segs[i][1] + nxt_start = all_segs[i + 1][0] + gap = nxt_start - cur_end + if gap > 0.001: + pauses.append({ + "from_speaker": all_segs[i][2], + "to_speaker": 
all_segs[i + 1][2], + "start": cur_end, + "end": nxt_start, + "duration_seconds": gap, + }) return pauses -def _parse_transcript(transcript_file: Path) -> dict: - result: dict = {"user": [], "assistant": []} - if not transcript_file or not transcript_file.exists(): - return result - with open(transcript_file) as f: - for line in f: - if line.strip(): - entry = json.loads(line) - role = entry.get("type", "") - content = entry.get("content", "") - if role == "user": - result["user"].append(content) - elif role == "assistant": - result["assistant"].append(content) - return result - +# ============================================================================= +# Audio loading helpers +# ============================================================================= def _load_pydub(path: Path) -> tuple: seg = AudioSegment.from_file(str(path)) if seg.channels > 1: seg = seg.set_channels(1) sr = seg.frame_rate - y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 + y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 return y, sr @@ -121,11 +259,11 @@ def _load_librosa(path: Path) -> tuple: def _downsample(y: np.ndarray, sr: float, target_rate: int = 100) -> tuple: duration = len(y) / sr - target = max(2, int(duration * target_rate)) + target = max(2, int(duration * target_rate)) if len(y) > target: - step = max(1, len(y) // target) - y_ds = y[::step] - sr_ds = sr * len(y_ds) / len(y) + step = max(1, len(y) // target) + y_ds = y[::step] + sr_ds = sr * len(y_ds) / len(y) else: y_ds, sr_ds = y, sr return y_ds, sr_ds @@ -151,83 +289,75 @@ def _wrap(text: str, width: int = 80) -> str: # ============================================================================= def _prepare_data(record_dir: Path) -> dict: - audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") - audio_el = record_dir / "elevenlabs_audio_recording.mp3" - events_file = record_dir / "elevenlabs_events.jsonl" - transcript = record_dir / 
"transcript.jsonl" - - # Turns / pauses - if events_file.exists(): - turns = _parse_elevenlabs_events(events_file)["turns"] - else: - turns = [] - pauses = _calculate_pauses(turns) - - start_time = min((t["start"] for t in turns), default=0) - turns_rel = [{ - "user": t["user"], - "start": t["start"] - start_time, - "end": (t["end"] - start_time) if t["end"] else None, - "duration": t.get("duration", (t["end"] - t["start"]) if t["end"] else 0), - } for t in turns] - pauses_rel = [{ - "from_speaker": p["from_speaker"], - "to_speaker": p["to_speaker"], - "start": p["start"] - start_time, - "end": p["end"] - start_time, - "duration_seconds": p["duration_seconds"], - } for p in pauses] - - transcript_map = _parse_transcript(transcript) - - # Mixed audio + audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + audio_el = record_dir / "elevenlabs_audio_recording.mp3" + events_file = record_dir / "elevenlabs_events.jsonl" + transcript = record_dir / "transcript.jsonl" + + # --- Turn data: prefer metrics.json (same source as turn_taking metric) --- + turns_rel: list[dict] = [] + metrics_data = _load_metrics_context(record_dir) + if metrics_data: + built = _build_turns_from_metrics(metrics_data) + if built: + turns_rel = built + + # Fallback: parse ElevenLabs event log directly + if not turns_rel and events_file.exists(): + turns_rel = _parse_elevenlabs_events(events_file) + _patch_fallback_transcripts(turns_rel, transcript) + + pauses_rel = _calculate_pauses(turns_rel) + + # --- Audio: mixed --- y_mixed, sr_mixed, duration, mixed_loaded = None, None, 0.0, False if audio_mixed.exists(): try: y_mixed, sr_mixed = _load_pydub(audio_mixed) - duration = len(y_mixed) / sr_mixed + duration = len(y_mixed) / sr_mixed mixed_loaded = True except Exception: pass - plot_xlim = [0, max(duration, 1.0)] + # Use the later of audio duration and last turn end for x-axis + turns_end = max((t["end"] for t in turns_rel), default=0.0) + plot_xlim = [0, 
max(duration, turns_end, 1.0)] if mixed_loaded: y_ds, _ = _downsample(y_mixed, sr_mixed) t_mixed = np.linspace(0, duration, len(y_ds)) else: - y_ds = np.array([]) + y_ds = np.array([]) t_mixed = np.array([]) - # ElevenLabs audio - el_y_ds, el_t, el_sr_ds, el_loaded = np.array([]), np.array([]), 1.0, False - el_spec = None + # --- Audio: ElevenLabs --- + el_y_ds, el_t, el_spec = np.array([]), np.array([]), None + el_loaded = False if audio_el.exists(): try: _el_y, _el_sr = _load_librosa(audio_el) - el_y_ds, _ = _downsample(_el_y, _el_sr) - el_sr_ds = _el_sr * len(el_y_ds) / len(_el_y) - el_t = np.linspace(0, len(_el_y) / _el_sr, len(el_y_ds)) - el_loaded = True - D = librosa.amplitude_to_db( + el_y_ds, _ = _downsample(_el_y, _el_sr) + el_t = np.linspace(0, len(_el_y) / _el_sr, len(el_y_ds)) + el_loaded = True + D = librosa.amplitude_to_db( np.abs(librosa.stft(_el_y, hop_length=512, n_fft=2048)), ref=np.max) - freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) - times = librosa.frames_to_time(np.arange(D.shape[1]), - sr=int(_el_sr), hop_length=512) + freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), + sr=int(_el_sr), hop_length=512) el_spec = (D, freqs, times) except Exception: pass - # Mixed spectrogram + # --- Spectrogram: mixed --- mixed_spec = None if mixed_loaded and len(y_ds) > 0: try: - sr_ds = sr_mixed * len(y_ds) / len(y_mixed) - D = librosa.amplitude_to_db( + sr_ds = sr_mixed * len(y_ds) / len(y_mixed) + D = librosa.amplitude_to_db( np.abs(librosa.stft(y_ds, hop_length=512, n_fft=2048)), ref=np.max) - freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) - times = librosa.frames_to_time(np.arange(D.shape[1]), - sr=int(sr_ds), hop_length=512) + freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), + sr=int(sr_ds), hop_length=512) mixed_spec = (D, freqs, times) except Exception: pass @@ -241,12 +371,10 @@ def 
_prepare_data(record_dir: Path) -> dict: "el_loaded": el_loaded, "el_y_ds": el_y_ds, "el_t": el_t, - "el_sr_ds": el_sr_ds, "mixed_spec": mixed_spec, "el_spec": el_spec, "turns_rel": turns_rel, "pauses_rel": pauses_rel, - "transcript_map": transcript_map, } @@ -259,10 +387,9 @@ def _build_figure(data: dict, show_el_spec: bool = False, title_suffix: str = "") -> go.Figure: - turns_rel = data["turns_rel"] - pauses_rel = data["pauses_rel"] - transcript_map = data["transcript_map"] - plot_xlim = data["plot_xlim"] + turns_rel = data["turns_rel"] + pauses_rel = data["pauses_rel"] + plot_xlim = data["plot_xlim"] # ------------------------------------------------------------------ # # Dynamic row layout @@ -290,8 +417,8 @@ def _build_figure(data: dict, "timeline": 1.5, } - n_rows = len(row_keys) - row_of = {k: i + 1 for i, k in enumerate(row_keys)} + n_rows = len(row_keys) + row_of = {k: i + 1 for i, k in enumerate(row_keys)} row_heights = [_heights[k] for k in row_keys] fig = make_subplots( @@ -320,10 +447,10 @@ def _build_figure(data: dict, # All real traces use showlegend=False + legendgroup for toggling. 
# ------------------------------------------------------------------ # for _name, _color, _symbol in [ - ("User", USER_COLOR, "square"), - ("Assistant", ASST_COLOR, "square"), - ("Silence", "rgba(140,140,140,0.55)", "square"), - ("Pause", "rgba(140,140,140,0.40)", "square-open"), + ("User", USER_COLOR, "square"), + ("Assistant", ASST_COLOR, "square"), + ("Silence", "rgba(140,140,140,0.55)", "square"), + ("Pause", "rgba(140,140,140,0.40)", "square-open"), ]: fig.add_trace(go.Scatter( x=[None], y=[None], mode="markers", @@ -333,43 +460,53 @@ def _build_figure(data: dict, ), row=1, col=1) # ------------------------------------------------------------------ # - # Hover text — per-sample transcript strings + # Hover text — per-sample transcript strings, keyed by turn segment # ------------------------------------------------------------------ # def _hover_texts(time_array: np.ndarray) -> list: if len(time_array) == 0: return [] texts = np.full(len(time_array), "", dtype=object) - tc: dict = {"user": 0, "assistant": 0} + for turn in turns_rel: - if not turn["end"]: - continue - is_asst = turn["user"] == "pipecat_agent" - speaker = "Assistant" if is_asst else "User" - key = "assistant" if is_asst else "user" - tx_list = transcript_map[key] - text = tx_list[tc[key]] if tc[key] < len(tx_list) else "(no transcript)" - tc[key] += 1 - hover = (f"{speaker}
" - f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " - f"({turn['duration']:.1f}s)

" - f"{_wrap(text)}") - mask = (time_array >= turn["start"]) & (time_array <= turn["end"]) - texts[mask] = hover + speaker = "Assistant" if turn["speaker"] == "assistant" else "User" + transcript = turn["transcript_heard"] or turn["transcript_intended"] or "(no transcript)" + + latency_line = "" + if turn["speaker"] == "user" and turn.get("latency_s") is not None: + latency_line = ( + f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" + + (f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "") + ) + + hover = ( + f"Turn\u00a0{turn['turn_id']}\u00a0\u2014\u00a0{speaker}
" + f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " + f"({turn['duration']:.1f}s)" + + latency_line + + f"

{_wrap(transcript)}" + ) + + for seg_s, seg_e in turn["segments"]: + mask = (time_array >= seg_s) & (time_array <= seg_e) + texts[mask] = hover + for pause in pauses_rel: - hover = (f"Pause
" - f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" - f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" - f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}") + hover = ( + f"Pause
" + f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" + f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" + f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}" + ) mask = (time_array >= pause["start"]) & (time_array <= pause["end"]) texts[mask] = hover + return texts.tolist() # ------------------------------------------------------------------ # - # Colour-coded waveform — one Scatter trace per speaker segment + # Colour-coded waveform — one Scatter trace per contiguous segment # ------------------------------------------------------------------ # def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> None: - """Split waveform into per-speaker segments and colour each differently.""" if len(y) == 0: fig.add_annotation( text="No file available", xref="x domain", yref="y domain", @@ -378,16 +515,17 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) return - # Build ordered segment list: (t_start, t_end, label) - turn_segs = sorted( - [(tr["start"], tr["end"], - "asst" if tr["user"] == "pipecat_agent" else "user") - for tr in turns_rel if tr["end"]], + # Flat list of individual speaker audio segments, sorted by start time + all_segs = sorted( + [(s, e, "asst" if turn["speaker"] == "assistant" else "user") + for turn in turns_rel for s, e in turn["segments"]], key=lambda s: s[0], ) + + # Insert gap segments between speaker audio segments: list[tuple] = [] prev_end = 0.0 - for seg_s, seg_e, spk in turn_segs: + for seg_s, seg_e, spk in all_segs: if seg_s > prev_end + 1e-3: segments.append((prev_end, seg_s, "gap")) segments.append((seg_s, seg_e, spk)) @@ -404,7 +542,6 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, if not mask.any(): continue name = _name_map[spk] - fig.add_trace(go.Scatter( x=t[mask].tolist(), y=y[mask].tolist(), mode="lines", @@ -440,7 +577,7 @@ def _spec_row(row: int, spec: tuple, label: str) -> None: showscale=True, ), row=row, col=1) - # Transcript strip at freq_max for hover + # Invisible transcript strip at 
freq_max strip_t = np.asarray(times, dtype=float) freq_max = float(freqs[-1]) fig.add_trace(go.Scatter( @@ -451,11 +588,9 @@ def _spec_row(row: int, spec: tuple, label: str) -> None: hovertemplate="%{text}Transcript", ), row=row, col=1) - # Turn boundary vrects + # Turn boundary vrects (use envelope start/end per turn) for turn in turns_rel: - if not turn["end"]: - continue - color = ASST_FILL if turn["user"] == "pipecat_agent" else USER_FILL + color = ASST_FILL if turn["speaker"] == "assistant" else USER_FILL fig.add_vrect(x0=turn["start"], x1=turn["end"], fillcolor=color, line_width=0, layer="below", row=row, col=1) @@ -506,38 +641,45 @@ def _no_file(row: int) -> None: _no_file(row_of["el_spec"]) fig.update_yaxes(title_text="Freq (Hz)", row=row_of["el_spec"], col=1) - # ---- Timeline ---- - tl: dict = {"user": 0, "assistant": 0} - tl_row = row_of["timeline"] + # ------------------------------------------------------------------ # + # Speaker Turn Timeline + # ------------------------------------------------------------------ # + tl_row = row_of["timeline"] for turn in turns_rel: - if not turn["end"]: - continue - is_asst = turn["user"] == "pipecat_agent" + is_asst = turn["speaker"] == "assistant" speaker = "Assistant" if is_asst else "User" y_pos = 2.0 if is_asst else 1.0 bar_fill = "rgba(232,114,74,0.80)" if is_asst else "rgba(74,144,217,0.80)" bar_line = "rgba(180,70,30,1)" if is_asst else "rgba(30,90,170,1)" - key = "assistant" if is_asst else "user" - - texts = transcript_map[key] - text = texts[tl[key]] if tl[key] < len(texts) else "(no transcript)" - tl[key] += 1 - hover = (f"{speaker}
" - f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " - f"({turn['duration']:.1f}s)

{_wrap(text)}") - - # Visual bar (hoverinfo='skip' — corners are too sparse) - fig.add_trace(go.Scatter( - x=[turn["start"], turn["end"], turn["end"], turn["start"], turn["start"]], - y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], - fill="toself", fillcolor=bar_fill, line=dict(color=bar_line, width=1), - mode="lines", hoverinfo="skip", - name=speaker, legendgroup=speaker, showlegend=False, - ), row=tl_row, col=1) + transcript = turn["transcript_heard"] or turn["transcript_intended"] or "(no transcript)" + latency_line = "" + if not is_asst and turn.get("latency_s") is not None: + latency_line = ( + f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" + + (f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "") + ) + + hover = ( + f"Turn\u00a0{turn['turn_id']}\u00a0\u2014\u00a0{speaker}
" + f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " + f"({turn['duration']:.1f}s)" + + latency_line + + f"

{_wrap(transcript)}" + ) - # Dense hover strip at bar midline (~2 pts/sec, min 5) + # Visual bars — one per segment (handles multi-segment interrupted turns) + for seg_s, seg_e in turn["segments"]: + fig.add_trace(go.Scatter( + x=[seg_s, seg_e, seg_e, seg_s, seg_s], + y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], + fill="toself", fillcolor=bar_fill, line=dict(color=bar_line, width=1), + mode="lines", hoverinfo="skip", + name=speaker, legendgroup=speaker, showlegend=False, + ), row=tl_row, col=1) + + # Dense hover strip across full turn envelope (~2 pts/sec, min 5) n_pts = max(5, int(turn["duration"] * 2)) x_strip = np.linspace(turn["start"], turn["end"], n_pts).tolist() fig.add_trace(go.Scatter( @@ -547,19 +689,44 @@ def _no_file(row: int) -> None: showlegend=False, name="", ), row=tl_row, col=1) + # Duration label on the first (or only) segment + seg0_s, seg0_e = turn["segments"][0] fig.add_annotation( - x=turn["start"] + turn["duration"] / 2, y=y_pos, - text=f"{turn['duration']:.1f}s", + x=seg0_s + (seg0_e - seg0_s) / 2, y=y_pos, + text=f"T{turn['turn_id']}\u00a0{turn['duration']:.1f}s", showarrow=False, font=dict(size=8, color="white"), xref=f"x{tl_row}", yref=f"y{tl_row}", ) - for pause in pauses_rel: - hover = (f"Pause
" - f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" - f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" - f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}") + # Latency arrows: user last-segment-end → assistant first-segment-start + user_by_id = {t["turn_id"]: t for t in turns_rel if t["speaker"] == "user"} + asst_by_id = {t["turn_id"]: t for t in turns_rel if t["speaker"] == "assistant"} + for tid, user_turn in user_by_id.items(): + if not user_turn.get("latency_s") or user_turn["latency_s"] <= 0.05: + continue + asst_turn = asst_by_id.get(tid) + if asst_turn is None: + continue + user_end = user_turn["segments"][-1][1] + asst_start = asst_turn["segments"][0][0] + if asst_start <= user_end: + continue + fig.add_annotation( + x=(user_end + asst_start) / 2, y=1.5, + text=f"\u2194\u00a0{user_turn['latency_s'] * 1000:.0f}ms", + showarrow=False, font=dict(size=7, color="dimgray"), + bgcolor="rgba(255,255,255,0.7)", + xref=f"x{tl_row}", yref=f"y{tl_row}", + ) + # Pause boxes on timeline + for pause in pauses_rel: + hover = ( + f"Pause
" + f"t\u00a0=\u00a0{pause['start']:.2f}s\u2013{pause['end']:.2f}s
" + f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" + f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}" + ) fig.add_trace(go.Scatter( x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], y=[1.15, 1.15, 1.85, 1.85, 1.15], From 3c2bd3995eee4b7d3794cda3b00d1f71abe51660 Mon Sep 17 00:00:00 2001 From: nhhoang96 <10899923+nhhoang96@users.noreply.github.com> Date: Tue, 14 Apr 2026 00:32:47 +0000 Subject: [PATCH 11/23] Apply pre-commit --- apps/analysis.py | 2 +- apps/audio_plots.py | 604 +++++++++++++++++++++++++------------------- 2 files changed, 346 insertions(+), 260 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index fa34eced..ecf60e18 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -21,10 +21,10 @@ from diff_viewer import diff_viewer import eva.metrics # noqa: F401 +from apps.audio_plots import render_audio_analysis_tab from eva.metrics.registry import get_global_registry from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, RecordMetrics -from apps.audio_plots import render_audio_analysis_tab # ============================================================================ # Configuration diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 5b5d07a1..574407e2 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -1,5 +1,4 @@ -""" -Interactive audio visualization for the EVA Streamlit app. +"""Interactive audio visualization for the EVA Streamlit app. Adapted from EVA-Bench/downloads/plot_script/plot_timestamp.py. Renders a Plotly figure directly into a Streamlit tab without writing files. 
@@ -26,30 +25,30 @@ import warnings from pathlib import Path -import numpy as np -import streamlit as st -from pydub import AudioSegment import librosa +import numpy as np import plotly.graph_objects as go +import streamlit as st from plotly.subplots import make_subplots - +from pydub import AudioSegment # ============================================================================= # Colours — visible in both Streamlit light and dark mode # ============================================================================= -USER_COLOR = "#4A90D9" # mid-blue — clear on white & dark -ASST_COLOR = "#E8724A" # orange-red — clear on white & dark -GAP_COLOR = "rgba(140,140,140,0.55)" # neutral gray for silence gaps -USER_FILL = "rgba(74,144,217,0.22)" -ASST_FILL = "rgba(232,114,74,0.22)" -PAUSE_FILL = "rgba(140,140,140,0.18)" +USER_COLOR = "#4A90D9" # mid-blue — clear on white & dark +ASST_COLOR = "#E8724A" # orange-red — clear on white & dark +GAP_COLOR = "rgba(140,140,140,0.55)" # neutral gray for silence gaps +USER_FILL = "rgba(74,144,217,0.22)" +ASST_FILL = "rgba(232,114,74,0.22)" +PAUSE_FILL = "rgba(140,140,140,0.18)" # ============================================================================= # Turn data loading — metrics.json first, elevenlabs_events.jsonl fallback # ============================================================================= + def _load_metrics_context(record_dir: Path) -> dict | None: """Load metrics.json; return None if absent.""" metrics_file = record_dir / "metrics.json" @@ -70,28 +69,24 @@ def _build_turns_from_metrics(metrics_data: dict) -> list[dict] | None: latency_s (user→assistant gap, user turns only), timing_label. 
""" ctx = metrics_data.get("context") or {} - user_ts = ctx.get("audio_timestamps_user_turns") or {} - asst_ts = ctx.get("audio_timestamps_assistant_turns") or {} + user_ts = ctx.get("audio_timestamps_user_turns") or {} + asst_ts = ctx.get("audio_timestamps_assistant_turns") or {} if not user_ts and not asst_ts: return None - transcribed_user = ctx.get("transcribed_user_turns") or {} + transcribed_user = ctx.get("transcribed_user_turns") or {} transcribed_asst = ctx.get("transcribed_assistant_turns") or {} - intended_user = ctx.get("intended_user_turns") or {} - intended_asst = ctx.get("intended_assistant_turns") or {} + intended_user = ctx.get("intended_user_turns") or {} + intended_asst = ctx.get("intended_assistant_turns") or {} # Per-turn latency / timing label from turn_taking metric (if already computed) - metrics = metrics_data.get("metrics") or {} - tt_details = (metrics.get("turn_taking") or {}).get("details") or {} + metrics = metrics_data.get("metrics") or {} + tt_details = (metrics.get("turn_taking") or {}).get("details") or {} per_turn_latency = {int(k): v for k, v in (tt_details.get("per_turn_latency") or {}).items()} - per_turn_labels = {int(k): v for k, v in (tt_details.get("per_turn_judge_timing_ratings") or {}).items()} + per_turn_labels = {int(k): v for k, v in (tt_details.get("per_turn_judge_timing_ratings") or {}).items()} # Reference time: earliest timestamp across all turns - all_starts = [ - segs[0][0] - for segs in list(user_ts.values()) + list(asst_ts.values()) - if segs - ] + all_starts = [segs[0][0] for segs in list(user_ts.values()) + list(asst_ts.values()) if segs] t0 = min(all_starts) if all_starts else 0.0 def _rel(segs: list) -> list[tuple[float, float]]: @@ -104,36 +99,40 @@ def _rel(segs: list) -> list[tuple[float, float]]: continue tid = int(tid_str) rel = _rel(segs) - turns.append({ - "turn_id": tid, - "speaker": "assistant", - "segments": rel, - "start": rel[0][0], - "end": rel[-1][1], - "duration": rel[-1][1] - rel[0][0], - 
"transcript_heard": transcribed_asst.get(tid_str, ""), - "transcript_intended": intended_asst.get(tid_str, ""), - "latency_s": None, - "timing_label": None, - }) + turns.append( + { + "turn_id": tid, + "speaker": "assistant", + "segments": rel, + "start": rel[0][0], + "end": rel[-1][1], + "duration": rel[-1][1] - rel[0][0], + "transcript_heard": transcribed_asst.get(tid_str, ""), + "transcript_intended": intended_asst.get(tid_str, ""), + "latency_s": None, + "timing_label": None, + } + ) for tid_str, segs in user_ts.items(): if not segs: continue tid = int(tid_str) rel = _rel(segs) - turns.append({ - "turn_id": tid, - "speaker": "user", - "segments": rel, - "start": rel[0][0], - "end": rel[-1][1], - "duration": rel[-1][1] - rel[0][0], - "transcript_heard": transcribed_user.get(tid_str, ""), - "transcript_intended": intended_user.get(tid_str, ""), - "latency_s": per_turn_latency.get(tid), - "timing_label": per_turn_labels.get(tid), - }) + turns.append( + { + "turn_id": tid, + "speaker": "user", + "segments": rel, + "start": rel[0][0], + "end": rel[-1][1], + "duration": rel[-1][1] - rel[0][0], + "transcript_heard": transcribed_user.get(tid_str, ""), + "transcript_intended": intended_user.get(tid_str, ""), + "latency_s": per_turn_latency.get(tid), + "timing_label": per_turn_labels.get(tid), + } + ) turns.sort(key=lambda t: t["start"]) return turns @@ -153,9 +152,9 @@ def _parse_elevenlabs_events(events_file: Path) -> list[dict]: active: dict = {} raw: list = [] for ev in audio_events: - user = ev.get("user") + user = ev.get("user") etype = ev.get("event_type") - ts = ev.get("audio_timestamp") + ts = ev.get("audio_timestamp") if etype == "audio_start": if user not in active or active[user].get("end") is not None: active[user] = {"user": user, "start": ts, "end": None} @@ -173,21 +172,23 @@ def _parse_elevenlabs_events(events_file: Path) -> list[dict]: for i, t in enumerate(raw): is_asst = t["user"] == "pipecat_agent" speaker = "assistant" if is_asst else "user" - s_rel 
= t["start"] - t0 - e_rel = t["end"] - t0 - turns.append({ - "turn_id": i, - "speaker": speaker, - "segments": [(s_rel, e_rel)], - "start": s_rel, - "end": e_rel, - "duration": t.get("duration", e_rel - s_rel), - "transcript_heard": "", - "transcript_intended": "", - "latency_s": None, - "timing_label": None, - "_seq_idx": asst_idx if is_asst else user_idx, - }) + s_rel = t["start"] - t0 + e_rel = t["end"] - t0 + turns.append( + { + "turn_id": i, + "speaker": speaker, + "segments": [(s_rel, e_rel)], + "start": s_rel, + "end": e_rel, + "duration": t.get("duration", e_rel - s_rel), + "transcript_heard": "", + "transcript_intended": "", + "latency_s": None, + "timing_label": None, + "_seq_idx": asst_idx if is_asst else user_idx, + } + ) if is_asst: asst_idx += 1 else: @@ -203,15 +204,15 @@ def _patch_fallback_transcripts(turns: list[dict], transcript_file: Path) -> Non for line in f: if line.strip(): entry = json.loads(line) - role = entry.get("type", "") + role = entry.get("type", "") content = entry.get("content", "") if role in tx: tx[role].append(content) for turn in turns: - idx = turn.pop("_seq_idx", 0) - key = "assistant" if turn["speaker"] == "assistant" else "user" + idx = turn.pop("_seq_idx", 0) + key = "assistant" if turn["speaker"] == "assistant" else "user" text = tx[key][idx] if idx < len(tx[key]) else "" - turn["transcript_heard"] = text + turn["transcript_heard"] = text turn["transcript_intended"] = text @@ -223,17 +224,19 @@ def _calculate_pauses(turns_rel: list[dict]) -> list[dict]: ) pauses = [] for i in range(len(all_segs) - 1): - cur_end = all_segs[i][1] + cur_end = all_segs[i][1] nxt_start = all_segs[i + 1][0] gap = nxt_start - cur_end if gap > 0.001: - pauses.append({ - "from_speaker": all_segs[i][2], - "to_speaker": all_segs[i + 1][2], - "start": cur_end, - "end": nxt_start, - "duration_seconds": gap, - }) + pauses.append( + { + "from_speaker": all_segs[i][2], + "to_speaker": all_segs[i + 1][2], + "start": cur_end, + "end": nxt_start, + 
"duration_seconds": gap, + } + ) return pauses @@ -241,12 +244,13 @@ def _calculate_pauses(turns_rel: list[dict]) -> list[dict]: # Audio loading helpers # ============================================================================= + def _load_pydub(path: Path) -> tuple: seg = AudioSegment.from_file(str(path)) if seg.channels > 1: seg = seg.set_channels(1) sr = seg.frame_rate - y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 + y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 return y, sr @@ -259,11 +263,11 @@ def _load_librosa(path: Path) -> tuple: def _downsample(y: np.ndarray, sr: float, target_rate: int = 100) -> tuple: duration = len(y) / sr - target = max(2, int(duration * target_rate)) + target = max(2, int(duration * target_rate)) if len(y) > target: - step = max(1, len(y) // target) - y_ds = y[::step] - sr_ds = sr * len(y_ds) / len(y) + step = max(1, len(y) // target) + y_ds = y[::step] + sr_ds = sr * len(y_ds) / len(y) else: y_ds, sr_ds = y, sr return y_ds, sr_ds @@ -288,11 +292,12 @@ def _wrap(text: str, width: int = 80) -> str: # Data preparation # ============================================================================= + def _prepare_data(record_dir: Path) -> dict: - audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") - audio_el = record_dir / "elevenlabs_audio_recording.mp3" - events_file = record_dir / "elevenlabs_events.jsonl" - transcript = record_dir / "transcript.jsonl" + audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + audio_el = record_dir / "elevenlabs_audio_recording.mp3" + events_file = record_dir / "elevenlabs_events.jsonl" + transcript = record_dir / "transcript.jsonl" # --- Turn data: prefer metrics.json (same source as turn_taking metric) --- turns_rel: list[dict] = [] @@ -314,20 +319,20 @@ def _prepare_data(record_dir: Path) -> dict: if audio_mixed.exists(): try: y_mixed, sr_mixed = _load_pydub(audio_mixed) - 
duration = len(y_mixed) / sr_mixed + duration = len(y_mixed) / sr_mixed mixed_loaded = True except Exception: pass # Use the later of audio duration and last turn end for x-axis - turns_end = max((t["end"] for t in turns_rel), default=0.0) - plot_xlim = [0, max(duration, turns_end, 1.0)] + turns_end = max((t["end"] for t in turns_rel), default=0.0) + plot_xlim = [0, max(duration, turns_end, 1.0)] if mixed_loaded: y_ds, _ = _downsample(y_mixed, sr_mixed) t_mixed = np.linspace(0, duration, len(y_ds)) else: - y_ds = np.array([]) + y_ds = np.array([]) t_mixed = np.array([]) # --- Audio: ElevenLabs --- @@ -336,14 +341,12 @@ def _prepare_data(record_dir: Path) -> dict: if audio_el.exists(): try: _el_y, _el_sr = _load_librosa(audio_el) - el_y_ds, _ = _downsample(_el_y, _el_sr) - el_t = np.linspace(0, len(_el_y) / _el_sr, len(el_y_ds)) - el_loaded = True - D = librosa.amplitude_to_db( - np.abs(librosa.stft(_el_y, hop_length=512, n_fft=2048)), ref=np.max) - freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) - times = librosa.frames_to_time(np.arange(D.shape[1]), - sr=int(_el_sr), hop_length=512) + el_y_ds, _ = _downsample(_el_y, _el_sr) + el_t = np.linspace(0, len(_el_y) / _el_sr, len(el_y_ds)) + el_loaded = True + D = librosa.amplitude_to_db(np.abs(librosa.stft(_el_y, hop_length=512, n_fft=2048)), ref=np.max) + freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), sr=int(_el_sr), hop_length=512) el_spec = (D, freqs, times) except Exception: pass @@ -352,29 +355,27 @@ def _prepare_data(record_dir: Path) -> dict: mixed_spec = None if mixed_loaded and len(y_ds) > 0: try: - sr_ds = sr_mixed * len(y_ds) / len(y_mixed) - D = librosa.amplitude_to_db( - np.abs(librosa.stft(y_ds, hop_length=512, n_fft=2048)), ref=np.max) - freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) - times = librosa.frames_to_time(np.arange(D.shape[1]), - sr=int(sr_ds), hop_length=512) + sr_ds = sr_mixed * len(y_ds) / 
len(y_mixed) + D = librosa.amplitude_to_db(np.abs(librosa.stft(y_ds, hop_length=512, n_fft=2048)), ref=np.max) + freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) + times = librosa.frames_to_time(np.arange(D.shape[1]), sr=int(sr_ds), hop_length=512) mixed_spec = (D, freqs, times) except Exception: pass return { - "duration": duration, - "plot_xlim": plot_xlim, + "duration": duration, + "plot_xlim": plot_xlim, "mixed_loaded": mixed_loaded, - "y_ds": y_ds, - "t_mixed": t_mixed, - "el_loaded": el_loaded, - "el_y_ds": el_y_ds, - "el_t": el_t, - "mixed_spec": mixed_spec, - "el_spec": el_spec, - "turns_rel": turns_rel, - "pauses_rel": pauses_rel, + "y_ds": y_ds, + "t_mixed": t_mixed, + "el_loaded": el_loaded, + "el_y_ds": el_y_ds, + "el_t": el_t, + "mixed_spec": mixed_spec, + "el_spec": el_spec, + "turns_rel": turns_rel, + "pauses_rel": pauses_rel, } @@ -382,14 +383,14 @@ def _prepare_data(record_dir: Path) -> dict: # Plotly figure builder # ============================================================================= -def _build_figure(data: dict, - show_mixed_spec: bool = False, - show_el_spec: bool = False, - title_suffix: str = "") -> go.Figure: - turns_rel = data["turns_rel"] +def _build_figure( + data: dict, show_mixed_spec: bool = False, show_el_spec: bool = False, title_suffix: str = "" +) -> go.Figure: + + turns_rel = data["turns_rel"] pauses_rel = data["pauses_rel"] - plot_xlim = data["plot_xlim"] + plot_xlim = data["plot_xlim"] # ------------------------------------------------------------------ # # Dynamic row layout @@ -404,25 +405,26 @@ def _build_figure(data: dict, _titles = { "mixed_waveform": "Waveform \u2014 audio_mixed.wav", - "mixed_spec": "Spectrogram \u2014 audio_mixed.wav", - "el_waveform": "Waveform \u2014 elevenlabs_audio_recording.mp3", - "el_spec": "Spectrogram \u2014 elevenlabs_audio_recording.mp3", - "timeline": "Speaker Turn Timeline", + "mixed_spec": "Spectrogram \u2014 audio_mixed.wav", + "el_waveform": "Waveform \u2014 
elevenlabs_audio_recording.mp3", + "el_spec": "Spectrogram \u2014 elevenlabs_audio_recording.mp3", + "timeline": "Speaker Turn Timeline", } _heights = { "mixed_waveform": 1.5, - "mixed_spec": 1.3, - "el_waveform": 1.5, - "el_spec": 1.3, - "timeline": 1.5, + "mixed_spec": 1.3, + "el_waveform": 1.5, + "el_spec": 1.3, + "timeline": 1.5, } - n_rows = len(row_keys) - row_of = {k: i + 1 for i, k in enumerate(row_keys)} + n_rows = len(row_keys) + row_of = {k: i + 1 for i, k in enumerate(row_keys)} row_heights = [_heights[k] for k in row_keys] fig = make_subplots( - rows=n_rows, cols=1, + rows=n_rows, + cols=1, shared_xaxes=True, subplot_titles=[_titles[k] for k in row_keys], row_heights=row_heights, @@ -437,8 +439,13 @@ def _build_figure(data: dict, height=max(500, 320 * n_rows), hovermode="closest", legend=dict( - orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, - bordercolor="rgba(128,128,128,0.4)", borderwidth=1, + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1, + bordercolor="rgba(128,128,128,0.4)", + borderwidth=1, ), ) @@ -447,17 +454,24 @@ def _build_figure(data: dict, # All real traces use showlegend=False + legendgroup for toggling. 
# ------------------------------------------------------------------ # for _name, _color, _symbol in [ - ("User", USER_COLOR, "square"), - ("Assistant", ASST_COLOR, "square"), - ("Silence", "rgba(140,140,140,0.55)", "square"), - ("Pause", "rgba(140,140,140,0.40)", "square-open"), + ("User", USER_COLOR, "square"), + ("Assistant", ASST_COLOR, "square"), + ("Silence", "rgba(140,140,140,0.55)", "square"), + ("Pause", "rgba(140,140,140,0.40)", "square-open"), ]: - fig.add_trace(go.Scatter( - x=[None], y=[None], mode="markers", - marker=dict(color=_color, size=12, symbol=_symbol, - line=dict(color=_color, width=2)), - name=_name, legendgroup=_name, showlegend=True, - ), row=1, col=1) + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="markers", + marker=dict(color=_color, size=12, symbol=_symbol, line=dict(color=_color, width=2)), + name=_name, + legendgroup=_name, + showlegend=True, + ), + row=1, + col=1, + ) # ------------------------------------------------------------------ # # Hover text — per-sample transcript strings, keyed by turn segment @@ -468,22 +482,19 @@ def _hover_texts(time_array: np.ndarray) -> list: texts = np.full(len(time_array), "", dtype=object) for turn in turns_rel: - speaker = "Assistant" if turn["speaker"] == "assistant" else "User" + speaker = "Assistant" if turn["speaker"] == "assistant" else "User" transcript = turn["transcript_heard"] or turn["transcript_intended"] or "(no transcript)" latency_line = "" if turn["speaker"] == "user" and turn.get("latency_s") is not None: - latency_line = ( - f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" - + (f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "") + latency_line = f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" + ( + f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "" ) hover = ( f"Turn\u00a0{turn['turn_id']}\u00a0\u2014\u00a0{speaker}
" f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " - f"({turn['duration']:.1f}s)" - + latency_line - + f"

{_wrap(transcript)}" + f"({turn['duration']:.1f}s)" + latency_line + f"

{_wrap(transcript)}" ) for seg_s, seg_e in turn["segments"]: @@ -505,20 +516,29 @@ def _hover_texts(time_array: np.ndarray) -> list: # ------------------------------------------------------------------ # # Colour-coded waveform — one Scatter trace per contiguous segment # ------------------------------------------------------------------ # - def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, - y_range: list) -> None: + def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> None: if len(y) == 0: fig.add_annotation( - text="No file available", xref="x domain", yref="y domain", - x=0.5, y=0.5, showarrow=False, font=dict(color="gray", size=11), - row=row, col=1) + text="No file available", + xref="x domain", + yref="y domain", + x=0.5, + y=0.5, + showarrow=False, + font=dict(color="gray", size=11), + row=row, + col=1, + ) fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) return # Flat list of individual speaker audio segments, sorted by start time all_segs = sorted( - [(s, e, "asst" if turn["speaker"] == "assistant" else "user") - for turn in turns_rel for s, e in turn["segments"]], + [ + (s, e, "asst" if turn["speaker"] == "assistant" else "user") + for turn in turns_rel + for s, e in turn["segments"] + ], key=lambda s: s[0], ) @@ -535,28 +555,35 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, segments.append((prev_end, duration, "gap")) _color_map = {"user": USER_COLOR, "asst": ASST_COLOR, "gap": GAP_COLOR} - _name_map = {"user": "User", "asst": "Assistant", "gap": "Silence"} + _name_map = {"user": "User", "asst": "Assistant", "gap": "Silence"} for seg_s, seg_e, spk in segments: mask = (t >= seg_s) & (t <= seg_e) if not mask.any(): continue name = _name_map[spk] - fig.add_trace(go.Scatter( - x=t[mask].tolist(), y=y[mask].tolist(), - mode="lines", - line=dict(width=1.0, color=_color_map[spk]), - opacity=0.85 if spk != "gap" else 0.45, - name=name, legendgroup=name, showlegend=False, - 
text=_hover_texts(t[mask]), - hovertemplate="%{text}", - ), row=row, col=1) + fig.add_trace( + go.Scatter( + x=t[mask].tolist(), + y=y[mask].tolist(), + mode="lines", + line=dict(width=1.0, color=_color_map[spk]), + opacity=0.85 if spk != "gap" else 0.45, + name=name, + legendgroup=name, + showlegend=False, + text=_hover_texts(t[mask]), + hovertemplate="%{text}", + ), + row=row, + col=1, + ) # Pause vrects (visual only) for pause in pauses_rel: - fig.add_vrect(x0=pause["start"], x1=pause["end"], - fillcolor=PAUSE_FILL, line_width=0, layer="below", - row=row, col=1) + fig.add_vrect( + x0=pause["start"], x1=pause["end"], fillcolor=PAUSE_FILL, line_width=0, layer="below", row=row, col=1 + ) fig.update_yaxes(title_text="Amplitude", range=y_range, row=row, col=1) @@ -566,46 +593,65 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, def _spec_row(row: int, spec: tuple, label: str) -> None: D, freqs, times = spec - fig.add_trace(go.Heatmap( - z=D, x=times, y=freqs, - colorscale="Viridis", zmin=-80, zmax=0, - colorbar=dict(title="dB", thickness=12, len=0.12, x=1.01), - hovertemplate=( - "t=%{x:.2f}s freq=%{y:.0f}Hz %{z:.1f}dB" - "" + label + "" + fig.add_trace( + go.Heatmap( + z=D, + x=times, + y=freqs, + colorscale="Viridis", + zmin=-80, + zmax=0, + colorbar=dict(title="dB", thickness=12, len=0.12, x=1.01), + hovertemplate=("t=%{x:.2f}s freq=%{y:.0f}Hz %{z:.1f}dB" + label + ""), + showscale=True, ), - showscale=True, - ), row=row, col=1) + row=row, + col=1, + ) # Invisible transcript strip at freq_max - strip_t = np.asarray(times, dtype=float) + strip_t = np.asarray(times, dtype=float) freq_max = float(freqs[-1]) - fig.add_trace(go.Scatter( - x=strip_t.tolist(), y=[freq_max] * len(strip_t), - mode="markers", marker=dict(opacity=0, size=6), - showlegend=False, name="", - text=_hover_texts(strip_t), - hovertemplate="%{text}Transcript", - ), row=row, col=1) + fig.add_trace( + go.Scatter( + x=strip_t.tolist(), + y=[freq_max] * len(strip_t), + mode="markers", + 
marker=dict(opacity=0, size=6), + showlegend=False, + name="", + text=_hover_texts(strip_t), + hovertemplate="%{text}Transcript", + ), + row=row, + col=1, + ) # Turn boundary vrects (use envelope start/end per turn) for turn in turns_rel: color = ASST_FILL if turn["speaker"] == "assistant" else USER_FILL - fig.add_vrect(x0=turn["start"], x1=turn["end"], - fillcolor=color, line_width=0, layer="below", - row=row, col=1) + fig.add_vrect( + x0=turn["start"], x1=turn["end"], fillcolor=color, line_width=0, layer="below", row=row, col=1 + ) for pause in pauses_rel: - fig.add_vrect(x0=pause["start"], x1=pause["end"], - fillcolor=PAUSE_FILL, line_width=0, layer="below", - row=row, col=1) + fig.add_vrect( + x0=pause["start"], x1=pause["end"], fillcolor=PAUSE_FILL, line_width=0, layer="below", row=row, col=1 + ) fig.update_yaxes(title_text="Freq (Hz)", row=row, col=1) def _no_file(row: int) -> None: fig.add_annotation( - text="No file available", xref="x domain", yref="y domain", - x=0.5, y=0.5, showarrow=False, font=dict(color="gray", size=11), - row=row, col=1) + text="No file available", + xref="x domain", + yref="y domain", + x=0.5, + y=0.5, + showarrow=False, + font=dict(color="gray", size=11), + row=row, + col=1, + ) # ---- Mixed waveform ---- if data["mixed_loaded"] and len(data["y_ds"]) > 0: @@ -613,8 +659,7 @@ def _no_file(row: int) -> None: _colored_waveform(row_of["mixed_waveform"], data["y_ds"], data["t_mixed"], y_range) else: _no_file(row_of["mixed_waveform"]) - fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], - row=row_of["mixed_waveform"], col=1) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row_of["mixed_waveform"], col=1) # ---- Mixed spectrogram (optional) ---- if "mixed_spec" in row_of: @@ -630,8 +675,7 @@ def _no_file(row: int) -> None: _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range) else: _no_file(row_of["el_waveform"]) - fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], - 
row=row_of["el_waveform"], col=1) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row_of["el_waveform"], col=1) # ---- ElevenLabs spectrogram (optional) ---- if "el_spec" in row_of: @@ -647,55 +691,72 @@ def _no_file(row: int) -> None: tl_row = row_of["timeline"] for turn in turns_rel: - is_asst = turn["speaker"] == "assistant" - speaker = "Assistant" if is_asst else "User" - y_pos = 2.0 if is_asst else 1.0 + is_asst = turn["speaker"] == "assistant" + speaker = "Assistant" if is_asst else "User" + y_pos = 2.0 if is_asst else 1.0 bar_fill = "rgba(232,114,74,0.80)" if is_asst else "rgba(74,144,217,0.80)" - bar_line = "rgba(180,70,30,1)" if is_asst else "rgba(30,90,170,1)" + bar_line = "rgba(180,70,30,1)" if is_asst else "rgba(30,90,170,1)" - transcript = turn["transcript_heard"] or turn["transcript_intended"] or "(no transcript)" + transcript = turn["transcript_heard"] or turn["transcript_intended"] or "(no transcript)" latency_line = "" if not is_asst and turn.get("latency_s") is not None: - latency_line = ( - f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" - + (f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "") + latency_line = f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" + ( + f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "" ) hover = ( f"Turn\u00a0{turn['turn_id']}\u00a0\u2014\u00a0{speaker}
" f"t\u00a0=\u00a0{turn['start']:.2f}s\u2013{turn['end']:.2f}s " - f"({turn['duration']:.1f}s)" - + latency_line - + f"

{_wrap(transcript)}" + f"({turn['duration']:.1f}s)" + latency_line + f"

{_wrap(transcript)}" ) # Visual bars — one per segment (handles multi-segment interrupted turns) for seg_s, seg_e in turn["segments"]: - fig.add_trace(go.Scatter( - x=[seg_s, seg_e, seg_e, seg_s, seg_s], - y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], - fill="toself", fillcolor=bar_fill, line=dict(color=bar_line, width=1), - mode="lines", hoverinfo="skip", - name=speaker, legendgroup=speaker, showlegend=False, - ), row=tl_row, col=1) + fig.add_trace( + go.Scatter( + x=[seg_s, seg_e, seg_e, seg_s, seg_s], + y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], + fill="toself", + fillcolor=bar_fill, + line=dict(color=bar_line, width=1), + mode="lines", + hoverinfo="skip", + name=speaker, + legendgroup=speaker, + showlegend=False, + ), + row=tl_row, + col=1, + ) # Dense hover strip across full turn envelope (~2 pts/sec, min 5) - n_pts = max(5, int(turn["duration"] * 2)) + n_pts = max(5, int(turn["duration"] * 2)) x_strip = np.linspace(turn["start"], turn["end"], n_pts).tolist() - fig.add_trace(go.Scatter( - x=x_strip, y=[y_pos] * n_pts, - mode="markers", marker=dict(opacity=0, size=10), - hovertext=hover, hoverinfo="text", - showlegend=False, name="", - ), row=tl_row, col=1) + fig.add_trace( + go.Scatter( + x=x_strip, + y=[y_pos] * n_pts, + mode="markers", + marker=dict(opacity=0, size=10), + hovertext=hover, + hoverinfo="text", + showlegend=False, + name="", + ), + row=tl_row, + col=1, + ) # Duration label on the first (or only) segment seg0_s, seg0_e = turn["segments"][0] fig.add_annotation( - x=seg0_s + (seg0_e - seg0_s) / 2, y=y_pos, + x=seg0_s + (seg0_e - seg0_s) / 2, + y=y_pos, text=f"T{turn['turn_id']}\u00a0{turn['duration']:.1f}s", - showarrow=False, font=dict(size=8, color="white"), - xref=f"x{tl_row}", yref=f"y{tl_row}", + showarrow=False, + font=dict(size=8, color="white"), + xref=f"x{tl_row}", + yref=f"y{tl_row}", ) # Latency arrows: user last-segment-end → assistant first-segment-start @@ -707,16 +768,19 @@ 
def _no_file(row: int) -> None: asst_turn = asst_by_id.get(tid) if asst_turn is None: continue - user_end = user_turn["segments"][-1][1] + user_end = user_turn["segments"][-1][1] asst_start = asst_turn["segments"][0][0] if asst_start <= user_end: continue fig.add_annotation( - x=(user_end + asst_start) / 2, y=1.5, + x=(user_end + asst_start) / 2, + y=1.5, text=f"\u2194\u00a0{user_turn['latency_s'] * 1000:.0f}ms", - showarrow=False, font=dict(size=7, color="dimgray"), + showarrow=False, + font=dict(size=7, color="dimgray"), bgcolor="rgba(255,255,255,0.7)", - xref=f"x{tl_row}", yref=f"y{tl_row}", + xref=f"x{tl_row}", + yref=f"y{tl_row}", ) # Pause boxes on timeline @@ -727,44 +791,65 @@ def _no_file(row: int) -> None: f"Duration:\u00a0{pause['duration_seconds'] * 1000:.0f}\u00a0ms
" f"{pause['from_speaker']}\u00a0\u2192\u00a0{pause['to_speaker']}" ) - fig.add_trace(go.Scatter( - x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], - y=[1.15, 1.15, 1.85, 1.85, 1.15], - fill="toself", fillcolor="rgba(140,140,140,0.40)", - line=dict(color="rgba(180,60,60,0.8)", width=1, dash="dash"), - mode="lines", hoverinfo="skip", - name="Pause", legendgroup="Pause", showlegend=False, - ), row=tl_row, col=1) - - n_pts = max(5, int(pause["duration_seconds"] * 2)) + fig.add_trace( + go.Scatter( + x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], + y=[1.15, 1.15, 1.85, 1.85, 1.15], + fill="toself", + fillcolor="rgba(140,140,140,0.40)", + line=dict(color="rgba(180,60,60,0.8)", width=1, dash="dash"), + mode="lines", + hoverinfo="skip", + name="Pause", + legendgroup="Pause", + showlegend=False, + ), + row=tl_row, + col=1, + ) + + n_pts = max(5, int(pause["duration_seconds"] * 2)) x_strip = np.linspace(pause["start"], pause["end"], n_pts).tolist() - fig.add_trace(go.Scatter( - x=x_strip, y=[1.5] * n_pts, - mode="markers", marker=dict(opacity=0, size=10), - hovertext=hover, hoverinfo="text", - showlegend=False, name="", - ), row=tl_row, col=1) + fig.add_trace( + go.Scatter( + x=x_strip, + y=[1.5] * n_pts, + mode="markers", + marker=dict(opacity=0, size=10), + hovertext=hover, + hoverinfo="text", + showlegend=False, + name="", + ), + row=tl_row, + col=1, + ) fig.add_annotation( - x=pause["start"] + pause["duration_seconds"] / 2, y=1.5, + x=pause["start"] + pause["duration_seconds"] / 2, + y=1.5, text=f"{pause['duration_seconds'] * 1000:.0f}ms", - showarrow=False, font=dict(size=7, color="dimgray"), + showarrow=False, + font=dict(size=7, color="dimgray"), bgcolor="rgba(255,255,255,0.7)", - xref=f"x{tl_row}", yref=f"y{tl_row}", + xref=f"x{tl_row}", + yref=f"y{tl_row}", ) fig.update_yaxes( - tickvals=[1, 2], ticktext=["User", "Assistant"], range=[0.5, 2.5], - title_text="Speaker", row=tl_row, col=1, + tickvals=[1, 
2], + ticktext=["User", "Assistant"], + range=[0.5, 2.5], + title_text="Speaker", + row=tl_row, + col=1, ) fig.update_xaxes(title_text="Time (seconds)", row=tl_row, col=1) # Shared x-range + grid for all rows for r in range(1, n_rows + 1): - fig.update_xaxes(range=plot_xlim, showgrid=True, - gridcolor="rgba(128,128,128,0.15)", row=r, col=1) - fig.update_yaxes(showgrid=True, - gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + fig.update_xaxes(range=plot_xlim, showgrid=True, gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + fig.update_yaxes(showgrid=True, gridcolor="rgba(128,128,128,0.15)", row=r, col=1) return fig @@ -773,6 +858,7 @@ def _no_file(row: int) -> None: # Streamlit tab renderer # ============================================================================= + def render_audio_analysis_tab(record_dir: Path) -> None: """Render the Audio Analysis tab for a given record / trial directory.""" st.markdown("### Audio Analysis") From aa4c67babf67b6a70bde44cdc53ba75d921f34a5 Mon Sep 17 00:00:00 2001 From: hoang Date: Tue, 14 Apr 2026 00:52:30 +0000 Subject: [PATCH 12/23] Update package dependencies and remove warnings of deprecations --- apps/audio_plots.py | 2 +- uv.lock | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 5b5d07a1..2e119230 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -801,6 +801,6 @@ def _cached(path_str: str, mixed_spec: bool, el_spec: bool) -> go.Figure: try: fig = _cached(str(record_dir), show_mixed_spec, show_el_spec) - st.plotly_chart(fig, use_container_width=True, theme="streamlit") + st.plotly_chart(fig, width="stretch", theme="streamlit") except Exception as exc: st.error(f"Could not render audio plot: {exc}") diff --git a/uv.lock b/uv.lock index 709a8c1b..f5ec6cca 100644 --- a/uv.lock +++ b/uv.lock @@ -1722,6 +1722,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" }, ] +[[package]] +name = "marshmallow" +version = "3.26.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/79/de6c16cc902f4fc372236926b0ce2ab7845268dcc30fb2fbb7f71b418631/marshmallow-3.26.2.tar.gz", hash = "sha256:bbe2adb5a03e6e3571b573f42527c6fe926e17467833660bebd11593ab8dfd57", size = 222095, upload-time = "2025-12-22T06:53:53.309303Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/2f/5108cb3ee4ba6501748c4908b908e55f42a5b66245b4cfe0c99326e1ef6e/marshmallow-3.26.2-py3-none-any.whl", hash = "sha256:013fa8a3c4c276c24d26d84ce934dc964e2aa794345a0f8c7e5a7191482c8a73", size = 50964, upload-time = "2025-12-22T06:53:51.801503Z" }, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -3577,6 +3589,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] +[[package]] +name = "typing-inspect" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mypy-extensions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/74/1789779d91f1961fa9438e9a8710cdae6bd138c80d7303996933d117264a/typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78", size = 13825, upload-time = "2023-05-24T20:25:47.612134Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f", size = 8827, upload-time = "2023-05-24T20:25:45.287357Z" }, +] + [[package]] name = "typing-inspection" version = "0.4.2" From 969e692bb5c632cbf6245b13c7933be885ace666 Mon Sep 17 00:00:00 2001 From: hoang Date: Tue, 14 Apr 2026 01:13:22 +0000 Subject: [PATCH 13/23] Update fixes to pre-commit --- apps/audio_plots.py | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index e034d00f..62d3bd38 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -432,21 +432,21 @@ def _build_figure( ) fig.update_layout( - title=dict( - text=f"Speaker Turn Analysis \u2014 Pause Detection{title_suffix}", - font=dict(size=15), - ), + title={ + "text": f"Speaker Turn Analysis \u2014 Pause Detection{title_suffix}", + "font": {"size": 15}, + }, height=max(500, 320 * n_rows), hovermode="closest", - legend=dict( - orientation="h", - yanchor="bottom", - y=1.02, - xanchor="right", - x=1, - bordercolor="rgba(128,128,128,0.4)", - borderwidth=1, - ), + legend={ + "orientation": "h", + "yanchor": "bottom", + "y": 1.02, + "xanchor": "right", + "x": 1, + "bordercolor": "rgba(128,128,128,0.4)", + "borderwidth": 1, + }, ) # ------------------------------------------------------------------ # @@ -464,7 +464,7 @@ def _build_figure( x=[None], y=[None], mode="markers", - marker=dict(color=_color, size=12, symbol=_symbol, line=dict(color=_color, width=2)), + marker={"color": _color, "size": 12, "symbol": _symbol, "line": {"color": _color, "width": 2}}, name=_name, legendgroup=_name, showlegend=True, @@ -525,7 +525,7 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> x=0.5, y=0.5, showarrow=False, - font=dict(color="gray", size=11), + 
font={"color": "gray", "size": 11}, row=row, col=1, ) @@ -567,7 +567,7 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> x=t[mask].tolist(), y=y[mask].tolist(), mode="lines", - line=dict(width=1.0, color=_color_map[spk]), + line={"width": 1.0, "color": _color_map[spk]}, opacity=0.85 if spk != "gap" else 0.45, name=name, legendgroup=name, @@ -601,7 +601,7 @@ def _spec_row(row: int, spec: tuple, label: str) -> None: colorscale="Viridis", zmin=-80, zmax=0, - colorbar=dict(title="dB", thickness=12, len=0.12, x=1.01), + colorbar={"title": "dB", "thickness": 12, "len": 0.12, "x": 1.01}, hovertemplate=("t=%{x:.2f}s freq=%{y:.0f}Hz %{z:.1f}dB" + label + ""), showscale=True, ), @@ -617,7 +617,7 @@ def _spec_row(row: int, spec: tuple, label: str) -> None: x=strip_t.tolist(), y=[freq_max] * len(strip_t), mode="markers", - marker=dict(opacity=0, size=6), + marker={"opacity": 0, "size": 6}, showlegend=False, name="", text=_hover_texts(strip_t), @@ -648,7 +648,7 @@ def _no_file(row: int) -> None: x=0.5, y=0.5, showarrow=False, - font=dict(color="gray", size=11), + font={"color": "gray", "size": 11}, row=row, col=1, ) @@ -718,7 +718,7 @@ def _no_file(row: int) -> None: y=[y_pos - 0.38, y_pos - 0.38, y_pos + 0.38, y_pos + 0.38, y_pos - 0.38], fill="toself", fillcolor=bar_fill, - line=dict(color=bar_line, width=1), + line={"color": bar_line, "width": 1}, mode="lines", hoverinfo="skip", name=speaker, @@ -737,7 +737,7 @@ def _no_file(row: int) -> None: x=x_strip, y=[y_pos] * n_pts, mode="markers", - marker=dict(opacity=0, size=10), + marker={"opacity": 0, "size": 10}, hovertext=hover, hoverinfo="text", showlegend=False, @@ -754,7 +754,7 @@ def _no_file(row: int) -> None: y=y_pos, text=f"T{turn['turn_id']}\u00a0{turn['duration']:.1f}s", showarrow=False, - font=dict(size=8, color="white"), + font={"size": 8, "color": "white"}, xref=f"x{tl_row}", yref=f"y{tl_row}", ) @@ -777,7 +777,7 @@ def _no_file(row: int) -> None: y=1.5, 
text=f"\u2194\u00a0{user_turn['latency_s'] * 1000:.0f}ms", showarrow=False, - font=dict(size=7, color="dimgray"), + font={"size": 7, "color": "dimgray"}, bgcolor="rgba(255,255,255,0.7)", xref=f"x{tl_row}", yref=f"y{tl_row}", @@ -797,7 +797,7 @@ def _no_file(row: int) -> None: y=[1.15, 1.15, 1.85, 1.85, 1.15], fill="toself", fillcolor="rgba(140,140,140,0.40)", - line=dict(color="rgba(180,60,60,0.8)", width=1, dash="dash"), + line={"color": "rgba(180,60,60,0.8)", "width": 1, "dash": "dash"}, mode="lines", hoverinfo="skip", name="Pause", @@ -815,7 +815,7 @@ def _no_file(row: int) -> None: x=x_strip, y=[1.5] * n_pts, mode="markers", - marker=dict(opacity=0, size=10), + marker={"opacity": 0, "size": 10}, hovertext=hover, hoverinfo="text", showlegend=False, @@ -830,7 +830,7 @@ def _no_file(row: int) -> None: y=1.5, text=f"{pause['duration_seconds'] * 1000:.0f}ms", showarrow=False, - font=dict(size=7, color="dimgray"), + font={"size": 7, "color": "dimgray"}, bgcolor="rgba(255,255,255,0.7)", xref=f"x{tl_row}", yref=f"y{tl_row}", From 603e41f1213b305c1bb307913ad8222ad890aa62 Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 15:48:31 +0000 Subject: [PATCH 14/23] Update preloading for faster experience when switching between records. Change empty EL plots to warning textbox when the file is not available for that record. 
--- apps/analysis.py | 6 +- apps/audio_plots.py | 152 ++++++++++++++++++++++++++++++++------------ 2 files changed, 116 insertions(+), 42 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index ecf60e18..6f3cedfb 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -21,7 +21,7 @@ from diff_viewer import diff_viewer import eva.metrics # noqa: F401 -from apps.audio_plots import render_audio_analysis_tab +from apps.audio_plots import preload_audio_data, render_audio_analysis_tab from eva.metrics.registry import get_global_registry from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, RecordMetrics @@ -1813,6 +1813,10 @@ def render_record_detail(selected_run_dir: Path): st.divider() + # Pre-load audio data before the tabs so the cache is warm when the user + # opens the Audio Analysis tab (or switches trials). + preload_audio_data(selected_record_dir) + # Tabs tab1, tab2, tab3, tab4, tab5 = st.tabs( [ diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 62d3bd38..d68111b2 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -288,6 +288,18 @@ def _wrap(text: str, width: int = 80) -> str: return "
".join(lines) +# ============================================================================= +# Spectrogram parameters +# ============================================================================= + +# Intermediate sample rate used for spectrogram computation. +# 4 kHz preserves speech content up to 2 kHz (Nyquist) while keeping the +# heatmap to roughly 60–250K cells for typical 5–90 s recordings. +_SPEC_SR = 4000 # Hz +_SPEC_N_FFT = 512 # → 257 freq bins, 7.8 Hz resolution +_SPEC_HOP = 512 # → ~0.128 s/frame at 4 kHz + + # ============================================================================= # Data preparation # ============================================================================= @@ -344,21 +356,34 @@ def _prepare_data(record_dir: Path) -> dict: el_y_ds, _ = _downsample(_el_y, _el_sr) el_t = np.linspace(0, len(_el_y) / _el_sr, len(el_y_ds)) el_loaded = True - D = librosa.amplitude_to_db(np.abs(librosa.stft(_el_y, hop_length=512, n_fft=2048)), ref=np.max) - freqs = librosa.fft_frequencies(sr=int(_el_sr), n_fft=2048) - times = librosa.frames_to_time(np.arange(D.shape[1]), sr=int(_el_sr), hop_length=512) - el_spec = (D, freqs, times) + # Spectrogram: resample to _SPEC_SR (4 kHz) to get meaningful + # frequency content (0–2 kHz Nyquist) with a bounded heatmap. + # Times from frames_to_time start at 0 — aligned with el_t. + try: + _el_y_spec = librosa.resample(_el_y, orig_sr=_el_sr, target_sr=_SPEC_SR) + D = librosa.amplitude_to_db( + np.abs(librosa.stft(_el_y_spec, hop_length=_SPEC_HOP, n_fft=_SPEC_N_FFT)), ref=np.max + ) + freqs = librosa.fft_frequencies(sr=_SPEC_SR, n_fft=_SPEC_N_FFT) + times = librosa.frames_to_time(np.arange(D.shape[1]), sr=_SPEC_SR, hop_length=_SPEC_HOP) + el_spec = (D, freqs, times) + except Exception: + pass except Exception: pass # --- Spectrogram: mixed --- + # Resample to _SPEC_SR so both spectrograms share the same frequency axis + # and have meaningful content. Times start at 0 — aligned with t_mixed. 
mixed_spec = None - if mixed_loaded and len(y_ds) > 0: + if mixed_loaded: try: - sr_ds = sr_mixed * len(y_ds) / len(y_mixed) - D = librosa.amplitude_to_db(np.abs(librosa.stft(y_ds, hop_length=512, n_fft=2048)), ref=np.max) - freqs = librosa.fft_frequencies(sr=int(sr_ds), n_fft=2048) - times = librosa.frames_to_time(np.arange(D.shape[1]), sr=int(sr_ds), hop_length=512) + _y_spec = librosa.resample(y_mixed, orig_sr=sr_mixed, target_sr=_SPEC_SR) + D = librosa.amplitude_to_db( + np.abs(librosa.stft(_y_spec, hop_length=_SPEC_HOP, n_fft=_SPEC_N_FFT)), ref=np.max + ) + freqs = librosa.fft_frequencies(sr=_SPEC_SR, n_fft=_SPEC_N_FFT) + times = librosa.frames_to_time(np.arange(D.shape[1]), sr=_SPEC_SR, hop_length=_SPEC_HOP) mixed_spec = (D, freqs, times) except Exception: pass @@ -398,9 +423,10 @@ def _build_figure( row_keys: list[str] = ["mixed_waveform"] if show_mixed_spec and data["mixed_spec"]: row_keys.append("mixed_spec") - row_keys.append("el_waveform") - if show_el_spec and data["el_spec"]: - row_keys.append("el_spec") + if data["el_loaded"]: + row_keys.append("el_waveform") + if show_el_spec and data["el_spec"]: + row_keys.append("el_spec") row_keys.append("timeline") _titles = { @@ -436,7 +462,7 @@ def _build_figure( "text": f"Speaker Turn Analysis \u2014 Pause Detection{title_suffix}", "font": {"size": 15}, }, - height=max(500, 320 * n_rows), + height=max(700, 420 * n_rows), hovermode="closest", legend={ "orientation": "h", @@ -516,7 +542,9 @@ def _hover_texts(time_array: np.ndarray) -> list: # ------------------------------------------------------------------ # # Colour-coded waveform — one Scatter trace per contiguous segment # ------------------------------------------------------------------ # - def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> None: + def _colored_waveform( + row: int, y: np.ndarray, t: np.ndarray, y_range: list, speaker_filter: set[str] | None = None + ) -> None: if len(y) == 0: fig.add_annotation( text="No file 
available", @@ -532,11 +560,14 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) return - # Flat list of individual speaker audio segments, sorted by start time + # Flat list of individual speaker audio segments, sorted by start time. + # speaker_filter restricts which speakers are coloured (e.g. {"assistant"} for + # the ElevenLabs recording which only contains TTS audio). + visible_turns = [turn for turn in turns_rel if speaker_filter is None or turn["speaker"] in speaker_filter] all_segs = sorted( [ (s, e, "asst" if turn["speaker"] == "assistant" else "user") - for turn in turns_rel + for turn in visible_turns for s, e in turn["segments"] ], key=lambda s: s[0], @@ -590,7 +621,7 @@ def _colored_waveform(row: int, y: np.ndarray, t: np.ndarray, y_range: list) -> # ------------------------------------------------------------------ # # Spectrogram row — heatmap + invisible transcript strip # ------------------------------------------------------------------ # - def _spec_row(row: int, spec: tuple, label: str) -> None: + def _spec_row(row: int, spec: tuple, label: str, speaker_filter: set[str] | None = None) -> None: D, freqs, times = spec fig.add_trace( @@ -627,8 +658,10 @@ def _spec_row(row: int, spec: tuple, label: str) -> None: col=1, ) - # Turn boundary vrects (use envelope start/end per turn) - for turn in turns_rel: + # Turn boundary vrects (use envelope start/end per turn). + # Restrict to speaker_filter when set (e.g. EL spectrogram only shows assistant turns). 
+ visible_turns = [turn for turn in turns_rel if speaker_filter is None or turn["speaker"] in speaker_filter] + for turn in visible_turns: color = ASST_FILL if turn["speaker"] == "assistant" else USER_FILL fig.add_vrect( x0=turn["start"], x1=turn["end"], fillcolor=color, line_width=0, layer="below", row=row, col=1 @@ -669,18 +702,22 @@ def _no_file(row: int) -> None: _no_file(row_of["mixed_spec"]) fig.update_yaxes(title_text="Freq (Hz)", row=row_of["mixed_spec"], col=1) - # ---- ElevenLabs waveform ---- - if data["el_loaded"] and len(data["el_y_ds"]) > 0: - el_range = [float(data["el_y_ds"].min() * 1.1), float(data["el_y_ds"].max() * 1.1)] - _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range) - else: - _no_file(row_of["el_waveform"]) - fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row_of["el_waveform"], col=1) + # ---- ElevenLabs waveform (only present when el_loaded=True) ---- + # speaker_filter={"user"}: the EL recording captures the ElevenLabs user-simulator's + # outgoing audio (user speech sent to the assistant). Assistant-turn time ranges + # are silent in this file and should not be coloured as "Assistant". 
+ if "el_waveform" in row_of: + if len(data["el_y_ds"]) > 0: + el_range = [float(data["el_y_ds"].min() * 1.1), float(data["el_y_ds"].max() * 1.1)] + _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range, speaker_filter={"user"}) + else: + _no_file(row_of["el_waveform"]) + fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row_of["el_waveform"], col=1) # ---- ElevenLabs spectrogram (optional) ---- if "el_spec" in row_of: if data["el_spec"]: - _spec_row(row_of["el_spec"], data["el_spec"], "EL Spec") + _spec_row(row_of["el_spec"], data["el_spec"], "EL Spec", speaker_filter={"user"}) else: _no_file(row_of["el_spec"]) fig.update_yaxes(title_text="Freq (Hz)", row=row_of["el_spec"], col=1) @@ -854,6 +891,35 @@ def _no_file(row: int) -> None: return fig +# ============================================================================= +# Streamlit caching — module-level so the cache persists across reruns +# ============================================================================= + + +@st.cache_data(show_spinner="Loading audio files\u2026") +def _cache_audio_data(path_str: str) -> dict: + """Cache the heavy data-loading step (file I/O + spectrogram computation). + + Keyed only on the record directory path, so the cache is shared across + all spectrogram-toggle states. _build_figure() is fast and runs on each + rerun with the pre-loaded data. + """ + return _prepare_data(Path(path_str)) + + +def preload_audio_data(record_dir: Path) -> None: + """Warm the audio-data cache for *record_dir*. + + Call this before the tab widgets are rendered so the heavy I/O happens + while the rest of the page is being built, rather than on first tab open. + Silently skips records that have no audio files. 
+ """ + events_file = record_dir / "elevenlabs_events.jsonl" + audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + if events_file.exists() or audio_mixed.exists(): + _cache_audio_data(str(record_dir)) + + # ============================================================================= # Streamlit tab renderer # ============================================================================= @@ -870,23 +936,27 @@ def render_audio_analysis_tab(record_dir: Path) -> None: st.info("No audio files found in this record directory.") return - # Spectrogram toggles - col1, col2 = st.columns(2) - with col1: + try: + # Data is already cached by preload_audio_data(); this is a cache hit. + data = _cache_audio_data(str(record_dir)) + except Exception as exc: + st.error(f"Could not load audio data: {exc}") + return + + # Spectrogram toggles — side-by-side when EL is available, single when not + if data["el_loaded"]: + col1, col2 = st.columns(2) + with col1: + show_mixed_spec = st.checkbox("Show Mixed Audio Spectrogram", value=False) + with col2: + show_el_spec = st.checkbox("Show ElevenLabs Spectrogram", value=False) + else: show_mixed_spec = st.checkbox("Show Mixed Audio Spectrogram", value=False) - with col2: - show_el_spec = st.checkbox("Show ElevenLabs Spectrogram", value=False) - - @st.cache_data(show_spinner="Loading audio and building interactive plot\u2026") - def _cached(path_str: str, mixed_spec: bool, el_spec: bool) -> go.Figure: - return _build_figure( - _prepare_data(Path(path_str)), - show_mixed_spec=mixed_spec, - show_el_spec=el_spec, - ) + show_el_spec = False + st.info("ElevenLabs audio recording is not available for this record.") try: - fig = _cached(str(record_dir), show_mixed_spec, show_el_spec) + fig = _build_figure(data, show_mixed_spec=show_mixed_spec, show_el_spec=show_el_spec) st.plotly_chart(fig, width="stretch", theme="streamlit") except Exception as exc: st.error(f"Could not render audio plot: {exc}") From 
325be6c217aad55d945f1a6926f2bd1d7b1574b2 Mon Sep 17 00:00:00 2001 From: nhhoang96 <10899923+nhhoang96@users.noreply.github.com> Date: Wed, 15 Apr 2026 15:50:07 +0000 Subject: [PATCH 15/23] Apply pre-commit --- apps/audio_plots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index d68111b2..fcc4da04 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -295,9 +295,9 @@ def _wrap(text: str, width: int = 80) -> str: # Intermediate sample rate used for spectrogram computation. # 4 kHz preserves speech content up to 2 kHz (Nyquist) while keeping the # heatmap to roughly 60–250K cells for typical 5–90 s recordings. -_SPEC_SR = 4000 # Hz +_SPEC_SR = 4000 # Hz _SPEC_N_FFT = 512 # → 257 freq bins, 7.8 Hz resolution -_SPEC_HOP = 512 # → ~0.128 s/frame at 4 kHz +_SPEC_HOP = 512 # → ~0.128 s/frame at 4 kHz # ============================================================================= From bf0db60aef4f24d0bc78247c3f96deac62b5ec76 Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 17:37:32 +0000 Subject: [PATCH 16/23] Update naming convention and documentation. Ensure the timestamp calculation is consistent with turn-taking metrics --- apps/analysis.py | 2 +- apps/audio_plots.py | 167 ++++++++++++++++++++++++++++++-------------- 2 files changed, 115 insertions(+), 54 deletions(-) diff --git a/apps/analysis.py b/apps/analysis.py index 6f3cedfb..ca272585 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -1824,7 +1824,7 @@ def render_record_detail(selected_run_dir: Path): "Transcript", "Metrics Detail", "Processed Data", - "Audio Analysis", + "Turn Taking Analysis", ] ) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index d68111b2..852cee76 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -1,24 +1,30 @@ """Interactive audio visualization for the EVA Streamlit app. -Adapted from EVA-Bench/downloads/plot_script/plot_timestamp.py. 
Renders a Plotly figure directly into a Streamlit tab without writing files. Layout (dynamic — spectrograms are optional): Row 1 : audio_mixed waveform, colour-coded by speaker turn Row 2 (opt) : audio_mixed spectrogram - Row 3 : ElevenLabs waveform, colour-coded by speaker turn + Row 3 : ElevenLabs waveform, colour-coded by speaker turn (only when + elevenlabs_audio_recording.mp3 exists in the record directory) Row 4 (opt) : ElevenLabs spectrogram Row 5 : Speaker Turn Timeline -Turn data is loaded from metrics.json (same source as the turn_taking metric): - context.audio_timestamps_user_turns / audio_timestamps_assistant_turns - → dict[turn_id → list[(abs_start, abs_end)]] — may have multiple segments per turn - context.transcribed_*_turns / intended_*_turns - → dict[turn_id → str] — keyed by the same turn IDs - metrics.turn_taking.details.per_turn_latency - → dict[turn_id → seconds] — user_last_seg_end → asst_first_seg_start - -Falls back to parsing elevenlabs_events.jsonl directly when metrics.json is absent. +Turn data source (primary → fallback): + 1. metrics.json context — the same MetricContext fields that turn_taking.py uses: + context.audio_timestamps_user_turns / audio_timestamps_assistant_turns + → dict[turn_id → list[[abs_start, abs_end]]] (may be multi-segment) + context.transcribed_*/intended_*_turns → dict[turn_id → str] + latency_s = asst.segments[0][0] − user.segments[-1][1] (per turn_id, same formula) + 2. 
elevenlabs_events.jsonl — used when metrics.json is absent or has no timestamps: + one turn per completed audio_start/audio_end session + latency_s computed by temporal proximity (next assistant after this user) + +Spectrograms use a 4 kHz intermediate sample rate (_SPEC_SR) via librosa.resample so that: + • frequency content up to 2 kHz (Nyquist) is preserved — representative of speech + • heatmap size stays bounded (~60–250 K cells for 5–90 s recordings) + • time axis (librosa.frames_to_time, t=0 origin) aligns with the waveform time axis + (np.linspace(0, duration, n_samples), also t=0 origin) """ import json @@ -45,7 +51,9 @@ # ============================================================================= -# Turn data loading — metrics.json first, elevenlabs_events.jsonl fallback +# Turn data loading +# Primary: metrics.json context (MetricContext fields, same as turn_taking.py) +# Fallback: elevenlabs_events.jsonl (when metrics.json absent or has no timestamps) # ============================================================================= @@ -59,14 +67,20 @@ def _load_metrics_context(record_dir: Path) -> dict | None: def _build_turns_from_metrics(metrics_data: dict) -> list[dict] | None: - """Build a turns list from metrics.json using the same timestamps the turn_taking metric uses. - - Each turn dict has: - turn_id, speaker ("user"|"assistant"), - segments [(rel_start, rel_end), ...], ← may be >1 for interrupted turns - start, end, duration, - transcript_heard, transcript_intended, - latency_s (user→assistant gap, user turns only), timing_label. + """Build a turns list from MetricContext fields stored in metrics.json. + + Uses the exact same fields that turn_taking.py operates on: + context.audio_timestamps_user_turns / audio_timestamps_assistant_turns + → dict[turn_id → list[[abs_start, abs_end]]] — may have multiple + segments per turn (e.g. 
interrupted turns) + context.transcribed_*/intended_*_turns + → dict[turn_id → str] + + Latency is computed directly from the timestamps using the same formula + as turn_taking.py: asst.segments[0][0] − user.segments[-1][1] + (first assistant segment start − last user segment end, per turn_id). + + Returns None when no timestamp data is present (falls back to EL log). """ ctx = metrics_data.get("context") or {} user_ts = ctx.get("audio_timestamps_user_turns") or {} @@ -79,13 +93,7 @@ def _build_turns_from_metrics(metrics_data: dict) -> list[dict] | None: intended_user = ctx.get("intended_user_turns") or {} intended_asst = ctx.get("intended_assistant_turns") or {} - # Per-turn latency / timing label from turn_taking metric (if already computed) - metrics = metrics_data.get("metrics") or {} - tt_details = (metrics.get("turn_taking") or {}).get("details") or {} - per_turn_latency = {int(k): v for k, v in (tt_details.get("per_turn_latency") or {}).items()} - per_turn_labels = {int(k): v for k, v in (tt_details.get("per_turn_judge_timing_ratings") or {}).items()} - - # Reference time: earliest timestamp across all turns + # Reference time: earliest timestamp across all turns (same as turn_taking.py) all_starts = [segs[0][0] for segs in list(user_ts.values()) + list(asst_ts.values()) if segs] t0 = min(all_starts) if all_starts else 0.0 @@ -97,11 +105,10 @@ def _rel(segs: list) -> list[tuple[float, float]]: for tid_str, segs in asst_ts.items(): if not segs: continue - tid = int(tid_str) rel = _rel(segs) turns.append( { - "turn_id": tid, + "turn_id": int(tid_str), "speaker": "assistant", "segments": rel, "start": rel[0][0], @@ -110,18 +117,20 @@ def _rel(segs: list) -> list[tuple[float, float]]: "transcript_heard": transcribed_asst.get(tid_str, ""), "transcript_intended": intended_asst.get(tid_str, ""), "latency_s": None, - "timing_label": None, } ) for tid_str, segs in user_ts.items(): if not segs: continue - tid = int(tid_str) rel = _rel(segs) + # Latency: same formula 
as turn_taking.py — asst first-seg start − user last-seg end. + # Uses the matching assistant turn (same turn_id); None if no assistant turn exists. + a_segs = asst_ts.get(tid_str) + latency_s = round(a_segs[0][0] - segs[-1][1], 6) if a_segs else None turns.append( { - "turn_id": tid, + "turn_id": int(tid_str), "speaker": "user", "segments": rel, "start": rel[0][0], @@ -129,8 +138,7 @@ def _rel(segs: list) -> list[tuple[float, float]]: "duration": rel[-1][1] - rel[0][0], "transcript_heard": transcribed_user.get(tid_str, ""), "transcript_intended": intended_user.get(tid_str, ""), - "latency_s": per_turn_latency.get(tid), - "timing_label": per_turn_labels.get(tid), + "latency_s": latency_s, } ) @@ -139,7 +147,21 @@ def _rel(segs: list) -> list[tuple[float, float]]: def _parse_elevenlabs_events(events_file: Path) -> list[dict]: - """Fallback: parse elevenlabs_events.jsonl into a flat turns list (no turn IDs).""" + """Parse elevenlabs_events.jsonl into a flat list of audio-session turns. + + Each completed audio_start/audio_end pair for a participant becomes one + turn dict. Turn IDs are sequential integers across all participants (not + per-speaker). Transcripts and latencies are left empty here and filled in + by _patch_fallback_transcripts and _compute_and_patch_latencies. + + Speaker assignment: + event["user"] == "pipecat_agent" → speaker = "assistant" + anything else → speaker = "user" (EL user-simulator) + + Time reference: + t0 = earliest audio_timestamp across all completed sessions. + All start/end values stored as relative seconds from t0. 
+ """ events = [] with open(events_file) as f: for line in f: @@ -185,7 +207,6 @@ def _parse_elevenlabs_events(events_file: Path) -> list[dict]: "transcript_heard": "", "transcript_intended": "", "latency_s": None, - "timing_label": None, "_seq_idx": asst_idx if is_asst else user_idx, } ) @@ -197,7 +218,12 @@ def _parse_elevenlabs_events(events_file: Path) -> list[dict]: def _patch_fallback_transcripts(turns: list[dict], transcript_file: Path) -> None: - """Fill transcript fields in fallback turns from transcript.jsonl using sequential order.""" + """Fill transcript fields in EL-log turns from transcript.jsonl. + + Matches transcripts to turns by sequential order per speaker role + (first user turn gets user transcript[0], second gets [1], etc.). + Called after _parse_elevenlabs_events, before _compute_and_patch_latencies. + """ tx: dict[str, list[str]] = {"user": [], "assistant": []} if transcript_file.exists(): with open(transcript_file) as f: @@ -216,22 +242,56 @@ def _patch_fallback_transcripts(turns: list[dict], transcript_file: Path) -> Non turn["transcript_intended"] = text +def _compute_and_patch_latencies(turns: list[dict]) -> None: + """Compute per-user-turn response latency and patch in-place. + + Formula (identical to turn_taking.py _compute_per_turn_latency_and_timing_labels): + latency_s = asst.segments[0][0] - user.segments[-1][1] + (first assistant segment start − last user segment end) + + Matching: turn_taking.py matches by shared turn_id; here we match by + temporal proximity (next assistant turn after this user turn in time order) + — equivalent for linear conversations. 
+ """ + sorted_turns = sorted(turns, key=lambda t: t["start"]) + for i, turn in enumerate(sorted_turns): + if turn["speaker"] != "user": + continue + for j in range(i + 1, len(sorted_turns)): + if sorted_turns[j]["speaker"] == "assistant": + latency_s = sorted_turns[j]["segments"][0][0] - turn["segments"][-1][1] + turn["latency_s"] = round(latency_s, 6) + break + + def _calculate_pauses(turns_rel: list[dict]) -> list[dict]: - """Compute pause gaps between consecutive audio segments across all turns.""" + """Compute speaker-transition gaps, consistent with turn_taking.py. + + Only gaps where the speaker changes (user→assistant or assistant→user) + are counted — mirroring the `if prev_role != next_role` guard in + turn_taking.py _format_conversation_context (lines 81-86). + + Same-speaker consecutive segments (e.g. two user audio sessions back to + back) are ignored, as turn_taking.py does not treat these as pauses. + """ all_segs = sorted( [(s, e, turn["speaker"]) for turn in turns_rel for s, e in turn["segments"]], key=lambda x: x[0], ) pauses = [] for i in range(len(all_segs) - 1): + from_spk = all_segs[i][2] + to_spk = all_segs[i + 1][2] + if from_spk == to_spk: + continue # same-speaker gap — not a turn-taking transition cur_end = all_segs[i][1] nxt_start = all_segs[i + 1][0] gap = nxt_start - cur_end if gap > 0.001: pauses.append( { - "from_speaker": all_segs[i][2], - "to_speaker": all_segs[i + 1][2], + "from_speaker": from_spk, + "to_speaker": to_spk, "start": cur_end, "end": nxt_start, "duration_seconds": gap, @@ -311,7 +371,11 @@ def _prepare_data(record_dir: Path) -> dict: events_file = record_dir / "elevenlabs_events.jsonl" transcript = record_dir / "transcript.jsonl" - # --- Turn data: prefer metrics.json (same source as turn_taking metric) --- + # --- Turn data --- + # Primary: metrics.json context — same fields turn_taking.py uses, with + # multi-segment turns, matched transcripts, and turn_id-based latency. 
+ # Fallback: elevenlabs_events.jsonl — one entry per audio session, latency + # computed by temporal proximity when metrics.json is absent. turns_rel: list[dict] = [] metrics_data = _load_metrics_context(record_dir) if metrics_data: @@ -319,10 +383,10 @@ def _prepare_data(record_dir: Path) -> dict: if built: turns_rel = built - # Fallback: parse ElevenLabs event log directly if not turns_rel and events_file.exists(): turns_rel = _parse_elevenlabs_events(events_file) _patch_fallback_transcripts(turns_rel, transcript) + _compute_and_patch_latencies(turns_rel) pauses_rel = _calculate_pauses(turns_rel) @@ -513,9 +577,7 @@ def _hover_texts(time_array: np.ndarray) -> list: latency_line = "" if turn["speaker"] == "user" and turn.get("latency_s") is not None: - latency_line = f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" + ( - f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "" - ) + latency_line = f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" hover = ( f"Turn\u00a0{turn['turn_id']}\u00a0\u2014\u00a0{speaker}
" @@ -703,13 +765,14 @@ def _no_file(row: int) -> None: fig.update_yaxes(title_text="Freq (Hz)", row=row_of["mixed_spec"], col=1) # ---- ElevenLabs waveform (only present when el_loaded=True) ---- - # speaker_filter={"user"}: the EL recording captures the ElevenLabs user-simulator's - # outgoing audio (user speech sent to the assistant). Assistant-turn time ranges - # are silent in this file and should not be coloured as "Assistant". + # No speaker_filter: turn times from the EL log cover both user and assistant, + # so both get colour-coded identically to the mixed waveform. Assistant + # regions will show a flat/silent waveform since the EL file only captures + # the user-simulator's outgoing audio, which is expected. if "el_waveform" in row_of: if len(data["el_y_ds"]) > 0: el_range = [float(data["el_y_ds"].min() * 1.1), float(data["el_y_ds"].max() * 1.1)] - _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range, speaker_filter={"user"}) + _colored_waveform(row_of["el_waveform"], data["el_y_ds"], data["el_t"], el_range) else: _no_file(row_of["el_waveform"]) fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row_of["el_waveform"], col=1) @@ -717,7 +780,7 @@ def _no_file(row: int) -> None: # ---- ElevenLabs spectrogram (optional) ---- if "el_spec" in row_of: if data["el_spec"]: - _spec_row(row_of["el_spec"], data["el_spec"], "EL Spec", speaker_filter={"user"}) + _spec_row(row_of["el_spec"], data["el_spec"], "EL Spec") else: _no_file(row_of["el_spec"]) fig.update_yaxes(title_text="Freq (Hz)", row=row_of["el_spec"], col=1) @@ -737,9 +800,7 @@ def _no_file(row: int) -> None: transcript = turn["transcript_heard"] or turn["transcript_intended"] or "(no transcript)" latency_line = "" if not is_asst and turn.get("latency_s") is not None: - latency_line = f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" + ( - f"\u00a0({turn['timing_label']})" if turn.get("timing_label") else "" - ) + latency_line = f"
Response latency:\u00a0{turn['latency_s'] * 1000:.0f}\u00a0ms" hover = ( f"Turn\u00a0{turn['turn_id']}\u00a0\u2014\u00a0{speaker}
" From dfd9b2a653e986fcab0f4720aa02410046edcc9f Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 17:59:27 +0000 Subject: [PATCH 17/23] Normalizing silence and pause terminology. Update documentation to reflect changes. --- apps/README.md | 59 ++++++++++++++------ apps/audio_plots.py | 130 ++++++++++++++++++++++++++++++-------------- 2 files changed, 131 insertions(+), 58 deletions(-) diff --git a/apps/README.md b/apps/README.md index d5474bc0..d566c4f1 100644 --- a/apps/README.md +++ b/apps/README.md @@ -54,34 +54,59 @@ The **Audio Analysis** tab in the Record Detail view renders an interactive Plot ### Subplots -| Row | Content | Always shown | -|-----|---------|--------------| -| 1 | Mixed audio waveform, colour-coded by speaker | Yes | -| 2 | Mixed audio spectrogram | Optional (checkbox) | -| 3 | ElevenLabs audio waveform, colour-coded by speaker | Yes | -| 4 | ElevenLabs audio spectrogram | Optional (checkbox) | -| 5 | Speaker Turn Timeline with per-turn durations and pause markers | Yes | +| Row | Content | Shown when | +|-----|---------|------------| +| 1 | Mixed audio waveform, colour-coded by speaker turn | Always | +| 2 | Mixed audio spectrogram | "Show Mixed Audio Spectrogram" checkbox is on | +| 3 | ElevenLabs audio waveform, colour-coded by speaker turn | `elevenlabs_audio_recording.mp3` exists in the record directory | +| 4 | ElevenLabs audio spectrogram | EL recording exists **and** "Show ElevenLabs Spectrogram" checkbox is on | +| 5 | Speaker Turn Timeline with per-turn durations and pause markers | Always | -Toggle spectrograms on or off using the checkboxes above the chart. Results are cached per trial so switching between records is fast after the first load. +When `elevenlabs_audio_recording.mp3` is not found, rows 3 and 4 are hidden and an info message is shown instead. Spectrogram checkboxes appear above the chart only for the recordings that are available. 
Results are cached per trial so switching between records is fast after the first load. + +### Waveform Rendering + +Each waveform subplot is drawn in three layers: + +1. **Base trace** — the complete audio file rendered as a light gray line so the full recording duration is always visible, including regions between turns. +2. **Speaker segments** — overlaid in colour on top of the base trace for each active turn window. +3. **Pause bands** — semi-transparent gray rectangles over speaker-transition gaps, linked to the **Pause** legend item so they can be toggled on/off. ### Colour Coding | Colour | Meaning | |--------|---------| | Blue | User speaker turn | -| Orange | Assistant speaker turn | -| Gray (semi-transparent line) | Silence — audio not covered by any speaker turn | -| Gray shaded box | Pause — gap between consecutive speaker turns | +| Orange-red | Assistant speaker turn | +| Gray shaded band | Pause — speaker-transition gap (user→assistant or assistant→user) | -Colours are chosen for visibility in both Streamlit light and dark mode. +Colours are chosen for visibility in both Streamlit light and dark mode. Clicking a legend item (User, Assistant, Pause) toggles that category across all subplots simultaneously. ### Hover Tooltips -Hovering over any waveform sample shows the **transcript text** for the active speaker turn, along with the turn start/end time and duration. Hovering over a pause region shows the pause duration and the from/to speakers. The timeline row shows the same transcript text when hovering over each bar. +Hovering over any waveform sample or timeline bar shows: +- Turn ID, speaker, start/end time, and duration +- Transcript text (heard and intended where available) +- Response latency in ms for user turns (time from user's last segment end to assistant's first segment start) + +Hovering over a pause band shows the pause duration and the from/to speakers. 
+ +### Pause Definition + +Pauses are computed consistently with `turn_taking.py`: + +- Only **speaker-transition gaps** count as pauses: a gap between a user segment end and the next assistant segment start, or vice versa. +- Same-speaker consecutive segments (e.g. two user audio sessions back to back) are not marked as pauses. +- Formula: `pause_duration = next_speaker.segments[0].start − current_speaker.segments[-1].end` +- Only gaps `> 1 ms` are shown. + +### Turn Data Source + +Turn timestamps, transcripts, and response latencies are loaded in priority order: -### Silence vs. Pause +1. **`metrics.json` context** (primary) — uses the same `MetricContext` fields (`audio_timestamps_user_turns`, `audio_timestamps_assistant_turns`, `transcribed_*_turns`) that `turn_taking.py` operates on. Latency is computed as `asst.segments[0].start − user.segments[-1].end` per matching turn ID. +2. **`elevenlabs_events.jsonl`** (fallback) — used when `metrics.json` is absent or contains no timestamp data. One entry per completed `audio_start`/`audio_end` session; latency computed by temporal proximity. -- **Pause** — derived from speaker turn event logs. The gap between one speaker's audio end event and the next speaker's audio start event: `pause = turns[i+1].start − turns[i].end`. Only recorded when `> 0`. -- **Silence** — derived from the waveform timeline. Any portion of the audio not covered by a speaker turn event (including audio before the first turn or after the last turn). +### Spectrogram Details -A Pause always coincides with a Silence region, but Silence can be wider (e.g. leading/trailing audio with no events). +Spectrograms are computed at a 4 kHz intermediate sample rate (via `librosa.resample`) to preserve speech content up to 2 kHz (Nyquist) while keeping heatmap size bounded (~60–250 K cells for typical 5–90 s recordings). The time axis starts at `t = 0` to align with the waveform. 
diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 3b8d15fd..4fd5db29 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -3,23 +3,35 @@ Renders a Plotly figure directly into a Streamlit tab without writing files. Layout (dynamic — spectrograms are optional): - Row 1 : audio_mixed waveform, colour-coded by speaker turn + Row 1 : audio_mixed waveform Row 2 (opt) : audio_mixed spectrogram - Row 3 : ElevenLabs waveform, colour-coded by speaker turn (only when - elevenlabs_audio_recording.mp3 exists in the record directory) + Row 3 : ElevenLabs waveform (only when elevenlabs_audio_recording.mp3 exists) Row 4 (opt) : ElevenLabs spectrogram Row 5 : Speaker Turn Timeline +Waveform rendering: + • Full recording is always shown as a light gray base trace so the true audio + duration is visible even in regions where no speaker turn is active. + • Speaker segments are overlaid in colour: blue = user, orange-red = assistant. + • Pause regions (speaker-change gaps) are drawn as shaded bands linked to the + "Pause" legend item so they can be toggled on/off. + • Silence between same-speaker consecutive segments is not marked separately — + only speaker-transition gaps are treated as pauses (consistent with turn_taking.py). + Turn data source (primary → fallback): 1. metrics.json context — the same MetricContext fields that turn_taking.py uses: context.audio_timestamps_user_turns / audio_timestamps_assistant_turns → dict[turn_id → list[[abs_start, abs_end]]] (may be multi-segment) context.transcribed_*/intended_*_turns → dict[turn_id → str] latency_s = asst.segments[0][0] − user.segments[-1][1] (per turn_id, same formula) - 2. elevenlabs_events.jsonl — used when metrics.json is absent or has no timestamps: + 2. 
elevenlabs_events.jsonl — fallback when metrics.json is absent or has no timestamps: one turn per completed audio_start/audio_end session latency_s computed by temporal proximity (next assistant after this user) +X-axis range: + Covers the longest of: audio_mixed duration, ElevenLabs audio duration, last turn end. + Ensures neither audio file is clipped when they differ in length. + Spectrograms use a 4 kHz intermediate sample rate (_SPEC_SR) via librosa.resample so that: • frequency content up to 2 kHz (Nyquist) is preserved — representative of speech • heatmap size stays bounded (~60–250 K cells for 5–90 s recordings) @@ -44,7 +56,6 @@ USER_COLOR = "#4A90D9" # mid-blue — clear on white & dark ASST_COLOR = "#E8724A" # orange-red — clear on white & dark -GAP_COLOR = "rgba(140,140,140,0.55)" # neutral gray for silence gaps USER_FILL = "rgba(74,144,217,0.22)" ASST_FILL = "rgba(232,114,74,0.22)" PAUSE_FILL = "rgba(140,140,140,0.18)" @@ -400,10 +411,6 @@ def _prepare_data(record_dir: Path) -> dict: except Exception: pass - # Use the later of audio duration and last turn end for x-axis - turns_end = max((t["end"] for t in turns_rel), default=0.0) - plot_xlim = [0, max(duration, turns_end, 1.0)] - if mixed_loaded: y_ds, _ = _downsample(y_mixed, sr_mixed) t_mixed = np.linspace(0, duration, len(y_ds)) @@ -436,6 +443,12 @@ def _prepare_data(record_dir: Path) -> dict: except Exception: pass + # x-axis range: longest of mixed audio, EL audio, and last turn end. + # Ensures neither recording is clipped when the two files differ in length. + el_duration = float(el_t[-1]) if el_loaded and len(el_t) > 0 else 0.0 + turns_end = max((t["end"] for t in turns_rel), default=0.0) + plot_xlim = [0, max(duration, el_duration, turns_end, 1.0)] + # --- Spectrogram: mixed --- # Resample to _SPEC_SR so both spectrograms share the same frequency axis # and have meaningful content. Times start at 0 — aligned with t_mixed. 
@@ -546,7 +559,6 @@ def _build_figure( for _name, _color, _symbol in [ ("User", USER_COLOR, "square"), ("Assistant", ASST_COLOR, "square"), - ("Silence", "rgba(140,140,140,0.55)", "square"), ("Pause", "rgba(140,140,140,0.40)", "square-open"), ]: fig.add_trace( @@ -602,7 +614,10 @@ def _hover_texts(time_array: np.ndarray) -> list: return texts.tolist() # ------------------------------------------------------------------ # - # Colour-coded waveform — one Scatter trace per contiguous segment + # Colour-coded waveform + # Layer 1 (bottom): full-recording base trace — light gray, no legend. + # Layer 2 (top): speaker segments — blue (user) / orange-red (assistant). + # Layer 3: pause shaded bands — linked to "Pause" legend toggle. # ------------------------------------------------------------------ # def _colored_waveform( row: int, y: np.ndarray, t: np.ndarray, y_range: list, speaker_filter: set[str] | None = None @@ -622,9 +637,23 @@ def _colored_waveform( fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) return - # Flat list of individual speaker audio segments, sorted by start time. - # speaker_filter restricts which speakers are coloured (e.g. {"assistant"} for - # the ElevenLabs recording which only contains TTS audio). + # Base trace — full recording at low opacity so gaps between turns + # are still visible and the x-axis always reflects the true duration. + fig.add_trace( + go.Scatter( + x=t.tolist(), + y=y.tolist(), + mode="lines", + line={"width": 0.8, "color": "rgba(160,160,160,0.35)"}, + showlegend=False, + hoverinfo="skip", + name="", + ), + row=row, + col=1, + ) + + # Flat list of speaker audio segments, sorted by start time. 
visible_turns = [turn for turn in turns_rel if speaker_filter is None or turn["speaker"] in speaker_filter] all_segs = sorted( [ @@ -635,35 +664,22 @@ def _colored_waveform( key=lambda s: s[0], ) - # Insert gap segments between speaker audio - segments: list[tuple] = [] - prev_end = 0.0 + _color_map = {"user": USER_COLOR, "asst": ASST_COLOR} + _name_map = {"user": "User", "asst": "Assistant"} + for seg_s, seg_e, spk in all_segs: - if seg_s > prev_end + 1e-3: - segments.append((prev_end, seg_s, "gap")) - segments.append((seg_s, seg_e, spk)) - prev_end = seg_e - duration = float(t[-1]) if len(t) > 0 else 0.0 - if prev_end < duration - 1e-3: - segments.append((prev_end, duration, "gap")) - - _color_map = {"user": USER_COLOR, "asst": ASST_COLOR, "gap": GAP_COLOR} - _name_map = {"user": "User", "asst": "Assistant", "gap": "Silence"} - - for seg_s, seg_e, spk in segments: mask = (t >= seg_s) & (t <= seg_e) if not mask.any(): continue - name = _name_map[spk] fig.add_trace( go.Scatter( x=t[mask].tolist(), y=y[mask].tolist(), mode="lines", line={"width": 1.0, "color": _color_map[spk]}, - opacity=0.85 if spk != "gap" else 0.45, - name=name, - legendgroup=name, + opacity=0.85, + name=_name_map[spk], + legendgroup=_name_map[spk], showlegend=False, text=_hover_texts(t[mask]), hovertemplate="%{text}", @@ -672,16 +688,34 @@ def _colored_waveform( col=1, ) - # Pause vrects (visual only) + # Pause shaded bands — Scatter traces so they toggle with the legend. 
+ y0, y1 = y_range[0], y_range[1] for pause in pauses_rel: - fig.add_vrect( - x0=pause["start"], x1=pause["end"], fillcolor=PAUSE_FILL, line_width=0, layer="below", row=row, col=1 + fig.add_trace( + go.Scatter( + x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], + y=[y1, y1, y0, y0, y1], + fill="toself", + fillcolor=PAUSE_FILL, + line={"width": 0}, + mode="lines", + name="Pause", + legendgroup="Pause", + showlegend=False, + hoverinfo="skip", + ), + row=row, + col=1, ) fig.update_yaxes(title_text="Amplitude", range=y_range, row=row, col=1) # ------------------------------------------------------------------ # - # Spectrogram row — heatmap + invisible transcript strip + # Spectrogram row + # Layer 1: Heatmap (STFT at _SPEC_SR=4 kHz, 0–2 kHz Nyquist). + # Layer 2: Invisible transcript strip for hover tooltips. + # Layer 3: Speaker turn vrects (user / assistant fill colours). + # Layer 4: Pause shaded bands — linked to "Pause" legend toggle. # ------------------------------------------------------------------ # def _spec_row(row: int, spec: tuple, label: str, speaker_filter: set[str] | None = None) -> None: D, freqs, times = spec @@ -720,17 +754,31 @@ def _spec_row(row: int, spec: tuple, label: str, speaker_filter: set[str] | None col=1, ) - # Turn boundary vrects (use envelope start/end per turn). - # Restrict to speaker_filter when set (e.g. EL spectrogram only shows assistant turns). + # Speaker turn fill bands (envelope start/end per turn). visible_turns = [turn for turn in turns_rel if speaker_filter is None or turn["speaker"] in speaker_filter] for turn in visible_turns: color = ASST_FILL if turn["speaker"] == "assistant" else USER_FILL fig.add_vrect( x0=turn["start"], x1=turn["end"], fillcolor=color, line_width=0, layer="below", row=row, col=1 ) + # Pause shaded bands — Scatter traces so they toggle with the legend. 
+ f0, f1 = float(freqs[0]), float(freqs[-1]) for pause in pauses_rel: - fig.add_vrect( - x0=pause["start"], x1=pause["end"], fillcolor=PAUSE_FILL, line_width=0, layer="below", row=row, col=1 + fig.add_trace( + go.Scatter( + x=[pause["start"], pause["end"], pause["end"], pause["start"], pause["start"]], + y=[f1, f1, f0, f0, f1], + fill="toself", + fillcolor=PAUSE_FILL, + line={"width": 0}, + mode="lines", + name="Pause", + legendgroup="Pause", + showlegend=False, + hoverinfo="skip", + ), + row=row, + col=1, ) fig.update_yaxes(title_text="Freq (Hz)", row=row, col=1) From 5125ec33c399678bf6920cc9bf4f8aebfe09e3d6 Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 18:14:01 +0000 Subject: [PATCH 18/23] Removing base traces when toggling off user/ assistant on waveform plots --- apps/README.md | 7 +++---- apps/audio_plots.py | 24 +++--------------------- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/apps/README.md b/apps/README.md index d566c4f1..5ed18405 100644 --- a/apps/README.md +++ b/apps/README.md @@ -66,11 +66,10 @@ When `elevenlabs_audio_recording.mp3` is not found, rows 3 and 4 are hidden and ### Waveform Rendering -Each waveform subplot is drawn in three layers: +Each waveform subplot is drawn in two layers: -1. **Base trace** — the complete audio file rendered as a light gray line so the full recording duration is always visible, including regions between turns. -2. **Speaker segments** — overlaid in colour on top of the base trace for each active turn window. -3. **Pause bands** — semi-transparent gray rectangles over speaker-transition gaps, linked to the **Pause** legend item so they can be toggled on/off. +1. **Speaker segments** — drawn in colour for each active turn window. Clicking a legend item (User or Assistant) hides all traces for that speaker. +2. **Pause bands** — semi-transparent gray rectangles over speaker-transition gaps, linked to the **Pause** legend item so they can be toggled on/off. 
### Colour Coding diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 4fd5db29..2e70a4c6 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -10,13 +10,11 @@ Row 5 : Speaker Turn Timeline Waveform rendering: - • Full recording is always shown as a light gray base trace so the true audio - duration is visible even in regions where no speaker turn is active. - • Speaker segments are overlaid in colour: blue = user, orange-red = assistant. + • Speaker segments are drawn in colour: blue = user, orange-red = assistant. + Toggling a legend item hides all traces in that group. • Pause regions (speaker-change gaps) are drawn as shaded bands linked to the "Pause" legend item so they can be toggled on/off. - • Silence between same-speaker consecutive segments is not marked separately — - only speaker-transition gaps are treated as pauses (consistent with turn_taking.py). + • Only speaker-transition gaps are treated as pauses (consistent with turn_taking.py). Turn data source (primary → fallback): 1. metrics.json context — the same MetricContext fields that turn_taking.py uses: @@ -637,22 +635,6 @@ def _colored_waveform( fig.update_yaxes(title_text="Amplitude", range=[-1.0, 1.0], row=row, col=1) return - # Base trace — full recording at low opacity so gaps between turns - # are still visible and the x-axis always reflects the true duration. - fig.add_trace( - go.Scatter( - x=t.tolist(), - y=y.tolist(), - mode="lines", - line={"width": 0.8, "color": "rgba(160,160,160,0.35)"}, - showlegend=False, - hoverinfo="skip", - name="", - ), - row=row, - col=1, - ) - # Flat list of speaker audio segments, sorted by start time. 
visible_turns = [turn for turn in turns_rel if speaker_filter is None or turn["speaker"] in speaker_filter] all_segs = sorted( From 25c0934f3409cb7fc0a1d40c5aaca6dcbf446b2a Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 18:30:00 +0000 Subject: [PATCH 19/23] Minor update on pause coloring --- apps/audio_plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 2e70a4c6..1b0a2aa5 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -557,7 +557,7 @@ def _build_figure( for _name, _color, _symbol in [ ("User", USER_COLOR, "square"), ("Assistant", ASST_COLOR, "square"), - ("Pause", "rgba(140,140,140,0.40)", "square-open"), + ("Pause", "rgba(140,140,140,0.40)", "square"), ]: fig.add_trace( go.Scatter( From 78dfa03db4cd7926f0c46ad24ed7e0b34f84d509 Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 20:11:43 +0000 Subject: [PATCH 20/23] Add fixes to mismatch x-axis(time) across subplots --- apps/audio_plots.py | 164 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 32 deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 1b0a2aa5..09603235 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -11,7 +11,7 @@ Waveform rendering: • Speaker segments are drawn in colour: blue = user, orange-red = assistant. - Toggling a legend item hides all traces in that group. + Toggling a legend item hides all traces for that speaker. • Pause regions (speaker-change gaps) are drawn as shaded bands linked to the "Pause" legend item so they can be toggled on/off. • Only speaker-transition gaps are treated as pauses (consistent with turn_taking.py). 
@@ -38,6 +38,7 @@
 """
 
 import json
+import struct
 import warnings
 
 from pathlib import Path
@@ -314,12 +315,100 @@ def _calculate_pauses(turns_rel: list[dict]) -> list[dict]:
 # =============================================================================
 
 
+def _wav_actual_n_samples(path: Path) -> tuple[int, int, int] | None:
+    """Return (n_samples_per_channel, sr, sample_width) from actual WAV file bytes.
+
+    Scans the RIFF chunks to find the fmt and data chunks. The data chunk size
+    is derived from (file_size − data_chunk_start) rather than from the header
+    field, which is frequently wrong when the recorder fails to update it.
+
+    Returns None for non-WAV files or unreadable files.
+    """
+    try:
+        with open(path, "rb") as f:
+            if f.read(4) != b"RIFF":
+                return None
+            f.read(4)  # RIFF chunk size — unreliable, ignore
+            if f.read(4) != b"WAVE":
+                return None
+            sr = channels = sample_width = None
+            while True:
+                hdr = f.read(8)
+                if len(hdr) < 8:
+                    break
+                cid = hdr[:4]
+                csz = struct.unpack("<I", hdr[4:])[0]
+                if csz > 100_000_000:
+                    break  # sanity guard against corrupt chunk sizes
+                if cid == b"fmt ":
+                    raw = f.read(min(csz, 16))
+                    _, channels, sr, _, _, bits = struct.unpack_from("<HHIIHH", raw)
+                    sample_width = bits // 8
+                    skip = csz - len(raw)
+                    if skip > 0:
+                        f.seek(skip, 1)
+                elif cid == b"data":
+                    if sr and channels and sample_width:
+                        data_start = f.tell()
+                        file_size = path.stat().st_size
+                        actual_bytes = file_size - data_start
+                        n_samples = actual_bytes // (sample_width * channels)
+                        return n_samples, sr, sample_width
+                    break
+                else:
+                    # RIFF chunks are padded to even byte boundaries
+                    f.seek(csz + (csz % 2), 1)
+    except Exception:
+        pass
+    return None
+
+
 def _load_pydub(path: Path) -> tuple:
     seg = AudioSegment.from_file(str(path))
-    if seg.channels > 1:
+    n_channels = seg.channels
+    if n_channels > 1:
         seg = seg.set_channels(1)
     sr = seg.frame_rate
+    sample_width = seg.sample_width
     y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0
+
+    # WAV files from some recorders have an incorrect data-chunk size in the
+    # header (written before the call
starts, never updated when it ends).
+    # Cross-check against the real file size and reload raw PCM if the
+    # actual content is more than one second longer than the declared duration.
+    if path.suffix.lower() == ".wav":
+        info = _wav_actual_n_samples(path)
+        if info is not None:
+            n_actual, _, sw = info
+            dur_declared = len(y) / sr
+            dur_actual = n_actual / sr
+            if dur_actual > dur_declared + 1.0:
+                try:
+                    dtype = np.int16 if sw == 2 else np.int32
+                    divisor = 32768.0 if sw == 2 else 2_147_483_648.0
+                    with open(path, "rb") as f:
+                        # Re-seek to the data chunk start
+                        f.read(4); f.read(4); f.read(4)  # RIFF + size + WAVE
+                        while True:
+                            hdr = f.read(8)
+                            if len(hdr) < 8:
+                                break
+                            cid = hdr[:4]
+                            csz = struct.unpack("<I", hdr[4:])[0]
+                            if cid == b"data":
+                                raw = np.frombuffer(f.read(), dtype=dtype).astype(np.float32) / divisor
+                                if n_channels > 1:
+                                    raw = raw[: (len(raw) // n_channels) * n_channels]
+                                    raw = raw.reshape(-1, n_channels).mean(axis=1)
+                                y = raw
+                                break
+                            if csz > 100_000_000:
+                                break
+                            f.seek(csz + (csz % 2), 1)
+                except Exception:
+                    pass  # keep pydub result
+
     return y, sr
 
 
@@ -419,37 +508,38 @@ def _prepare_data(record_dir: Path) -> dict:
     # --- Audio: ElevenLabs ---
     el_y_ds, el_t, el_spec = np.array([]), np.array([]), None
     el_loaded = False
+    el_duration = 0.0
     if audio_el.exists():
         try:
             _el_y, _el_sr = _load_librosa(audio_el)
             el_y_ds, _ = _downsample(_el_y, _el_sr)
-            el_t = np.linspace(0, len(_el_y) / _el_sr, len(el_y_ds))
+            el_duration = len(_el_y) / _el_sr
+            el_t = np.linspace(0, el_duration, len(el_y_ds))
             el_loaded = True
-            # Spectrogram: resample to _SPEC_SR (4 kHz) to get meaningful
-            # frequency content (0–2 kHz Nyquist) with a bounded heatmap.
-            # Times from frames_to_time start at 0 — aligned with el_t.
+            # Spectrogram: resample to _SPEC_SR (4 kHz) for speech-range content.
+            # x axis pinned to el_duration via np.linspace so it aligns with el_t.
try: _el_y_spec = librosa.resample(_el_y, orig_sr=_el_sr, target_sr=_SPEC_SR) D = librosa.amplitude_to_db( np.abs(librosa.stft(_el_y_spec, hop_length=_SPEC_HOP, n_fft=_SPEC_N_FFT)), ref=np.max ) freqs = librosa.fft_frequencies(sr=_SPEC_SR, n_fft=_SPEC_N_FFT) - times = librosa.frames_to_time(np.arange(D.shape[1]), sr=_SPEC_SR, hop_length=_SPEC_HOP) + times = np.linspace(0, el_duration, D.shape[1]) el_spec = (D, freqs, times) except Exception: pass except Exception: pass - # x-axis range: longest of mixed audio, EL audio, and last turn end. - # Ensures neither recording is clipped when the two files differ in length. - el_duration = float(el_t[-1]) if el_loaded and len(el_t) > 0 else 0.0 - turns_end = max((t["end"] for t in turns_rel), default=0.0) - plot_xlim = [0, max(duration, el_duration, turns_end, 1.0)] + # x-axis: audio file durations only. + # turns_end is excluded — turn timestamps can exceed the recording length + # and would push the axis beyond the actual audio. + plot_xlim = [0, max(duration if mixed_loaded else 0.0, el_duration, 1.0)] # --- Spectrogram: mixed --- - # Resample to _SPEC_SR so both spectrograms share the same frequency axis - # and have meaningful content. Times start at 0 — aligned with t_mixed. + # x axis pinned to `duration` via np.linspace so it aligns exactly with + # t_mixed. STFT data is unchanged; only the frame→time mapping differs from + # frames_to_time by at most one hop (128 ms), which is visually imperceptible. 
mixed_spec = None if mixed_loaded: try: @@ -458,7 +548,7 @@ def _prepare_data(record_dir: Path) -> dict: np.abs(librosa.stft(_y_spec, hop_length=_SPEC_HOP, n_fft=_SPEC_N_FFT)), ref=np.max ) freqs = librosa.fft_frequencies(sr=_SPEC_SR, n_fft=_SPEC_N_FFT) - times = librosa.frames_to_time(np.arange(D.shape[1]), sr=_SPEC_SR, hop_length=_SPEC_HOP) + times = np.linspace(0, duration, D.shape[1]) mixed_spec = (D, freqs, times) except Exception: pass @@ -613,9 +703,9 @@ def _hover_texts(time_array: np.ndarray) -> list: # ------------------------------------------------------------------ # # Colour-coded waveform - # Layer 1 (bottom): full-recording base trace — light gray, no legend. - # Layer 2 (top): speaker segments — blue (user) / orange-red (assistant). - # Layer 3: pause shaded bands — linked to "Pause" legend toggle. + # Speaker segments — blue (user) / orange-red (assistant). + # Pause shaded bands — linked to "Pause" legend toggle. + # X-axis range is set by plot_xlim (independent of trace data extent). # ------------------------------------------------------------------ # def _colored_waveform( row: int, y: np.ndarray, t: np.ndarray, y_range: list, speaker_filter: set[str] | None = None @@ -736,13 +826,6 @@ def _spec_row(row: int, spec: tuple, label: str, speaker_filter: set[str] | None col=1, ) - # Speaker turn fill bands (envelope start/end per turn). - visible_turns = [turn for turn in turns_rel if speaker_filter is None or turn["speaker"] in speaker_filter] - for turn in visible_turns: - color = ASST_FILL if turn["speaker"] == "assistant" else USER_FILL - fig.add_vrect( - x0=turn["start"], x1=turn["end"], fillcolor=color, line_width=0, layer="below", row=row, col=1 - ) # Pause shaded bands — Scatter traces so they toggle with the legend. 
f0, f1 = float(freqs[0]), float(freqs[-1]) for pause in pauses_rel: @@ -987,13 +1070,29 @@ def _no_file(row: int) -> None: # ============================================================================= +def _audio_mtime(record_dir: Path) -> int: + """Return the most-recent mtime (seconds) of audio files in record_dir. + + Included in the cache key so the cache is invalidated when a file changes + — e.g. when a new recording replaces a shorter one, or when the WAV was + still being written when preload_audio_data() was first called. + """ + audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") + audio_el = record_dir / "elevenlabs_audio_recording.mp3" + mtime = 0 + for p in (audio_mixed, audio_el): + if p.exists(): + mtime = max(mtime, int(p.stat().st_mtime)) + return mtime + + @st.cache_data(show_spinner="Loading audio files\u2026") -def _cache_audio_data(path_str: str) -> dict: +def _cache_audio_data(path_str: str, audio_mtime: int = 0) -> dict: """Cache the heavy data-loading step (file I/O + spectrogram computation). - Keyed only on the record directory path, so the cache is shared across - all spectrogram-toggle states. _build_figure() is fast and runs on each - rerun with the pre-loaded data. + Keyed on the record directory path AND the audio file mtime so the cache + is automatically invalidated when the audio files change. + _build_figure() is fast and runs on each rerun with the pre-loaded data. 
""" return _prepare_data(Path(path_str)) @@ -1008,7 +1107,7 @@ def preload_audio_data(record_dir: Path) -> None: events_file = record_dir / "elevenlabs_events.jsonl" audio_mixed = next(record_dir.glob("audio_mixed*.wav"), record_dir / "audio_mixed.wav") if events_file.exists() or audio_mixed.exists(): - _cache_audio_data(str(record_dir)) + _cache_audio_data(str(record_dir), _audio_mtime(record_dir)) # ============================================================================= @@ -1028,8 +1127,8 @@ def render_audio_analysis_tab(record_dir: Path) -> None: return try: - # Data is already cached by preload_audio_data(); this is a cache hit. - data = _cache_audio_data(str(record_dir)) + # Cache hit when mtime unchanged; re-loads if the file was updated. + data = _cache_audio_data(str(record_dir), _audio_mtime(record_dir)) except Exception as exc: st.error(f"Could not load audio data: {exc}") return @@ -1046,6 +1145,7 @@ def render_audio_analysis_tab(record_dir: Path) -> None: show_el_spec = False st.info("ElevenLabs audio recording is not available for this record.") + try: fig = _build_figure(data, show_mixed_spec=show_mixed_spec, show_el_spec=show_el_spec) st.plotly_chart(fig, width="stretch", theme="streamlit") From 7e4fd0dba305799564e0017779ccecf5c09577e0 Mon Sep 17 00:00:00 2001 From: nhhoang96 <10899923+nhhoang96@users.noreply.github.com> Date: Wed, 15 Apr 2026 20:13:24 +0000 Subject: [PATCH 21/23] Apply pre-commit --- apps/audio_plots.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 09603235..6529b21c 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -389,7 +389,9 @@ def _load_pydub(path: Path) -> tuple: divisor = 32768.0 if sw == 2 else 2_147_483_648.0 with open(path, "rb") as f: # Re-seek to the data chunk start - f.read(4); f.read(4); f.read(4) # RIFF + size + WAVE + f.read(4) + f.read(4) + f.read(4) # RIFF + size + WAVE while True: hdr = f.read(8) if len(hdr) < 8: @@ 
-1145,7 +1147,6 @@ def render_audio_analysis_tab(record_dir: Path) -> None: show_el_spec = False st.info("ElevenLabs audio recording is not available for this record.") - try: fig = _build_figure(data, show_mixed_spec=show_mixed_spec, show_el_spec=show_el_spec) st.plotly_chart(fig, width="stretch", theme="streamlit") From 090361ac84c2a7b223be17a9538884c26d1db1db Mon Sep 17 00:00:00 2001 From: hoang Date: Wed, 15 Apr 2026 21:53:31 +0000 Subject: [PATCH 22/23] Fix issues with pre-commit --- apps/audio_plots.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 09603235..6a6f4ab5 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -370,7 +370,6 @@ def _load_pydub(path: Path) -> tuple: if n_channels > 1: seg = seg.set_channels(1) sr = seg.frame_rate - sample_width = seg.sample_width y = np.array(seg.get_array_of_samples()).astype(np.float32) / 32768.0 # WAV files from some recorders have an incorrect data-chunk size in the @@ -388,8 +387,10 @@ def _load_pydub(path: Path) -> tuple: dtype = np.int16 if sw == 2 else np.int32 divisor = 32768.0 if sw == 2 else 2_147_483_648.0 with open(path, "rb") as f: - # Re-seek to the data chunk start - f.read(4); f.read(4); f.read(4) # RIFF + size + WAVE + # Re-seek to the data chunk start (RIFF + size + WAVE) + f.read(4) + f.read(4) + f.read(4) while True: hdr = f.read(8) if len(hdr) < 8: @@ -1145,7 +1146,6 @@ def render_audio_analysis_tab(record_dir: Path) -> None: show_el_spec = False st.info("ElevenLabs audio recording is not available for this record.") - try: fig = _build_figure(data, show_mixed_spec=show_mixed_spec, show_el_spec=show_el_spec) st.plotly_chart(fig, width="stretch", theme="streamlit") From 07a29d85c1ea21ce0b68f16cf94b644a19c41597 Mon Sep 17 00:00:00 2001 From: hoang Date: Thu, 16 Apr 2026 14:55:27 +0000 Subject: [PATCH 23/23] Add time-axis for every subplot --- apps/audio_plots.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 
deletions(-) diff --git a/apps/audio_plots.py b/apps/audio_plots.py index 6a6f4ab5..505dfa0e 100644 --- a/apps/audio_plots.py +++ b/apps/audio_plots.py @@ -1058,9 +1058,18 @@ def _no_file(row: int) -> None: ) fig.update_xaxes(title_text="Time (seconds)", row=tl_row, col=1) - # Shared x-range + grid for all rows + # Shared x-range + grid for all rows. + # showticklabels=True is required on every row because shared_xaxes=True + # hides tick labels on all but the bottom subplot by default. for r in range(1, n_rows + 1): - fig.update_xaxes(range=plot_xlim, showgrid=True, gridcolor="rgba(128,128,128,0.15)", row=r, col=1) + fig.update_xaxes( + range=plot_xlim, + showgrid=True, + gridcolor="rgba(128,128,128,0.15)", + showticklabels=True, + row=r, + col=1, + ) fig.update_yaxes(showgrid=True, gridcolor="rgba(128,128,128,0.15)", row=r, col=1) return fig