diff --git a/dashboard/app.py b/dashboard/app.py index 5bbbfb2..26205c9 100644 --- a/dashboard/app.py +++ b/dashboard/app.py @@ -214,8 +214,9 @@ def _latest(lst): latest_infer = _latest(infer_runs) latest_ops = _latest(ops_runs) latest_train = _latest(train_runs) + latest_hw = _latest(hw_runs) - colA, colB, colC, colD = st.columns(4) + colA, colB, colC, colD, colE = st.columns(5) with colA: st.markdown("#### 🔗 通信(最新)") @@ -255,6 +256,15 @@ def _latest(lst): st.write(f"- time: {latest_train.get('time','')}") st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}") + with colE: + st.markdown("#### 🔧 硬件(最新)") + if not latest_hw: + st.info("暂无硬件结果") + else: + st.write(f"- testcase: `{latest_hw.get('testcase','')}`") + st.write(f"- time: {latest_hw.get('time','')}") + st.write(f"- status: {'✅' if latest_hw.get('success') else '❌'}") + st.divider() # ========== Recent runs table ========== @@ -311,7 +321,7 @@ def _latest(lst): st.markdown("---") st.markdown("### 🚀 快速导航") - col1, col2, col3, col4 = st.columns(4) + col1, col2, col3, col4, col5 = st.columns(5) if col1.button("🔗 通信测试分析", use_container_width=True): st.switch_page("pages/communication.py") if col2.button("⚡ 算子测试分析", use_container_width=True): @@ -320,6 +330,8 @@ def _latest(lst): st.switch_page("pages/inference.py") if col4.button("🏋️ 训练测试分析", use_container_width=True): st.switch_page("pages/training.py") + if col5.button("🔧 硬件测试分析", use_container_width=True): + st.switch_page("pages/hardware.py") except Exception as e: st.error(f"Dashboard 加载失败: {e}") diff --git a/dashboard/pages/hardware.py b/dashboard/pages/hardware.py new file mode 100644 index 0000000..d658a08 --- /dev/null +++ b/dashboard/pages/hardware.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +"""Hardware tests analysis page.""" + +import streamlit as st +import pandas as pd + +from common import init_page, show_data_source_info +from components.header import render_header +from utils.visualizations import ( + create_summary_table_hw, + 
plot_hw_mem_sweep, + plot_hw_cache, +) + +init_page("硬件测试分析 | InfiniMetrics", "🔧") + + +def main(): + render_header() + st.markdown("## 🔧 硬件性能测试分析") + + show_data_source_info() + + runs = st.session_state.data_loader.list_test_runs() + # Identify hardware runs by testcase starting with hardware + hw_runs = [r for r in runs if (r.get("testcase") or "").startswith("hardware")] + + if not hw_runs: + st.info("未找到硬件测试结果(testcase 需以 hardware.* 开头)。") + return + + # ---------- Sidebar Filters ---------- + with st.sidebar: + st.markdown("### 🔍 筛选条件") + only_success = st.checkbox("仅显示成功测试", value=True) + y_log = st.checkbox("Y轴对数刻度(可选)", value=False) + + filtered = [r for r in hw_runs if (not only_success or r.get("success"))] + + st.caption(f"找到 {len(filtered)} 个硬件测试") + + if not filtered: + st.warning("没有符合条件的测试结果") + return + + # ---------- Run Selection ---------- + options = { + f"{r.get('testcase','unknown')} | {r.get('time','')} | {r.get('run_id','')[:12]}": i + for i, r in enumerate(filtered) + } + + selected = st.multiselect( + "选择要分析的测试运行(可多选对比)", + list(options.keys()), + default=list(options.keys())[:1], + ) + if not selected: + return + + def _load_run_data(run_info): + """Load test result data for a run.""" + identifier = run_info.get("path") or run_info.get("run_id") + return { + **run_info, + "data": st.session_state.data_loader.load_test_result(identifier), + } + + selected_runs = [_load_run_data(filtered[options[k]]) for k in selected] + + tab1, tab2, tab3 = st.tabs(["📈 性能图表", "📊 数据表格", "🔍 详细配置"]) + + # ---------- Charts ---------- + with tab1: + for run in selected_runs: + metrics = run["data"].get("metrics", []) + + # Group metrics by type + mem_metrics = [m for m in metrics if "mem_sweep" in m.get("name", "")] + cache_metrics = [m for m in metrics if "cache" in m.get("name", "")] + stream_metrics = [m for m in metrics if "stream" in m.get("name", "")] + + st.markdown(f"### {run.get('run_id', '')[:16]}") + + # Memory bandwidth plots + if mem_metrics: + 
st.markdown("#### 内存带宽 (Memory Sweep)") + cols = st.columns(min(3, len(mem_metrics))) + for i, m in enumerate(mem_metrics): + with cols[i % len(cols)]: + df = m.get("data") + if df is not None and len(df.columns) >= 2: + fig = plot_hw_mem_sweep( + df, + title=m.get("name", "memory"), + y_log_scale=y_log, + ) + st.plotly_chart(fig, use_container_width=True) + + # Cache bandwidth plots + if cache_metrics: + st.markdown("#### 缓存带宽 (Cache)") + cols = st.columns(min(2, len(cache_metrics))) + for i, m in enumerate(cache_metrics): + with cols[i % len(cols)]: + df = m.get("data") + if df is not None and len(df.columns) >= 2: + fig = plot_hw_cache( + df, + title=m.get("name", "cache"), + y_log_scale=y_log, + ) + st.plotly_chart(fig, use_container_width=True) + + # STREAM benchmark scalars + if stream_metrics: + st.markdown("#### STREAM 基准测试") + stream_data = [] + for m in stream_metrics: + stream_data.append( + { + "指标": m.get("name", ""), + "数值": f"{m.get('value', 0):.2f} {m.get('unit', '')}", + } + ) + if stream_data: + st.dataframe( + pd.DataFrame(stream_data), + use_container_width=True, + hide_index=True, + ) + + # ---------- Tables ---------- + with tab2: + for run in selected_runs: + with st.expander(f"{run.get('run_id')} - 原始数据"): + for m in run["data"].get("metrics", []): + if m.get("data") is None: + continue + st.markdown(f"**{m.get('name')}**") + st.dataframe(m["data"], use_container_width=True, hide_index=True) + + # ---------- Config ---------- + with tab3: + for run in selected_runs: + with st.expander(f"{run.get('run_id')} - 配置与环境"): + summary = create_summary_table_hw(run["data"]) + st.dataframe(summary, use_container_width=True, hide_index=True) + st.markdown("**config**") + st.json(run["data"].get("config", {})) + st.markdown("**environment**") + st.json(run["data"].get("environment", {})) + + +if __name__ == "__main__": + main() diff --git a/dashboard/utils/visualizations/__init__.py b/dashboard/utils/visualizations/__init__.py new file mode 100644 index 
0000000..f897fcb --- /dev/null +++ b/dashboard/utils/visualizations/__init__.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +"""Visualization functions for InfiniMetrics dashboard. + +This package provides visualization utilities organized by test type: +- base: Common/legacy visualization functions +- hardware: Hardware test visualizations (memory sweep, cache bandwidth) +- (future) communication: Communication test visualizations +- (future) inference: Inference test visualizations +- (future) operator: Operator test visualizations +""" + +from .base import ( + plot_metric_vs_size, + plot_comparison_matrix, + create_summary_table, + create_gauge_chart, + plot_timeseries_auto, + create_summary_table_infer, + create_summary_table_ops, +) +from .hardware import ( + create_summary_table_hw, + plot_hw_mem_sweep, + plot_hw_cache, +) + +__all__ = [ + # Base (common/legacy) + "plot_metric_vs_size", + "plot_comparison_matrix", + "create_summary_table", + "create_gauge_chart", + "plot_timeseries_auto", + "create_summary_table_infer", + "create_summary_table_ops", + # Hardware + "create_summary_table_hw", + "plot_hw_mem_sweep", + "plot_hw_cache", +] diff --git a/dashboard/utils/visualizations.py b/dashboard/utils/visualizations/base.py similarity index 82% rename from dashboard/utils/visualizations.py rename to dashboard/utils/visualizations/base.py index 5e396c7..8ba1bc2 100644 --- a/dashboard/utils/visualizations.py +++ b/dashboard/utils/visualizations/base.py @@ -8,7 +8,7 @@ from typing import Dict, List, Any, Optional, Literal import streamlit as st -from .data_loader import get_friendly_size +from utils.data_loader import get_friendly_size def plot_metric_vs_size( @@ -209,15 +209,21 @@ def create_summary_table(test_result: Dict[str, Any]) -> pd.DataFrame: accelerators = machine.get("accelerators", []) if accelerators: acc = accelerators[0] - summary_data.append({"指标": "GPU型号", "数值": acc.get("model", "Unknown")}) - summary_data.append({"指标": "GPU数量", "数值": acc.get("count", 
"Unknown")}) + summary_data.append( + {"指标": "GPU型号", "数值": str(acc.get("model", "Unknown"))} + ) + summary_data.append( + {"指标": "GPU数量", "数值": str(acc.get("count", "Unknown"))} + ) summary_data.append( { "指标": "显存/卡", "数值": f"{acc.get('memory_gb_per_card', 'Unknown')} GB", } ) - summary_data.append({"指标": "CUDA版本", "数值": acc.get("cuda", "Unknown")}) + summary_data.append( + {"指标": "CUDA版本", "数值": str(acc.get("cuda", "Unknown"))} + ) # Test config summary config = test_result.get("config", {}) @@ -231,14 +237,14 @@ def create_summary_table(test_result: Dict[str, Any]) -> pd.DataFrame: ) nodes = resolved.get("nodes") or config.get("nodes", 1) - summary_data.append({"指标": "算子", "数值": config.get("operator", "Unknown")}) - summary_data.append({"指标": "设备数", "数值": device_used}) - summary_data.append({"指标": "节点数", "数值": nodes}) + summary_data.append({"指标": "算子", "数值": str(config.get("operator", "Unknown"))}) + summary_data.append({"指标": "设备数", "数值": str(device_used)}) + summary_data.append({"指标": "节点数", "数值": str(nodes)}) summary_data.append( - {"指标": "预热迭代", "数值": config.get("warmup_iterations", "Unknown")} + {"指标": "预热迭代", "数值": str(config.get("warmup_iterations", "Unknown"))} ) summary_data.append( - {"指标": "测量迭代", "数值": config.get("measured_iterations", "Unknown")} + {"指标": "测量迭代", "数值": str(config.get("measured_iterations", "Unknown"))} ) # Performance summary (extract from metrics if available) @@ -364,40 +370,49 @@ def create_summary_table_infer(test_result: dict) -> pd.DataFrame: try: acc = env["cluster"][0]["machine"]["accelerators"][0] rows += [ - {"指标": "加速卡", "数值": acc.get("model", "Unknown")}, - {"指标": "卡数", "数值": acc.get("count", "Unknown")}, + {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))}, + {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))}, {"指标": "显存/卡", "数值": f"{acc.get('memory_gb_per_card','?')} GB"}, - {"指标": "CUDA", "数值": acc.get("cuda", "Unknown")}, - {"指标": "平台", "数值": acc.get("type", "nvidia")}, + {"指标": "CUDA", "数值": 
str(acc.get("cuda", "Unknown"))}, + {"指标": "平台", "数值": str(acc.get("type", "nvidia"))}, ] except Exception: pass cfg = test_result.get("config", {}) rows += [ - {"指标": "框架", "数值": cfg.get("framework", "unknown")}, - {"指标": "模型", "数值": cfg.get("model", "")}, + {"指标": "框架", "数值": str(cfg.get("framework", "unknown"))}, + {"指标": "模型", "数值": str(cfg.get("model", ""))}, { "指标": "batch", - "数值": (cfg.get("infer_args", {}) or {}).get("static_batch_size", "unknown"), + "数值": str( + (cfg.get("infer_args", {}) or {}).get("static_batch_size", "unknown") + ), }, { "指标": "prompt_tok", - "数值": (cfg.get("infer_args", {}) or {}).get("prompt_token_num", "unknown"), + "数值": str( + (cfg.get("infer_args", {}) or {}).get("prompt_token_num", "unknown") + ), }, { "指标": "output_tok", - "数值": (cfg.get("infer_args", {}) or {}).get("output_token_num", "unknown"), + "数值": str( + (cfg.get("infer_args", {}) or {}).get("output_token_num", "unknown") + ), }, - {"指标": "warmup", "数值": cfg.get("warmup_iterations", "unknown")}, - {"指标": "measured", "数值": cfg.get("measured_iterations", "unknown")}, + {"指标": "warmup", "数值": str(cfg.get("warmup_iterations", "unknown"))}, + {"指标": "measured", "数值": str(cfg.get("measured_iterations", "unknown"))}, ] # scalar metrics quick view for m in test_result.get("metrics", []): if m.get("type") == "scalar": rows.append( - {"指标": m.get("name"), "数值": f"{m.get('value')} {m.get('unit','')}"} + { + "指标": str(m.get("name", "")), + "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(), + } ) return pd.DataFrame(rows) @@ -407,17 +422,22 @@ def create_summary_table_ops(test_result: dict) -> pd.DataFrame: rows = [] cfg = test_result.get("config", {}) - rows.append({"指标": "testcase", "数值": test_result.get("testcase", "")}) + rows.append({"指标": "testcase", "数值": str(test_result.get("testcase", ""))}) # Try to get operator name from config - rows.append({"指标": "算子", "数值": cfg.get("operator", cfg.get("op_name", "Unknown"))}) + rows.append( + { + "指标": "算子", + "数值": 
str(cfg.get("operator", cfg.get("op_name", "Unknown"))), + } + ) # Environment info env = test_result.get("environment", {}) try: acc = env["cluster"][0]["machine"]["accelerators"][0] rows += [ - {"指标": "加速卡", "数值": acc.get("model", "Unknown")}, - {"指标": "卡数", "数值": acc.get("count", "Unknown")}, + {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))}, + {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))}, ] except Exception: pass @@ -425,7 +445,12 @@ def create_summary_table_ops(test_result: dict) -> pd.DataFrame: # Scalar metrics summary scalars = [m for m in test_result.get("metrics", []) if m.get("type") == "scalar"] for m in scalars: - rows.append({"指标": m.get("name"), "数值": f"{m.get('value')} {m.get('unit','')}"}) + rows.append( + { + "指标": str(m.get("name", "")), + "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(), + } + ) # Common config fields fallback for k in [ @@ -436,6 +461,6 @@ def create_summary_table_ops(test_result: dict) -> pd.DataFrame: "measured_iterations", ]: if k in cfg: - rows.append({"指标": k, "数值": cfg.get(k)}) + rows.append({"指标": k, "数值": str(cfg.get(k))}) return pd.DataFrame(rows) diff --git a/dashboard/utils/visualizations/hardware.py b/dashboard/utils/visualizations/hardware.py new file mode 100644 index 0000000..aeec693 --- /dev/null +++ b/dashboard/utils/visualizations/hardware.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +"""Hardware-specific visualization functions for InfiniMetrics dashboard.""" + +import plotly.graph_objects as go +import pandas as pd + +# Color constants +COLOR_MEMORY = "#2196F3" +COLOR_MEMORY_FILL = "rgba(33, 150, 243, 0.1)" +COLOR_CACHE = "#E91E63" +COLOR_CACHE_FILL = "rgba(233, 30, 99, 0.1)" +COLOR_AVG_LINE = "#9E9E9E" +COLOR_GRID = "rgba(200,200,200,0.3)" + +# Layout defaults +_LAYOUT_DEFAULTS = { + "template": "plotly_white", + "height": 450, + "hovermode": "closest", + "margin": dict(t=60, b=40, l=60, r=30), + "showlegend": False, +} + + +def _apply_common_style(fig: go.Figure, title: str, 
xaxis_title: str, yaxis_title: str): + """Apply common layout and styling to a hardware figure.""" + fig.update_layout( + title=dict(text=f"{title}", font=dict(size=14)), + xaxis_title=xaxis_title, + yaxis_title=yaxis_title, + **_LAYOUT_DEFAULTS, + ) + fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor=COLOR_GRID) + fig.update_yaxes( + showgrid=True, gridwidth=1, gridcolor=COLOR_GRID, rangemode="tozero" + ) + + +def _add_avg_line(fig: go.Figure, y_values: pd.Series): + """Add average line annotation to figure.""" + avg_val = y_values.mean() + fig.add_hline( + y=avg_val, + line_dash="dash", + line_color=COLOR_AVG_LINE, + annotation_text=f"Avg: {avg_val:.1f}", + annotation_position="right", + ) + + +def create_summary_table_hw(test_result: dict) -> pd.DataFrame: + """Create summary table for hardware test results.""" + rows = [{"指标": "testcase", "数值": str(test_result.get("testcase", ""))}] + + env = test_result.get("environment", {}) + try: + acc = env["cluster"][0]["machine"]["accelerators"][0] + rows += [ + {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))}, + {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))}, + ] + except (KeyError, IndexError, TypeError): + pass + + for m in test_result.get("metrics", []): + if m.get("type") == "scalar": + rows.append( + { + "指标": str(m.get("name", "")), + "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(), + } + ) + + return pd.DataFrame(rows) + + +def plot_hw_mem_sweep( + df: pd.DataFrame, title: str = "Memory Sweep", y_log_scale: bool = False +) -> go.Figure: + """Plot memory sweep bandwidth: x=size_mb, y=bandwidth_gbps.""" + fig = go.Figure() + + if df is None or df.empty: + fig.update_layout(title=f"{title} (no data)") + return fig + + xcol, ycol = "size_mb", "bandwidth_gbps" + if xcol not in df.columns or ycol not in df.columns: + xcol = df.columns[0] + ycol = df.columns[1] if len(df.columns) > 1 else df.columns[0] + + fig.add_trace( + go.Scatter( + x=df[xcol], + y=df[ycol], + mode="lines+markers", + 
name="Bandwidth", + line=dict(color=COLOR_MEMORY, width=2.5, shape="spline"), + marker=dict(size=8, color=COLOR_MEMORY, line=dict(color="white", width=1)), + fill="tozeroy", + fillcolor=COLOR_MEMORY_FILL, + hovertemplate="%{x} MB
<br>Bandwidth: %{y:.2f} GB/s",
+        )
+    )
+
+    _apply_common_style(fig, title, "Size (MB)", "Bandwidth (GB/s)")
+    fig.update_xaxes(type="log", range=[0, 3])  # 1MB to 1000MB
+    _add_avg_line(fig, df[ycol])
+
+    if y_log_scale:
+        fig.update_yaxes(type="log")
+
+    return fig
+
+
+def plot_hw_cache(
+    df: pd.DataFrame, title: str = "Cache Bandwidth", y_log_scale: bool = False
+) -> go.Figure:
+    """Plot cache bandwidth: x=exec_data or data_set, y=eff_bw."""
+    fig = go.Figure()
+
+    if df is None or df.empty:
+        fig.update_layout(title=f"{title} (no data)")
+        return fig
+
+    # exec_data (L2) preferred over data_set (L1)
+    ycol = "eff_bw"
+    if "exec_data" in df.columns and ycol in df.columns:
+        xcol = "exec_data"
+    elif "data_set" in df.columns and ycol in df.columns:
+        xcol = "data_set"
+    else:
+        xcol = df.columns[0]
+        ycol = df.columns[1] if len(df.columns) > 1 else df.columns[0]
+
+    fig.add_trace(
+        go.Scatter(
+            x=df[xcol],
+            y=df[ycol],
+            mode="lines+markers",
+            name="Effective BW",
+            line=dict(color=COLOR_CACHE, width=2.5, shape="spline"),
+            marker=dict(size=8, color=COLOR_CACHE, line=dict(color="white", width=1)),
+            fill="tozeroy",
+            fillcolor=COLOR_CACHE_FILL,
+            hovertemplate="Data Set: %{x}<br>
BW: %{y:.2f} GB/s", + ) + ) + + _apply_common_style(fig, title, "Data Set", "Effective BW (GB/s)") + _add_avg_line(fig, df[ycol]) + + if y_log_scale: + fig.update_yaxes(type="log") + + return fig