diff --git a/dashboard/app.py b/dashboard/app.py
index 5bbbfb2..26205c9 100644
--- a/dashboard/app.py
+++ b/dashboard/app.py
@@ -214,8 +214,9 @@ def _latest(lst):
latest_infer = _latest(infer_runs)
latest_ops = _latest(ops_runs)
latest_train = _latest(train_runs)
+ latest_hw = _latest(hw_runs)
- colA, colB, colC, colD = st.columns(4)
+ colA, colB, colC, colD, colE = st.columns(5)
with colA:
st.markdown("#### 🔗 通信(最新)")
@@ -255,6 +256,15 @@ def _latest(lst):
st.write(f"- time: {latest_train.get('time','')}")
st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}")
+ with colE:
+ st.markdown("#### 🔧 硬件(最新)")
+ if not latest_hw:
+ st.info("暂无硬件结果")
+ else:
+ st.write(f"- testcase: `{latest_hw.get('testcase','')}`")
+ st.write(f"- time: {latest_hw.get('time','')}")
+ st.write(f"- status: {'✅' if latest_hw.get('success') else '❌'}")
+
st.divider()
# ========== Recent runs table ==========
@@ -311,7 +321,7 @@ def _latest(lst):
st.markdown("---")
st.markdown("### 🚀 快速导航")
- col1, col2, col3, col4 = st.columns(4)
+ col1, col2, col3, col4, col5 = st.columns(5)
if col1.button("🔗 通信测试分析", use_container_width=True):
st.switch_page("pages/communication.py")
if col2.button("⚡ 算子测试分析", use_container_width=True):
@@ -320,6 +330,8 @@ def _latest(lst):
st.switch_page("pages/inference.py")
if col4.button("🏋️ 训练测试分析", use_container_width=True):
st.switch_page("pages/training.py")
+ if col5.button("🔧 硬件测试分析", use_container_width=True):
+ st.switch_page("pages/hardware.py")
except Exception as e:
st.error(f"Dashboard 加载失败: {e}")
diff --git a/dashboard/pages/hardware.py b/dashboard/pages/hardware.py
new file mode 100644
index 0000000..d658a08
--- /dev/null
+++ b/dashboard/pages/hardware.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""Hardware tests analysis page."""
+
+import streamlit as st
+import pandas as pd
+
+from common import init_page, show_data_source_info
+from components.header import render_header
+from utils.visualizations import (
+ create_summary_table_hw,
+ plot_hw_mem_sweep,
+ plot_hw_cache,
+)
+
+init_page("硬件测试分析 | InfiniMetrics", "🔧")
+
+
+def main():
+ render_header()
+ st.markdown("## 🔧 硬件性能测试分析")
+
+ show_data_source_info()
+
+ runs = st.session_state.data_loader.list_test_runs()
+ # Identify hardware runs by testcase starting with hardware
+ hw_runs = [r for r in runs if (r.get("testcase") or "").startswith("hardware")]
+
+ if not hw_runs:
+ st.info("未找到硬件测试结果(testcase 需以 hardware.* 开头)。")
+ return
+
+ # ---------- Sidebar Filters ----------
+ with st.sidebar:
+ st.markdown("### 🔍 筛选条件")
+ only_success = st.checkbox("仅显示成功测试", value=True)
+ y_log = st.checkbox("Y轴对数刻度(可选)", value=False)
+
+ filtered = [r for r in hw_runs if (not only_success or r.get("success"))]
+
+ st.caption(f"找到 {len(filtered)} 个硬件测试")
+
+ if not filtered:
+ st.warning("没有符合条件的测试结果")
+ return
+
+ # ---------- Run Selection ----------
+ options = {
+ f"{r.get('testcase','unknown')} | {r.get('time','')} | {r.get('run_id','')[:12]}": i
+ for i, r in enumerate(filtered)
+ }
+
+ selected = st.multiselect(
+ "选择要分析的测试运行(可多选对比)",
+ list(options.keys()),
+ default=list(options.keys())[:1],
+ )
+ if not selected:
+ return
+
+ def _load_run_data(run_info):
+ """Load test result data for a run."""
+ identifier = run_info.get("path") or run_info.get("run_id")
+ return {
+ **run_info,
+ "data": st.session_state.data_loader.load_test_result(identifier),
+ }
+
+ selected_runs = [_load_run_data(filtered[options[k]]) for k in selected]
+
+ tab1, tab2, tab3 = st.tabs(["📈 性能图表", "📊 数据表格", "🔍 详细配置"])
+
+ # ---------- Charts ----------
+ with tab1:
+ for run in selected_runs:
+ metrics = run["data"].get("metrics", [])
+
+ # Group metrics by type
+ mem_metrics = [m for m in metrics if "mem_sweep" in m.get("name", "")]
+ cache_metrics = [m for m in metrics if "cache" in m.get("name", "")]
+ stream_metrics = [m for m in metrics if "stream" in m.get("name", "")]
+
+ st.markdown(f"### {run.get('run_id', '')[:16]}")
+
+ # Memory bandwidth plots
+ if mem_metrics:
+ st.markdown("#### 内存带宽 (Memory Sweep)")
+ cols = st.columns(min(3, len(mem_metrics)))
+ for i, m in enumerate(mem_metrics):
+ with cols[i % len(cols)]:
+ df = m.get("data")
+ if df is not None and len(df.columns) >= 2:
+ fig = plot_hw_mem_sweep(
+ df,
+ title=m.get("name", "memory"),
+ y_log_scale=y_log,
+ )
+ st.plotly_chart(fig, use_container_width=True)
+
+ # Cache bandwidth plots
+ if cache_metrics:
+ st.markdown("#### 缓存带宽 (Cache)")
+ cols = st.columns(min(2, len(cache_metrics)))
+ for i, m in enumerate(cache_metrics):
+ with cols[i % len(cols)]:
+ df = m.get("data")
+ if df is not None and len(df.columns) >= 2:
+ fig = plot_hw_cache(
+ df,
+ title=m.get("name", "cache"),
+ y_log_scale=y_log,
+ )
+ st.plotly_chart(fig, use_container_width=True)
+
+ # STREAM benchmark scalars
+ if stream_metrics:
+ st.markdown("#### STREAM 基准测试")
+ stream_data = []
+ for m in stream_metrics:
+ stream_data.append(
+ {
+ "指标": m.get("name", ""),
+ "数值": f"{m.get('value', 0):.2f} {m.get('unit', '')}",
+ }
+ )
+ if stream_data:
+ st.dataframe(
+ pd.DataFrame(stream_data),
+ use_container_width=True,
+ hide_index=True,
+ )
+
+ # ---------- Tables ----------
+ with tab2:
+ for run in selected_runs:
+ with st.expander(f"{run.get('run_id')} - 原始数据"):
+ for m in run["data"].get("metrics", []):
+ if m.get("data") is None:
+ continue
+ st.markdown(f"**{m.get('name')}**")
+ st.dataframe(m["data"], use_container_width=True, hide_index=True)
+
+ # ---------- Config ----------
+ with tab3:
+ for run in selected_runs:
+ with st.expander(f"{run.get('run_id')} - 配置与环境"):
+ summary = create_summary_table_hw(run["data"])
+ st.dataframe(summary, use_container_width=True, hide_index=True)
+ st.markdown("**config**")
+ st.json(run["data"].get("config", {}))
+ st.markdown("**environment**")
+ st.json(run["data"].get("environment", {}))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/dashboard/utils/visualizations/__init__.py b/dashboard/utils/visualizations/__init__.py
new file mode 100644
index 0000000..f897fcb
--- /dev/null
+++ b/dashboard/utils/visualizations/__init__.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Visualization functions for InfiniMetrics dashboard.
+
+This package provides visualization utilities organized by test type:
+- base: Common/legacy visualization functions
+- hardware: Hardware test visualizations (memory sweep, cache bandwidth)
+- (future) communication: Communication test visualizations
+- (future) inference: Inference test visualizations
+- (future) operator: Operator test visualizations
+"""
+
+from .base import (
+ plot_metric_vs_size,
+ plot_comparison_matrix,
+ create_summary_table,
+ create_gauge_chart,
+ plot_timeseries_auto,
+ create_summary_table_infer,
+ create_summary_table_ops,
+)
+from .hardware import (
+ create_summary_table_hw,
+ plot_hw_mem_sweep,
+ plot_hw_cache,
+)
+
+__all__ = [
+ # Base (common/legacy)
+ "plot_metric_vs_size",
+ "plot_comparison_matrix",
+ "create_summary_table",
+ "create_gauge_chart",
+ "plot_timeseries_auto",
+ "create_summary_table_infer",
+ "create_summary_table_ops",
+ # Hardware
+ "create_summary_table_hw",
+ "plot_hw_mem_sweep",
+ "plot_hw_cache",
+]
diff --git a/dashboard/utils/visualizations.py b/dashboard/utils/visualizations/base.py
similarity index 82%
rename from dashboard/utils/visualizations.py
rename to dashboard/utils/visualizations/base.py
index 5e396c7..8ba1bc2 100644
--- a/dashboard/utils/visualizations.py
+++ b/dashboard/utils/visualizations/base.py
@@ -8,7 +8,7 @@
from typing import Dict, List, Any, Optional, Literal
import streamlit as st
-from .data_loader import get_friendly_size
+from utils.data_loader import get_friendly_size
def plot_metric_vs_size(
@@ -209,15 +209,21 @@ def create_summary_table(test_result: Dict[str, Any]) -> pd.DataFrame:
accelerators = machine.get("accelerators", [])
if accelerators:
acc = accelerators[0]
- summary_data.append({"指标": "GPU型号", "数值": acc.get("model", "Unknown")})
- summary_data.append({"指标": "GPU数量", "数值": acc.get("count", "Unknown")})
+ summary_data.append(
+ {"指标": "GPU型号", "数值": str(acc.get("model", "Unknown"))}
+ )
+ summary_data.append(
+ {"指标": "GPU数量", "数值": str(acc.get("count", "Unknown"))}
+ )
summary_data.append(
{
"指标": "显存/卡",
"数值": f"{acc.get('memory_gb_per_card', 'Unknown')} GB",
}
)
- summary_data.append({"指标": "CUDA版本", "数值": acc.get("cuda", "Unknown")})
+ summary_data.append(
+ {"指标": "CUDA版本", "数值": str(acc.get("cuda", "Unknown"))}
+ )
# Test config summary
config = test_result.get("config", {})
@@ -231,14 +237,14 @@ def create_summary_table(test_result: Dict[str, Any]) -> pd.DataFrame:
)
nodes = resolved.get("nodes") or config.get("nodes", 1)
- summary_data.append({"指标": "算子", "数值": config.get("operator", "Unknown")})
- summary_data.append({"指标": "设备数", "数值": device_used})
- summary_data.append({"指标": "节点数", "数值": nodes})
+ summary_data.append({"指标": "算子", "数值": str(config.get("operator", "Unknown"))})
+ summary_data.append({"指标": "设备数", "数值": str(device_used)})
+ summary_data.append({"指标": "节点数", "数值": str(nodes)})
summary_data.append(
- {"指标": "预热迭代", "数值": config.get("warmup_iterations", "Unknown")}
+ {"指标": "预热迭代", "数值": str(config.get("warmup_iterations", "Unknown"))}
)
summary_data.append(
- {"指标": "测量迭代", "数值": config.get("measured_iterations", "Unknown")}
+ {"指标": "测量迭代", "数值": str(config.get("measured_iterations", "Unknown"))}
)
# Performance summary (extract from metrics if available)
@@ -364,40 +370,49 @@ def create_summary_table_infer(test_result: dict) -> pd.DataFrame:
try:
acc = env["cluster"][0]["machine"]["accelerators"][0]
rows += [
- {"指标": "加速卡", "数值": acc.get("model", "Unknown")},
- {"指标": "卡数", "数值": acc.get("count", "Unknown")},
+ {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))},
+ {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))},
{"指标": "显存/卡", "数值": f"{acc.get('memory_gb_per_card','?')} GB"},
- {"指标": "CUDA", "数值": acc.get("cuda", "Unknown")},
- {"指标": "平台", "数值": acc.get("type", "nvidia")},
+ {"指标": "CUDA", "数值": str(acc.get("cuda", "Unknown"))},
+ {"指标": "平台", "数值": str(acc.get("type", "nvidia"))},
]
except Exception:
pass
cfg = test_result.get("config", {})
rows += [
- {"指标": "框架", "数值": cfg.get("framework", "unknown")},
- {"指标": "模型", "数值": cfg.get("model", "")},
+ {"指标": "框架", "数值": str(cfg.get("framework", "unknown"))},
+ {"指标": "模型", "数值": str(cfg.get("model", ""))},
{
"指标": "batch",
- "数值": (cfg.get("infer_args", {}) or {}).get("static_batch_size", "unknown"),
+ "数值": str(
+ (cfg.get("infer_args", {}) or {}).get("static_batch_size", "unknown")
+ ),
},
{
"指标": "prompt_tok",
- "数值": (cfg.get("infer_args", {}) or {}).get("prompt_token_num", "unknown"),
+ "数值": str(
+ (cfg.get("infer_args", {}) or {}).get("prompt_token_num", "unknown")
+ ),
},
{
"指标": "output_tok",
- "数值": (cfg.get("infer_args", {}) or {}).get("output_token_num", "unknown"),
+ "数值": str(
+ (cfg.get("infer_args", {}) or {}).get("output_token_num", "unknown")
+ ),
},
- {"指标": "warmup", "数值": cfg.get("warmup_iterations", "unknown")},
- {"指标": "measured", "数值": cfg.get("measured_iterations", "unknown")},
+ {"指标": "warmup", "数值": str(cfg.get("warmup_iterations", "unknown"))},
+ {"指标": "measured", "数值": str(cfg.get("measured_iterations", "unknown"))},
]
# scalar metrics quick view
for m in test_result.get("metrics", []):
if m.get("type") == "scalar":
rows.append(
- {"指标": m.get("name"), "数值": f"{m.get('value')} {m.get('unit','')}"}
+ {
+ "指标": str(m.get("name", "")),
+ "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(),
+ }
)
return pd.DataFrame(rows)
@@ -407,17 +422,22 @@ def create_summary_table_ops(test_result: dict) -> pd.DataFrame:
rows = []
cfg = test_result.get("config", {})
- rows.append({"指标": "testcase", "数值": test_result.get("testcase", "")})
+ rows.append({"指标": "testcase", "数值": str(test_result.get("testcase", ""))})
# Try to get operator name from config
- rows.append({"指标": "算子", "数值": cfg.get("operator", cfg.get("op_name", "Unknown"))})
+ rows.append(
+ {
+ "指标": "算子",
+ "数值": str(cfg.get("operator", cfg.get("op_name", "Unknown"))),
+ }
+ )
# Environment info
env = test_result.get("environment", {})
try:
acc = env["cluster"][0]["machine"]["accelerators"][0]
rows += [
- {"指标": "加速卡", "数值": acc.get("model", "Unknown")},
- {"指标": "卡数", "数值": acc.get("count", "Unknown")},
+ {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))},
+ {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))},
]
except Exception:
pass
@@ -425,7 +445,12 @@ def create_summary_table_ops(test_result: dict) -> pd.DataFrame:
# Scalar metrics summary
scalars = [m for m in test_result.get("metrics", []) if m.get("type") == "scalar"]
for m in scalars:
- rows.append({"指标": m.get("name"), "数值": f"{m.get('value')} {m.get('unit','')}"})
+ rows.append(
+ {
+ "指标": str(m.get("name", "")),
+ "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(),
+ }
+ )
# Common config fields fallback
for k in [
@@ -436,6 +461,6 @@ def create_summary_table_ops(test_result: dict) -> pd.DataFrame:
"measured_iterations",
]:
if k in cfg:
- rows.append({"指标": k, "数值": cfg.get(k)})
+ rows.append({"指标": k, "数值": str(cfg.get(k))})
return pd.DataFrame(rows)
diff --git a/dashboard/utils/visualizations/hardware.py b/dashboard/utils/visualizations/hardware.py
new file mode 100644
index 0000000..aeec693
--- /dev/null
+++ b/dashboard/utils/visualizations/hardware.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""Hardware-specific visualization functions for InfiniMetrics dashboard."""
+
+import plotly.graph_objects as go
+import pandas as pd
+
+# Color constants
+COLOR_MEMORY = "#2196F3"
+COLOR_MEMORY_FILL = "rgba(33, 150, 243, 0.1)"
+COLOR_CACHE = "#E91E63"
+COLOR_CACHE_FILL = "rgba(233, 30, 99, 0.1)"
+COLOR_AVG_LINE = "#9E9E9E"
+COLOR_GRID = "rgba(200,200,200,0.3)"
+
+# Layout defaults
+_LAYOUT_DEFAULTS = {
+ "template": "plotly_white",
+ "height": 450,
+ "hovermode": "closest",
+ "margin": dict(t=60, b=40, l=60, r=30),
+ "showlegend": False,
+}
+
+
+def _apply_common_style(fig: go.Figure, title: str, xaxis_title: str, yaxis_title: str):
+ """Apply common layout and styling to a hardware figure."""
+ fig.update_layout(
+ title=dict(text=f"{title}", font=dict(size=14)),
+ xaxis_title=xaxis_title,
+ yaxis_title=yaxis_title,
+ **_LAYOUT_DEFAULTS,
+ )
+ fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor=COLOR_GRID)
+ fig.update_yaxes(
+ showgrid=True, gridwidth=1, gridcolor=COLOR_GRID, rangemode="tozero"
+ )
+
+
+def _add_avg_line(fig: go.Figure, y_values: pd.Series):
+ """Add average line annotation to figure."""
+ avg_val = y_values.mean()
+ fig.add_hline(
+ y=avg_val,
+ line_dash="dash",
+ line_color=COLOR_AVG_LINE,
+ annotation_text=f"Avg: {avg_val:.1f}",
+ annotation_position="right",
+ )
+
+
+def create_summary_table_hw(test_result: dict) -> pd.DataFrame:
+ """Create summary table for hardware test results."""
+ rows = [{"指标": "testcase", "数值": str(test_result.get("testcase", ""))}]
+
+ env = test_result.get("environment", {})
+ try:
+ acc = env["cluster"][0]["machine"]["accelerators"][0]
+ rows += [
+ {"指标": "加速卡", "数值": str(acc.get("model", "Unknown"))},
+ {"指标": "卡数", "数值": str(acc.get("count", "Unknown"))},
+ ]
+ except (KeyError, IndexError, TypeError):
+ pass
+
+ for m in test_result.get("metrics", []):
+ if m.get("type") == "scalar":
+ rows.append(
+ {
+ "指标": str(m.get("name", "")),
+ "数值": f"{m.get('value', '')} {m.get('unit', '')}".strip(),
+ }
+ )
+
+ return pd.DataFrame(rows)
+
+
+def plot_hw_mem_sweep(
+ df: pd.DataFrame, title: str = "Memory Sweep", y_log_scale: bool = False
+) -> go.Figure:
+ """Plot memory sweep bandwidth: x=size_mb, y=bandwidth_gbps."""
+ fig = go.Figure()
+
+ if df is None or df.empty:
+ fig.update_layout(title=f"{title} (no data)")
+ return fig
+
+ xcol, ycol = "size_mb", "bandwidth_gbps"
+ if xcol not in df.columns or ycol not in df.columns:
+ xcol = df.columns[0]
+ ycol = df.columns[1] if len(df.columns) > 1 else df.columns[0]
+
+ fig.add_trace(
+ go.Scatter(
+ x=df[xcol],
+ y=df[ycol],
+ mode="lines+markers",
+ name="Bandwidth",
+ line=dict(color=COLOR_MEMORY, width=2.5, shape="spline"),
+ marker=dict(size=8, color=COLOR_MEMORY, line=dict(color="white", width=1)),
+ fill="tozeroy",
+ fillcolor=COLOR_MEMORY_FILL,
+            hovertemplate="%{x} MB<br>Bandwidth: %{y:.2f} GB/s",
+ )
+ )
+
+ _apply_common_style(fig, title, "Size (MB)", "Bandwidth (GB/s)")
+ fig.update_xaxes(type="log", range=[0, 3]) # 1MB to 1000MB
+ _add_avg_line(fig, df[ycol])
+
+ if y_log_scale:
+ fig.update_yaxes(type="log")
+
+ return fig
+
+
+def plot_hw_cache(
+ df: pd.DataFrame, title: str = "Cache Bandwidth", y_log_scale: bool = False
+) -> go.Figure:
+ """Plot cache bandwidth: x=exec_data or data_set, y=eff_bw."""
+ fig = go.Figure()
+
+ if df is None or df.empty:
+ fig.update_layout(title=f"{title} (no data)")
+ return fig
+
+ # exec_data (L2) preferred over data_set (L1)
+ ycol = "eff_bw"
+ if "exec_data" in df.columns and ycol in df.columns:
+ xcol = "exec_data"
+ elif "data_set" in df.columns and ycol in df.columns:
+ xcol = "data_set"
+ else:
+ xcol = df.columns[0]
+ ycol = df.columns[1] if len(df.columns) > 1 else df.columns[0]
+
+ fig.add_trace(
+ go.Scatter(
+ x=df[xcol],
+ y=df[ycol],
+ mode="lines+markers",
+ name="Effective BW",
+ line=dict(color=COLOR_CACHE, width=2.5, shape="spline"),
+ marker=dict(size=8, color=COLOR_CACHE, line=dict(color="white", width=1)),
+ fill="tozeroy",
+ fillcolor=COLOR_CACHE_FILL,
+            hovertemplate="Data Set: %{x}<br>BW: %{y:.2f} GB/s",
+ )
+ )
+
+ _apply_common_style(fig, title, "Data Set", "Effective BW (GB/s)")
+ _add_avg_line(fig, df[ycol])
+
+ if y_log_scale:
+ fig.update_yaxes(type="log")
+
+ return fig