PaddlePaddle · lizexu123 · Apr 15, 2026 · Apr 15, 2026 · Apr 18, 2026 · Apr 18, 2026
diff --git a/.claude/skills/benchmark-compare/scripts/extract_metrics.py b/.claude/skills/benchmark-compare/scripts/extract_metrics.py
@@ -1,13 +1,18 @@
 #!/usr/bin/env python3
 """extract_metrics.py — 从 benchmark 结果文件提取指标，输出结构化 JSON
 
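+Supported frameworks: fd (FastDeploy) / sg (SGLang) / vllm (vLLM)
+Any framework's result may be omitted (at least one is required); omitted ones are excluded from the comparison.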
+
 用法:
     python3 extract_metrics.py \
         --fd-result <FD_RESULT.txt> \
         --sg-result <SG_RESULT.txt> \
+        --vllm-result <VLLM_RESULT.txt> \
         --model-path <MODEL_PATH> \
         --fd-config '{"gpu":"H800","tp":1,"concurrency":32}' \
         --sg-config '{"gpu":"H800","tp":1,"concurrency":32}' \
+        --vllm-config '{"gpu":"H800","tp":1,"concurrency":32}' \
         --output <metrics.json>
 """
 
@@ -18,12 +23,16 @@
 import subprocess
 import sys
 
+# Supported framework keys; this order drives iteration, the --baseline choices and summary columns
+FRAMEWORKS = ("fd", "sg", "vllm")
+
 
 def parse_benchmark_result(filepath):
     """解析 benchmark_serving.py 的输出文件，提取所有指标"""
     metrics = {}
-    if not os.path.isfile(filepath):
-        print(f"[WARN] 结果文件不存在: {filepath}", file=sys.stderr)
+    if not filepath or not os.path.isfile(filepath):
+        if filepath:
+            print(f"[WARN] 结果文件不存在: {filepath}", file=sys.stderr)
         return metrics
 
     with open(filepath, "r") as f:
@@ -110,70 +119,104 @@ def get_model_info(model_path):
     return info
 
 
-def compute_comparison(fd_metrics, sg_metrics):
-    """计算对比指标（差异百分比、胜出方）"""
+# Throughput-type metrics: higher is better (the winner is the framework with the largest value)
+HIGHER_IS_BETTER = {
+    "total_token_throughput",
+    "output_token_throughput",
+    "request_throughput",
+    "mean_decode",
+    "median_decode",
+    "p80_decode",
+    "p95_decode",
+    "p99_decode",
+}
+
+# Latency-type metrics: lower is better (the winner is the framework with the smallest value)
+LOWER_IS_BETTER = {
+    "mean_ttft",
+    "median_ttft",
+    "p80_ttft",
+    "p95_ttft",
+    "p99_ttft",
+    "mean_tpot",
+    "median_tpot",
+    "p80_tpot",
+    "p95_tpot",
+    "p99_tpot",
+    "mean_itl",
+    "median_itl",
+    "p80_itl",
+    "p95_itl",
+    "p99_itl",
+    "mean_e2el",
+    "median_e2el",
+    "p80_e2el",
+    "p95_e2el",
+    "p99_e2el",
+    "benchmark_duration",
+}
+
+
+def compute_comparison(all_metrics, baseline="sg"):
+    """计算多框架对比指标。
+
+    all_metrics: {"fd": {...}, "sg": {...}, "vllm": {...}}（任意 key 可为空 dict）
+    baseline:    用于计算 diff_pct 的基准框架（默认 SGLang）
+
+    返回:
+    {
+      metric_key: {
+        "fd": ..., "sg": ..., "vllm": ...,
+        "diff_pct": {"fd": ..., "vllm": ...},   # 相对 baseline
+        "winner": "fd" | "sg" | "vllm" | "tie"
+      }
+    }
+    """
     comparison = {}
 
-    # 吞吐类指标：越高越好
-    higher_is_better = {
-        "total_token_throughput",
-        "output_token_throughput",
-        "request_throughput",
-        "mean_decode",
-        "median_decode",
-        "p80_decode",
-        "p95_decode",
-        "p99_decode",
-    }
+    # 只比较实际有数据的框架
+    active = [fw for fw in FRAMEWORKS if all_metrics.get(fw)]
+    if not active:
+        return comparison
 
-    # 延迟类指标：越低越好
-    lower_is_better = {
-        "mean_ttft",
-        "median_ttft",
-        "p80_ttft",
-        "p95_ttft",
-        "p99_ttft",
-        "mean_tpot",
-        "median_tpot",
-        "p80_tpot",
-        "p95_tpot",
-        "p99_tpot",
-        "mean_itl",
-        "median_itl",
-        "p80_itl",
-        "p95_itl",
-        "p99_itl",
-        "mean_e2el",
-        "median_e2el",
-        "p80_e2el",
-        "p95_e2el",
-        "p99_e2el",
-        "benchmark_duration",
-    }
-
-    all_keys = set(fd_metrics.keys()) | set(sg_metrics.keys())
+    # 收集所有指标 key
+    all_keys = set()
+    for fw in active:
+        all_keys |= set(all_metrics[fw].keys())
 
     for key in sorted(all_keys):
-        fd_val = fd_metrics.get(key)
-        sg_val = sg_metrics.get(key)
-
-        if fd_val is None or sg_val is None:
+        entry = {}
+        per_fw_val = {}
+        for fw in active:
+            val = all_metrics[fw].get(key)
+            if val is None:
+                continue
+            entry[fw] = val
+            per_fw_val[fw] = val
+
+        if len(per_fw_val) < 2:
+            # 单框架数据，无法对比但仍记录
+            comparison[key] = entry
             continue
 
-        entry = {"fd": fd_val, "sg": sg_val}
-
-        # 计算差异百分比 (FD 相对于 SG)
-        if sg_val != 0:
-            diff_pct = round((fd_val - sg_val) / sg_val * 100, 2)
-        else:
-            diff_pct = 0
-        entry["diff_pct"] = diff_pct
+        # 计算相对 baseline 的差异百分比
+        diff_pct = {}
+        base_val = per_fw_val.get(baseline)
+        for fw, val in per_fw_val.items():
+            if fw == baseline or base_val is None:
+                continue
+            if base_val != 0:
+                diff_pct[fw] = round((val - base_val) / base_val * 100, 2)
+            else:
+                diff_pct[fw] = 0
+        if diff_pct:
+            entry["diff_pct"] = diff_pct
 
         # 判断胜出方
-        if key in higher_is_better:
-            entry["winner"] = "fd" if fd_val > sg_val else "sg"
-        elif key in lower_is_better:
-            entry["winner"] = "fd" if fd_val < sg_val else "sg"
+        if key in HIGHER_IS_BETTER:
+            entry["winner"] = max(per_fw_val, key=per_fw_val.get)
+        elif key in LOWER_IS_BETTER:
+            entry["winner"] = min(per_fw_val, key=per_fw_val.get)
         else:
             entry["winner"] = "tie"
 
@@ -184,40 +227,65 @@ def compute_comparison(fd_metrics, sg_metrics):
 
 def main():
     parser = argparse.ArgumentParser(description="从 benchmark 结果提取指标并生成对比 JSON")
-    parser.add_argument("--fd-result", required=True, help="FastDeploy 结果文件路径")
-    parser.add_argument("--sg-result", required=True, help="SGLang 结果文件路径")
+    parser.add_argument("--fd-result", default=None, help="FastDeploy 结果文件路径")
+    parser.add_argument("--sg-result", default=None, help="SGLang 结果文件路径")
+    parser.add_argument("--vllm-result", default=None, help="vLLM 结果文件路径")
     parser.add_argument("--model-path", required=True, help="模型权重目录路径")
     parser.add_argument("--fd-config", default="{}", help="FD 部署配置 JSON 字符串")
     parser.add_argument("--sg-config", default="{}", help="SG 部署配置 JSON 字符串")
+    parser.add_argument("--vllm-config", default="{}", help="vLLM 部署配置 JSON 字符串")
+    parser.add_argument(
+        "--baseline", default="sg", choices=FRAMEWORKS, help="对比基准框架（计算 diff_pct 用），默认 sg"
+    )
     parser.add_argument("--output", default="metrics.json", help="输出 JSON 路径")
     args = parser.parse_args()
 
-    print(f"[INFO] 解析 FD 结果: {args.fd_result}")
-    fd_metrics = parse_benchmark_result(args.fd_result)
-    print(f"[INFO] 解析 SG 结果: {args.sg_result}")
-    sg_metrics = parse_benchmark_result(args.sg_result)
+    # 至少需要一份结果
+    if not any([args.fd_result, args.sg_result, args.vllm_result]):
+        parser.error("至少需要提供 --fd-result / --sg-result / --vllm-result 中的一个")
+
+    result_paths = {
+        "fd": args.fd_result,
+        "sg": args.sg_result,
+        "vllm": args.vllm_result,
+    }
+    config_strs = {
+        "fd": args.fd_config,
+        "sg": args.sg_config,
+        "vllm": args.vllm_config,
+    }
+    framework_display = {"fd": "FastDeploy", "sg": "SGLang", "vllm": "vLLM"}
+
+    all_metrics = {}
+    for fw in FRAMEWORKS:
+        path = result_paths[fw]
+        if path:
+            print(f"[INFO] 解析 {framework_display[fw]} 结果: {path}")
+            all_metrics[fw] = parse_benchmark_result(path)
+        else:
+            all_metrics[fw] = {}
 
     print(f"[INFO] 读取模型信息: {args.model_path}")
     model_info = get_model_info(args.model_path)
 
-    print("[INFO] 计算对比指标...")
-    comparison = compute_comparison(fd_metrics, sg_metrics)
+    print(f"[INFO] 计算对比指标 (baseline={args.baseline})...")
+    comparison = compute_comparison(all_metrics, baseline=args.baseline)
 
     # 解析部署配置
-    fd_config = json.loads(args.fd_config) if args.fd_config else {}
-    sg_config = json.loads(args.sg_config) if args.sg_config else {}
+    configs = {}
+    for fw in FRAMEWORKS:
+        try:
+            configs[fw] = json.loads(config_strs[fw]) if config_strs[fw] else {}
+        except json.JSONDecodeError as e:
+            print(f"[WARN] 解析 --{fw}-config 失败: {e}", file=sys.stderr)
+            configs[fw] = {}
 
     output = {
         "model": model_info,
-        "config": {
-            "fd": fd_config,
-            "sg": sg_config,
-        },
-        "raw_metrics": {
-            "fd": fd_metrics,
-            "sg": sg_metrics,
-        },
+        "config": configs,
+        "raw_metrics": all_metrics,
         "comparison": comparison,
+        "baseline": args.baseline,
     }
 
     with open(args.output, "w") as f:
@@ -236,14 +304,29 @@ def main():
         "mean_decode",
         "benchmark_duration",
     ]
+    active = [fw for fw in FRAMEWORKS if all_metrics.get(fw)]
+    if not active:
+        print("[WARN] 没有任何有效的结果数据")
+        return
+
     print("\n========== 核心指标摘要 ==========")
-    print(f"{'Metric':<30} {'FD':>12} {'SG':>12} {'Diff%':>8} {'Winner':>8}")
-    print("-" * 72)
+    header = f"{'Metric':<30}"
+    for fw in active:
+        header += f" {framework_display[fw]:>12}"
+    header += f" {'Winner':>10}"
+    print(header)
+    print("-" * len(header))
     for key in key_metrics:
-        if key in comparison:
-            c = comparison[key]
-            print(f"{key:<30} {c['fd']:>12.2f} {c['sg']:>12.2f} {c['diff_pct']:>+7.1f}% {c['winner']:>8}")
-    print("=" * 72)
+        if key not in comparison:
+            continue
+        c = comparison[key]
+        line = f"{key:<30}"
+        for fw in active:
+            val = c.get(fw)
+            line += f" {val:>12.2f}" if isinstance(val, (int, float)) else f" {'-':>12}"
+        line += f" {c.get('winner', '-'):>10}"
+        print(line)
+    print("=" * len(header))
 
 
 if __name__ == "__main__":