diff --git a/.claude/skills/benchmark-compare/scripts/extract_metrics.py b/.claude/skills/benchmark-compare/scripts/extract_metrics.py index 6423c139017..b3eabcc2b79 100644 --- a/.claude/skills/benchmark-compare/scripts/extract_metrics.py +++ b/.claude/skills/benchmark-compare/scripts/extract_metrics.py @@ -1,13 +1,18 @@ #!/usr/bin/env python3 """extract_metrics.py — 从 benchmark 结果文件提取指标,输出结构化 JSON +支持框架: fd (FastDeploy) / sg (SGLang) / vllm (vLLM) +任意框架结果均可缺省,缺省的不参与对比。 + 用法: python3 extract_metrics.py \ --fd-result \ --sg-result \ + --vllm-result \ --model-path \ --fd-config '{"gpu":"H800","tp":1,"concurrency":32}' \ --sg-config '{"gpu":"H800","tp":1,"concurrency":32}' \ + --vllm-config '{"gpu":"H800","tp":1,"concurrency":32}' \ --output """ @@ -18,12 +23,16 @@ import subprocess import sys +# 支持的框架列表 +FRAMEWORKS = ("fd", "sg", "vllm") + def parse_benchmark_result(filepath): """解析 benchmark_serving.py 的输出文件,提取所有指标""" metrics = {} - if not os.path.isfile(filepath): - print(f"[WARN] 结果文件不存在: {filepath}", file=sys.stderr) + if not filepath or not os.path.isfile(filepath): + if filepath: + print(f"[WARN] 结果文件不存在: {filepath}", file=sys.stderr) return metrics with open(filepath, "r") as f: @@ -110,70 +119,104 @@ def get_model_info(model_path): return info -def compute_comparison(fd_metrics, sg_metrics): - """计算对比指标(差异百分比、胜出方)""" +# 吞吐类指标:越高越好 +HIGHER_IS_BETTER = { + "total_token_throughput", + "output_token_throughput", + "request_throughput", + "mean_decode", + "median_decode", + "p80_decode", + "p95_decode", + "p99_decode", +} + +# 延迟类指标:越低越好 +LOWER_IS_BETTER = { + "mean_ttft", + "median_ttft", + "p80_ttft", + "p95_ttft", + "p99_ttft", + "mean_tpot", + "median_tpot", + "p80_tpot", + "p95_tpot", + "p99_tpot", + "mean_itl", + "median_itl", + "p80_itl", + "p95_itl", + "p99_itl", + "mean_e2el", + "median_e2el", + "p80_e2el", + "p95_e2el", + "p99_e2el", + "benchmark_duration", +} + + +def compute_comparison(all_metrics, baseline="sg"): + """计算多框架对比指标。 + + all_metrics: {"fd": {...}, "sg": {...}, "vllm": {...}}(任意 key 可为空 dict) + baseline: 用于计算 diff_pct 的基准框架(默认 SGLang) + + 返回: + { + metric_key: { + "fd": ..., "sg": ..., "vllm": ..., + "diff_pct": {"fd": ..., "vllm": ...}, # 相对 baseline + "winner": "fd" | "sg" | "vllm" | "tie" + } + } + """ comparison = {} - # 吞吐类指标:越高越好 - higher_is_better = { - "total_token_throughput", - "output_token_throughput", - "request_throughput", - "mean_decode", - "median_decode", - "p80_decode", - "p95_decode", - "p99_decode", - } + # 只比较实际有数据的框架 + active = [fw for fw in FRAMEWORKS if all_metrics.get(fw)] + if not active: + return comparison - # 延迟类指标:越低越好 - lower_is_better = { - "mean_ttft", - "median_ttft", - "p80_ttft", - "p95_ttft", - "p99_ttft", - "mean_tpot", - "median_tpot", - "p80_tpot", - "p95_tpot", - "p99_tpot", - "mean_itl", - "median_itl", - "p80_itl", - "p95_itl", - "p99_itl", - "mean_e2el", - "median_e2el", - "p80_e2el", - "p95_e2el", - "p99_e2el", - "benchmark_duration", - } - - all_keys = set(fd_metrics.keys()) | set(sg_metrics.keys()) + # 收集所有指标 key + all_keys = set() + for fw in active: + all_keys |= set(all_metrics[fw].keys()) for key in sorted(all_keys): - fd_val = fd_metrics.get(key) - sg_val = sg_metrics.get(key) - - if fd_val is None or sg_val is None: + entry = {} + per_fw_val = {} + for fw in active: + val = all_metrics[fw].get(key) + if val is None: + continue + entry[fw] = val + per_fw_val[fw] = val + + if len(per_fw_val) < 2: + # 单框架数据,无法对比但仍记录 + comparison[key] = entry continue - entry = {"fd": fd_val, "sg": sg_val} - - # 计算差异百分比 (FD 相对于 SG) - if sg_val != 0: - diff_pct = round((fd_val - sg_val) / sg_val * 100, 2) - else: - diff_pct = 0 - entry["diff_pct"] = diff_pct + # 计算相对 baseline 的差异百分比 + diff_pct = {} + base_val = per_fw_val.get(baseline) + for fw, val in per_fw_val.items(): + if fw == baseline or base_val is None: + continue + if base_val != 0: + diff_pct[fw] = round((val - base_val) / base_val * 100, 2) + else: + diff_pct[fw] = 0 + if diff_pct: + entry["diff_pct"] = diff_pct # 判断胜出方 - if key in higher_is_better: - entry["winner"] = "fd" if fd_val > sg_val else "sg" - elif key in lower_is_better: - entry["winner"] = "fd" if fd_val < sg_val else "sg" + if key in HIGHER_IS_BETTER: + entry["winner"] = max(per_fw_val, key=per_fw_val.get) + elif key in LOWER_IS_BETTER: + entry["winner"] = min(per_fw_val, key=per_fw_val.get) else: entry["winner"] = "tie" @@ -184,40 +227,65 @@ def compute_comparison(fd_metrics, sg_metrics): def main(): parser = argparse.ArgumentParser(description="从 benchmark 结果提取指标并生成对比 JSON") - parser.add_argument("--fd-result", required=True, help="FastDeploy 结果文件路径") - parser.add_argument("--sg-result", required=True, help="SGLang 结果文件路径") + parser.add_argument("--fd-result", default=None, help="FastDeploy 结果文件路径") + parser.add_argument("--sg-result", default=None, help="SGLang 结果文件路径") + parser.add_argument("--vllm-result", default=None, help="vLLM 结果文件路径") parser.add_argument("--model-path", required=True, help="模型权重目录路径") parser.add_argument("--fd-config", default="{}", help="FD 部署配置 JSON 字符串") parser.add_argument("--sg-config", default="{}", help="SG 部署配置 JSON 字符串") + parser.add_argument("--vllm-config", default="{}", help="vLLM 部署配置 JSON 字符串") + parser.add_argument( + "--baseline", default="sg", choices=FRAMEWORKS, help="对比基准框架(计算 diff_pct 用),默认 sg" + ) parser.add_argument("--output", default="metrics.json", help="输出 JSON 路径") args = parser.parse_args() - print(f"[INFO] 解析 FD 结果: {args.fd_result}") - fd_metrics = parse_benchmark_result(args.fd_result) - print(f"[INFO] 解析 SG 结果: {args.sg_result}") - sg_metrics = parse_benchmark_result(args.sg_result) + # 至少需要一份结果 + if not any([args.fd_result, args.sg_result, args.vllm_result]): + parser.error("至少需要提供 --fd-result / --sg-result / --vllm-result 中的一个") + + result_paths = { + "fd": args.fd_result, + "sg": args.sg_result, + "vllm": args.vllm_result, + } + config_strs = { + "fd": args.fd_config, + "sg": args.sg_config, + "vllm": args.vllm_config, + } + framework_display = {"fd": "FastDeploy", "sg": "SGLang", "vllm": "vLLM"} + + all_metrics = {} + for fw in FRAMEWORKS: + path = result_paths[fw] + if path: + print(f"[INFO] 解析 {framework_display[fw]} 结果: {path}") + all_metrics[fw] = parse_benchmark_result(path) + else: + all_metrics[fw] = {} print(f"[INFO] 读取模型信息: {args.model_path}") model_info = get_model_info(args.model_path) - print("[INFO] 计算对比指标...") - comparison = compute_comparison(fd_metrics, sg_metrics) + print(f"[INFO] 计算对比指标 (baseline={args.baseline})...") + comparison = compute_comparison(all_metrics, baseline=args.baseline) # 解析部署配置 - fd_config = json.loads(args.fd_config) if args.fd_config else {} - sg_config = json.loads(args.sg_config) if args.sg_config else {} + configs = {} + for fw in FRAMEWORKS: + try: + configs[fw] = json.loads(config_strs[fw]) if config_strs[fw] else {} + except json.JSONDecodeError as e: + print(f"[WARN] 解析 --{fw}-config 失败: {e}", file=sys.stderr) + configs[fw] = {} output = { "model": model_info, - "config": { - "fd": fd_config, - "sg": sg_config, - }, - "raw_metrics": { - "fd": fd_metrics, - "sg": sg_metrics, - }, + "config": configs, + "raw_metrics": all_metrics, "comparison": comparison, + "baseline": args.baseline, } with open(args.output, "w") as f: @@ -236,14 +304,29 @@ def main(): "mean_decode", "benchmark_duration", ] + active = [fw for fw in FRAMEWORKS if all_metrics.get(fw)] + if not active: + print("[WARN] 没有任何有效的结果数据") + return + print("\n========== 核心指标摘要 ==========") - print(f"{'Metric':<30} {'FD':>12} {'SG':>12} {'Diff%':>8} {'Winner':>8}") - print("-" * 72) + header = f"{'Metric':<30}" + for fw in active: + header += f" {framework_display[fw]:>12}" + header += f" {'Winner':>10}" + print(header) + print("-" * len(header)) for key in key_metrics: - if key in comparison: - c = comparison[key] - print(f"{key:<30} {c['fd']:>12.2f} {c['sg']:>12.2f} {c['diff_pct']:>+7.1f}% {c['winner']:>8}") - print("=" * 72) + if key not in comparison: + continue + c = comparison[key] + line = f"{key:<30}" + for fw in active: + val = c.get(fw) + line += f" {val:>12.2f}" if isinstance(val, (int, float)) else f" {'-':>12}" + line += f" {c.get('winner', '-'):>10}" + print(line) + print("=" * len(header)) if __name__ == "__main__": diff --git a/.claude/skills/benchmark-compare/scripts/generate_report.py b/.claude/skills/benchmark-compare/scripts/generate_report.py index d93444f443b..b1afd542720 100644 --- a/.claude/skills/benchmark-compare/scripts/generate_report.py +++ b/.claude/skills/benchmark-compare/scripts/generate_report.py @@ -86,8 +86,8 @@ def parse_benchmark_log(filepath): def scan_log_dir(log_dir): """扫描日志目录,自动识别场景并提取指标 - 文件命名约定: *_bs_[_].txt - 例如: GLM-4.7-Flash_long_bs32_fd.txt, GLM-4.7-Flash_long_bs512_fp8_sg.txt + 文件命名约定: *_bs_[_].txt + 例如: GLM-4.7-Flash_long_bs32_fd.txt, GLM-4.7-Flash_long_bs512_fp8_vllm.txt """ data = {} if not os.path.isdir(log_dir): @@ -101,11 +101,11 @@ def scan_log_dir(log_dir): filepath = os.path.join(root, fname) # 尝试从文件名解析场景信息 - # 格式: *_bs_[_].txt - m = re.search(r"_bs(\d+)_(?:(fp8|bf16|wint4|wint8)_)?(fd|sg)\.txt$", fname, re.IGNORECASE) + # 格式: *_bs_[_].txt + m = re.search(r"_bs(\d+)_(?:(fp8|bf16|wint4|wint8)_)?(fd|sg|vllm)\.txt$", fname, re.IGNORECASE) if not m: # 也尝试无 quant 的模式 (默认 bf16) - m = re.search(r"_bs(\d+)_(fd|sg)\.txt$", fname, re.IGNORECASE) + m = re.search(r"_bs(\d+)_(fd|sg|vllm)\.txt$", fname, re.IGNORECASE) if m: bs = m.group(1) quant = "bf16" @@ -826,11 +826,18 @@ def main(): parser.add_argument("--max-model-len", type=int, default=65536, help="最大模型长度") parser.add_argument("--fd-attention", default="MLA_ATTN (FlashAttn v3)", help="FD Attention Backend") parser.add_argument("--sg-attention", default="flashmla", help="SG Attention Backend") + parser.add_argument("--vllm-attention", default="flash-attn", help="vLLM Attention Backend") parser.add_argument("--sg-version", default="", help="SGLang 版本") + parser.add_argument("--vllm-version", default="", help="vLLM 版本") parser.add_argument("--fd-commit-date", default="", help="FD commit 日期") parser.add_argument("--fd-commit-short", default="", help="FD commit 短 hash") parser.add_argument("--fd-commit-full", default="", help="FD commit 完整 hash") + # 框架选择(三选二对比) + parser.add_argument( + "--frameworks", default="fd,sg", help="对比哪两个框架,逗号分隔(如 vllm,sg 或 fd,vllm),默认 fd,sg" + ) + # 显示配置 parser.add_argument("--default-quant", default="bf16", help="默认量化选择") parser.add_argument("--default-bs", default="512", help="默认并发选择") @@ -853,21 +860,40 @@ def main(): print("[ERROR] 未找到有效的 benchmark 数据", file=sys.stderr) sys.exit(1) - # 过滤掉不完整的场景(缺少 fd 或 sg) + # 解析 --frameworks,决定对比哪两个框架;把它们映射到现有 HTML 模板的 fd/sg 槽位 + fw_list = [x.strip().lower() for x in args.frameworks.split(",") if x.strip()] + if len(fw_list) != 2 or any(x not in ("fd", "sg", "vllm") for x in fw_list): + print(f"[ERROR] --frameworks 必须为 fd/sg/vllm 中的两个,逗号分隔,得到: {args.frameworks}", file=sys.stderr) + sys.exit(1) + left_fw, right_fw = fw_list[0], fw_list[1] + + framework_display = {"fd": "FastDeploy", "sg": "SGLang", "vllm": "vLLM"} + # framework_attn_key = {"fd": "fd_attention", "sg": "sg_attention", "vllm": "vllm_attention"} + framework_attn_val = { + "fd": args.fd_attention, + "sg": args.sg_attention, + "vllm": args.vllm_attention, + } + framework_version = {"fd": "", "sg": args.sg_version, "vllm": args.vllm_version} + + # 把所选两个框架的数据映射到 fd/sg 槽位 (left→fd, right→sg) valid_data = {} for key, val in benchmark_data.items(): - if "fd" in val and "sg" in val and val["fd"] and val["sg"]: - valid_data[key] = val + left_val = val.get(left_fw) + right_val = val.get(right_fw) + if left_val and right_val: + valid_data[key] = {"fd": left_val, "sg": right_val} else: - print(f"[WARN] 场景 {key} 数据不完整,跳过", file=sys.stderr) + print(f"[WARN] 场景 {key} 缺少 {left_fw} 或 {right_fw} 数据,跳过", file=sys.stderr) if not valid_data: - print("[ERROR] 没有完整的对比场景数据", file=sys.stderr) + print(f"[ERROR] 没有完整的 {left_fw} vs {right_fw} 对比场景数据", file=sys.stderr) sys.exit(1) + print(f"[INFO] 对比框架: {framework_display[left_fw]} vs {framework_display[right_fw]}") print(f"[INFO] 有效场景: {', '.join(sorted(valid_data.keys()))}") - # 构建配置 + # 构建配置(fd 槽=left_fw, sg 槽=right_fw) config = { "model_name": args.model_name, "model_type": args.model_type, @@ -879,12 +905,12 @@ def main(): "dp_size": args.dp, "ep_size": args.ep, "max_model_len": args.max_model_len, - "fd_attention": args.fd_attention, - "sg_attention": args.sg_attention, - "sg_version": args.sg_version, - "fd_commit_date": args.fd_commit_date, - "fd_commit_short": args.fd_commit_short, - "fd_commit_full": args.fd_commit_full, + "fd_attention": framework_attn_val[left_fw], + "sg_attention": framework_attn_val[right_fw], + "sg_version": framework_version[right_fw], + "fd_commit_date": args.fd_commit_date if left_fw == "fd" else "", + "fd_commit_short": args.fd_commit_short if left_fw == "fd" else "", + "fd_commit_full": args.fd_commit_full if left_fw == "fd" else "", "default_quant": args.default_quant, "default_bs": args.default_bs, "test_date": args.test_date, @@ -895,6 +921,21 @@ def main(): # 生成 HTML html = generate_html(valid_data, config) + # 把模板里的 "FastDeploy"/"SGLang" 文本标签替换为所选框架名 + # 注意:CSS 类名 .fd / .sg / fd-c / sg-c 等保持不变(只是颜色样式) + left_name = framework_display[left_fw] + right_name = framework_display[right_fw] + if left_name != "FastDeploy" or right_name != "SGLang": + # 用 placeholder 中转避免 FastDeploy→X 后再被 SGLang 替换误伤 + html = html.replace("FastDeploy", "__LEFT_FW__") + html = html.replace("SGLang", "__RIGHT_FW__") + html = html.replace("__LEFT_FW__", left_name) + html = html.replace("__RIGHT_FW__", right_name) + # FD 优势 / FD 文本也替换 + html = html.replace("FD 优势", f"{left_name} 优势") + html = html.replace(">FD<", f">{left_name}<") + html = html.replace(">SG<", f">{right_name}<") + with open(args.output, "w", encoding="utf-8") as f: f.write(html) diff --git a/.claude/skills/benchmark-compare/scripts/launch_service.sh b/.claude/skills/benchmark-compare/scripts/launch_service.sh index dae50f67d55..ba4eee30dea 100644 --- a/.claude/skills/benchmark-compare/scripts/launch_service.sh +++ b/.claude/skills/benchmark-compare/scripts/launch_service.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # launch_service.sh — 通用推理框架服务启动脚本 -# 支持 FastDeploy / SGLang,支持单卡/多卡 TP/DP/EP/PD 分离模式 +# 支持 FastDeploy / SGLang / vLLM,支持单卡/多卡 TP/DP/EP/PD 分离模式 set -euo pipefail # ============================================================ @@ -28,7 +28,7 @@ usage() { 用法: bash launch_service.sh [OPTIONS] 必需参数: - --framework 推理框架 (fd=FastDeploy, sg=SGLang) + --framework 推理框架 (fd=FastDeploy, sg=SGLang, vllm=vLLM) --model 模型权重路径 --port 服务端口 --gpus CUDA_VISIBLE_DEVICES (如 "0" 或 "0,1,2,3,4,5,6,7") @@ -40,6 +40,7 @@ usage() { --ep expert-parallel-size, MoE 模型专用 (默认: 0, 不启用) FD: 映射为 --enable-expert-parallel (EP=TP×DP 隐式) SG: 映射为 --ep-size N + vLLM: 映射为 --enable-expert-parallel --concurrency max-num-seqs / max-running-requests (默认: 32) --max-model-len 最大序列长度 (默认: 65536) --quantization 量化方式: none|block_wise_fp8|fp8|wint4|wint8 (默认: none) @@ -61,6 +62,10 @@ usage() { # TP=4 + DP=2 + EP=8 启动 SGLang (MoE, 8卡) bash launch_service.sh --framework sg --model /path/to/model --port 8280 \ --gpus 0,1,2,3,4,5,6,7 --tp 4 --dp 2 --ep 8 --venv /path/to/sglang_env/.venv + + # 单卡启动 vLLM + bash launch_service.sh --framework vllm --model /path/to/model --port 8380 \ + --gpus 2 --venv /path/to/vllm_env/.venv EOF exit "${1:-0}" } @@ -94,8 +99,8 @@ if [[ -z "$FRAMEWORK" || -z "$MODEL" || -z "$PORT" || -z "$GPUS" || -z "$VENV" ] usage 1 fi -if [[ "$FRAMEWORK" != "fd" && "$FRAMEWORK" != "sg" ]]; then - echo "错误: --framework 必须为 fd 或 sg" +if [[ "$FRAMEWORK" != "fd" && "$FRAMEWORK" != "sg" && "$FRAMEWORK" != "vllm" ]]; then + echo "错误: --framework 必须为 fd / sg / vllm" exit 1 fi @@ -267,12 +272,74 @@ launch_sglang() { echo "[INFO] SGLang PID: $! (已写入 /tmp/sg_pid_${PORT})" } +# ============================================================ +# 启动 vLLM +# ============================================================ +launch_vllm() { + echo "[INFO] 启动 vLLM 服务..." + echo " 模型: $MODEL" + echo " 端口: $PORT" + echo " GPU: $GPUS (TP=$TP, DP=$DP, EP=$EP)" + echo " 并发: $CONCURRENCY" + echo " 量化: $QUANTIZATION" + echo " 日志: $LOG_FILE" + + source "$VENV/bin/activate" + + export CUDA_VISIBLE_DEVICES="$GPUS" + + # DP 模式下设置 MASTER_PORT 避免冲突 + if [[ "$DP" -gt 1 ]]; then + export VLLM_MASTER_PORT=${VLLM_MASTER_PORT:-46000} + echo "[INFO] DP=$DP, 设置 VLLM_MASTER_PORT=$VLLM_MASTER_PORT 避免端口冲突" + fi + + # 构建命令 + local CMD="python -m vllm.entrypoints.openai.api_server" + CMD+=" --model $MODEL" + CMD+=" --host 0.0.0.0" + CMD+=" --port $PORT" + CMD+=" --tensor-parallel-size $TP" + CMD+=" --max-model-len $MAX_MODEL_LEN" + CMD+=" --max-num-seqs $CONCURRENCY" + CMD+=" --gpu-memory-utilization $GPU_MEM_UTIL" + CMD+=" --trust-remote-code" + + # DP (data parallelism) + if [[ "$DP" -gt 1 ]]; then + CMD+=" --data-parallel-size $DP" + fi + + # EP (expert parallelism) + if [[ "$EP" -gt 0 ]]; then + CMD+=" --enable-expert-parallel" + fi + + # 量化(vLLM 用 fp8 / awq / gptq 等;映射 FD 的 block_wise_fp8 → fp8) + if [[ "$QUANTIZATION" != "none" ]]; then + local VQ="$QUANTIZATION" + [[ "$VQ" == "block_wise_fp8" ]] && VQ="fp8" + CMD+=" --quantization $VQ" + fi + + # 额外参数 + if [[ -n "$EXTRA_ARGS" ]]; then + CMD+=" $EXTRA_ARGS" + fi + + echo "[INFO] 执行: $CMD" + nohup bash -c "$CMD" > "$LOG_FILE" 2>&1 & + echo $! > "/tmp/vllm_pid_${PORT}" + echo "[INFO] vLLM PID: $! (已写入 /tmp/vllm_pid_${PORT})" +} + # ============================================================ # 主入口 # ============================================================ case "$FRAMEWORK" in - fd) launch_fastdeploy ;; - sg) launch_sglang ;; + fd) launch_fastdeploy ;; + sg) launch_sglang ;; + vllm) launch_vllm ;; esac echo "[INFO] 服务已在后台启动,请使用 health_check.sh 等待就绪"