Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,22 @@ qwen3.5-fp4-mi355x-sglang:
- { tp: 2, conc-start: 4, conc-end: 256 }
- { tp: 4, conc-start: 4, conc-end: 16 }

# target
# Agentic-coding sweep for Qwen3.5 MXFP4 on mi355x with SGLang. The two
# search-space rows are identical except for `offloading` (none vs hicache),
# so every concurrency point has a no-offload baseline to compare against.
qwen3.5-fp4-mi355x-sglang-agentic-hicache:
# NOTE(review): the other sglang hicache entries in this file pull
# lmsysorg/sglang-rocm "post1" dated builds; confirm that this
# lmsysorg/sglang tag is the intended image.
image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
model: amd/Qwen3.5-397B-A17B-MXFP4
model-prefix: qwen3.5
runner: mi355x
precision: fp4
framework: sglang
multinode: false
scenarios:
agentic-coding:
# duration is presumably in seconds - TODO confirm against the harness.
- duration: 1800
search-space:
- { tp: 2, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
- { tp: 2, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }

qwen3.5-fp4-mi355x-atom:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
model: amd/Qwen3.5-397B-A17B-MXFP4
Expand Down Expand Up @@ -872,6 +888,22 @@ minimaxm2.5-fp4-mi355x-atom:
- { tp: 4, conc-start: 4, conc-end: 128 }
- { tp: 8, conc-start: 4, conc-end: 16 }

# target
# Agentic-coding sweep for MiniMax-M2.5 MXFP4 on mi355x with vLLM. Both rows
# use tp=1, ep=1 and the same conc-list; only `offloading` differs
# (none vs lmcache), giving a like-for-like baseline at each concurrency.
minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
image: vllm/vllm-openai-rocm:v0.22.0
model: amd/MiniMax-M2.5-MXFP4
model-prefix: minimaxm2.5
runner: mi355x
precision: fp4
framework: vllm
multinode: false
scenarios:
agentic-coding:
# duration is presumably in seconds - TODO confirm against the harness.
- duration: 1800
search-space:
- { tp: 1, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
- { tp: 1, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }

minimaxm2.5-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.22.0
model: amd/MiniMax-M2.5-MXFP4
Expand Down Expand Up @@ -2494,6 +2526,23 @@ glm5.1-fp4-mi355x-sglang-agentic:
# sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }

# target
# Agentic-coding sweep for GLM-5.1 MXFP4 on mi355x with SGLang. Both rows use
# tp=2, ep=1 and the same conc-list; only `offloading` differs
# (none vs hicache).
glm5.1-fp4-mi355x-sglang-agentic-hicache:
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
model: amd/GLM-5.1-MXFP4
model-prefix: glm5.1
runner: mi355x
precision: fp4
framework: sglang
multinode: false
scenarios:
agentic-coding:
- duration: 1800
search-space:
# NOTE(review): the previous comment here ("caps at tp=4 conc=16 in fixed-seq,
# so cap conservatively") was carried over from the tp=4
# glm5.1-fp4-mi355x-sglang-agentic entry. These rows run tp=2 and sweep up to
# conc=48, so that rationale does not describe them - confirm the intended cap.
- { tp: 2, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
- { tp: 2, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48] }

kimik2.5-fp4-mi355x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
model: amd/Kimi-K2.5-MXFP4
Expand All @@ -2518,6 +2567,22 @@ kimik2.5-fp4-mi355x-vllm-agentic:
- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
- { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] }

# target
# Agentic-coding sweep for Kimi-K2.5 MXFP4 on mi355x with vLLM. Both rows use
# tp=4, ep=1 and the same conc-list; only `offloading` differs
# (none vs lmcache).
kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
image: vllm/vllm-openai-rocm:v0.22.0
model: amd/Kimi-K2.5-MXFP4
model-prefix: kimik2.5
runner: mi355x
precision: fp4
framework: vllm
multinode: false
scenarios:
agentic-coding:
# duration is presumably in seconds - TODO confirm against the harness.
- duration: 1800
search-space:
- { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
- { tp: 4, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }

minimaxm2.5-fp8-mi355x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
model: MiniMaxAI/MiniMax-M2.5
Expand All @@ -2536,6 +2601,25 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
- { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
- { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] }

# target
# Agentic-coding sweep for MiniMax-M2.5 (fp8) on mi355x with vLLM. Both rows
# use tp=2, ep=1 and the same conc-list; only `offloading` differs
# (none vs lmcache).
minimaxm2.5-fp8-mi355x-vllm-agentic-lmcache:
image: vllm/vllm-openai-rocm:v0.22.0
model: MiniMaxAI/MiniMax-M2.5
model-prefix: minimaxm2.5
runner: mi355x
precision: fp8
framework: vllm
multinode: false
scenarios:
agentic-coding:
# NOTE(review): the notes previously here (tp=4 ep=4 compute ceiling ~60,
# KV cliff ~91, cpu offload via the native OffloadingConnector) describe a
# tp=4/ep=4 cpu-offload sweep. The rows below run tp=2, ep=1 with
# `offloading: lmcache`, so those figures and the connector remark may not
# apply - confirm and replace with numbers measured for this configuration.
- duration: 1800
search-space:
- { tp: 2, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }
- { tp: 2, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48] }

minimaxm2.5-fp8-mi300x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
model: MiniMaxAI/MiniMax-M2.5
Expand Down Expand Up @@ -2573,8 +2657,9 @@ minimaxm2.5-fp8-mi325x-vllm-agentic:
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
- { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }

# target
qwen3.5-fp8-mi355x-sglang-agentic-hicache:
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi355x
Expand All @@ -2585,8 +2670,8 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache:
agentic-coding:
- duration: 1800
search-space:
- { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
- { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
- { tp: 4, ep: 1, offloading: none, conc-list: [8, 16, 32, 40, 48, 56, 72] }
- { tp: 4, ep: 1, offloading: hicache, conc-list: [8, 16, 32, 40, 48, 56, 72] }

dsv4-fp4-mi355x-vllm-agentic:
image: vllm/vllm-openai-rocm:v0.22.0
Expand Down
96 changes: 87 additions & 9 deletions benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,29 @@
set -euo pipefail
set -x

# Agentic trace replay benchmark for GLM-5.1 FP4 on MI355X using SGLang.
#
# NOTE(review): this header was adapted from the Qwen3.5 FP8 MI300X script.
# The base recipe it cited (benchmarks/single_node/qwen3.5_fp8_mi300x.sh:
# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75) does not
# match this script's launch command, which uses --mem-fraction-static 0.85
# and the tilelang NSA prefill/decode backends. Confirm the intended reference.
# The agentic harness (resolve_trace_source / build_replay_cmd /
# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
#
# Required env vars:
# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, EP_SIZE, DP_ATTENTION
#
# Optional env vars:
# DURATION (defaults to 1800), PORT (defaults to 8888)
#
# OFFLOADING values:
# none - SGLang GPU KV with the default RadixAttention prefix cache.
# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Validate required inputs, then fill in defaults for the optional ones.
# check_env_vars comes from benchmark_lib.sh; presumably it aborts when any
# listed variable is unset - verify against the library.
# NOTE(review): EP_SIZE and DP_ATTENTION are required here, but the visible
# parts of this script never pass them to sglang.launch_server - confirm
# whether they should be forwarded or dropped from the required list.
check_env_vars MODEL TP CONC RESULT_DIR DURATION
check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION

# An unset, empty, or "0" MAX_MODEL_LEN falls back to a 131072-token context.
if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
MAX_MODEL_LEN=131072
fi
PORT=${PORT:-8888}
# DURATION is optional in practice: it is defaulted here.
DURATION=${DURATION:-1800}
# EP_SIZE is already listed as required above, so this default only matters
# if that check is relaxed.
EP_SIZE=${EP_SIZE:-1}

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
Expand All @@ -30,8 +41,16 @@ else
hf download "$MODEL"
export MODEL_PATH="$MODEL"
fi

rocm-smi || true
amd-smi || true
# ---- Select trace corpus ----------------------------------------------------
# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
# unfiltered 052726 corpus, whose ~1M-token traces get rejected and add no perf
# signal at high concurrency.
# Previous loader, superseded by the 060226 256k corpus selected below:
#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k

# ---- Resolve traces and install deps ----------------------------------------
resolve_trace_source
Expand All @@ -48,26 +67,85 @@ mkdir -p "$RESULT_DIR"

pip install -U transformers

# ---- Select KV-cache offloading mode ----------------------------------------
# Inputs : OFFLOADING (none | hicache), TP, CONC, TOTAL_CPU_DRAM_GB, and the
#          optional HICACHE_* overrides read below.
# Outputs: CACHE_ARGS / WARMUP_ARGS - extra flags for sglang.launch_server;
#          CUDA_GRAPH_MAX_BS       - batch-size cap for graph capture.
CACHE_ARGS=()
WARMUP_ARGS=()
CUDA_GRAPH_MAX_BS="$CONC"
case "$OFFLOADING" in
    none)
        # Leave SGLang's default RadixAttention prefix cache on; agentic
        # replay needs it, and --disable-radix-cache would zero the hit rate.
        ;;
    hicache)
        # GLM-5.1 FP4 uses a standard transformer (no hybrid Mamba path),
        # so one HiCache host pool per TP rank is sufficient.
        # The node-total DRAM budget divides by TP and host-pool count.
        #
        # Honor the caller-supplied TOTAL_CPU_DRAM_GB. It used to be
        # overwritten with 3000 here, which silently discarded the required
        # env var; 3000 is now only the fallback when it is unset or empty.
        TOTAL_CPU_DRAM_GB="${TOTAL_CPU_DRAM_GB:-3000}"
        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-1}"
        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-500}}"
        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
        # GLM-5.1 uses standard paged attention (no no_buffer scheduler
        # constraint), so no larger page size is forced; page size 1 is
        # passed explicitly. Keep the safer direct/layer_first copy path
        # on ROCm.
        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"

        # Per-rank, per-pool size: an explicit HICACHE_SIZE_GB wins, otherwise
        # split the node budget evenly. Either way, clamp to the per-pool max.
        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
        fi
        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
            exit 1
        fi
        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
        CACHE_ARGS=(
            --page-size "$HICACHE_PAGE_SIZE"
            --enable-hierarchical-cache
            --hicache-size "$HICACHE_SIZE_GB"
            --hicache-io-backend "$HICACHE_IO_BACKEND"
            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
            --hicache-write-policy "$HICACHE_WRITE_POLICY"
        )
        # HiCache startup reaches API readiness but SGLang's internal warmup
        # request can time out on this path; let aiperf own benchmark traffic.
        WARMUP_ARGS=(--skip-server-warmup)
        # Don't force ROCm graph capture at every high concurrency point;
        # conc=16 is the highest known-good capture size for this
        # model/server path. The cap only ever lowers CUDA_GRAPH_MAX_BS.
        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
        fi
        ;;
    *)
        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
        exit 1
        ;;
esac

echo "Starting SGLang server..."
export PYTHONNOUSERSITE=1

pip install -U transformers
# Launch the SGLang server in the background, sending stdout/stderr to
# $SERVER_LOG, and record its PID for the readiness check that follows.
#
# --cuda-graph-max-bs takes CUDA_GRAPH_MAX_BS rather than CONC: the offloading
# setup above lowers that value for HiCache runs (known-good capture size),
# and passing $CONC here ignored the cap. It falls back to CONC if the
# variable was never set. --max-running-requests stays at CONC so the
# requested concurrency is still admitted.
# CACHE_ARGS / WARMUP_ARGS are arrays filled in by the offloading setup and
# may be empty.
python3 -m sglang.launch_server \
    --model-path "$MODEL_PATH" \
    --served-model-name "$MODEL" \
    --host=0.0.0.0 \
    --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --trust-remote-code \
    --cuda-graph-max-bs "${CUDA_GRAPH_MAX_BS:-$CONC}" \
    --max-running-requests "$CONC" \
    --context-length "$MAX_MODEL_LEN" \
    --mem-fraction-static 0.85 \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
    --nsa-prefill-backend tilelang \
    --nsa-decode-backend tilelang \
    --watchdog-timeout 1200 \
    --kv-cache-dtype fp8_e4m3 \
    --tokenizer-worker-num "$((TP * 2))" \
    "${CACHE_ARGS[@]}" \
    "${WARMUP_ARGS[@]}" \
    --enable-metrics > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"
Expand All @@ -77,4 +155,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
Loading