Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,29 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

# NOTE(review): AMDGCN_USE_BUFFER_OPS is exported twice in this section (0 here,
# 1 further down). If both lines are kept, the later export wins. This line looks
# like the pre-change value carried over from a diff view — confirm and keep one.
export AMDGCN_USE_BUFFER_OPS=0
# --- AITER backend optimizations (env-var tuning) ---
# Everything exported below is inherited by the `vllm serve` process started
# later in this script. What each variable does is defined by vLLM / AITER /
# ROCm, not by this script; check their documentation before changing a value.
export VLLM_ROCM_USE_AITER=1
export VLLM_USE_ROCM_AITER_MXFP4=1
export VLLM_ROCM_USE_AITER_PAGED_ATTN=1
export VLLM_ROCM_USE_AITER_LINEAR=1
export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=1
export VLLM_ROCM_USE_AITER_TRITON_GEMM=0
export VLLM_ROCM_MOE_PADDING=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
export AITER_BF16_FP8_BOUND=0
export AITER_USE_OPUS_MOE_SORTING=1
export AITER_USE_NT=0
export AMDGCN_USE_BUFFER_OPS=1
export CK_MXFP4_MOE_DIM_ALIGNMENT=64
export GPU_MAX_HW_QUEUES=4

# Extra `vllm serve` options kept as plain strings. They are expanded unquoted
# on the command line, so the shell splits each string on whitespace into
# separate arguments.
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

# --- Speculative decoding (06/02 — n-gram prompt lookup, lossless) ---
# Flag plus a JSON value in one string. The JSON contains no whitespace, so the
# unquoted $SPEC_DECODE expansion (with default IFS) yields exactly two
# arguments: the flag and the JSON object. Adding a space inside the JSON would
# split it into more arguments; switch to a bash array if this needs to grow.
SPEC_DECODE="--speculative-config {\"method\":\"ngram\",\"num_speculative_tokens\":3,\"prompt_lookup_min\":2,\"prompt_lookup_max\":64}"

SERVER_LOG=/workspace/server.log

if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -53,10 +69,13 @@ set -x
# Start the vLLM server in the background with stdout and stderr redirected to
# $SERVER_LOG. $MODEL, $PORT, $TP and $MAX_MODEL_LEN are set outside this
# section. $ATTN_BACKEND, $FUSE_ROPE_KVCACHE and $SPEC_DECODE are expanded
# unquoted, so the shell splits each into separate arguments.
# NOTE(review): as shown here the command contains what look like both the old
# and the new lines of a diff. --gpu-memory-utilization appears twice (0.95,
# then 0.97), and the first "--no-enable-prefix-caching > $SERVER_LOG 2>&1 &"
# line has no trailing backslash, so it ends the command; the two lines after
# it would then run as a separate, invalid command. Presumably only the 0.97
# line and the final two-line ending belong in the script — confirm against the
# real file before running.
vllm serve $MODEL --port $PORT \
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--gpu-memory-utilization 0.97 \
--max-model-len $MAX_MODEL_LEN \
--max-num-seqs 256 \
--max-num-batched-tokens 16384 \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &
--no-enable-prefix-caching \
$SPEC_DECODE > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3430,3 +3430,11 @@
- "Image: vllm/vllm-openai:v0.20.1"
- "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652

- config-keys:
- gptoss-fp4-mi355x-vllm
description:
- "Enable n-gram speculative decoding (prompt-lookup, num_speculative_tokens=3) for 3.26x decode throughput improvement"
- "Add full AITER env-var tuning: MXFP4, FP4 ASM GEMM, unified paged attention, inductor graph partition, opus MoE sorting"
- "Set gpu-memory-utilization=0.97, max-num-seqs=256, max-num-batched-tokens=16384, GPU_MAX_HW_QUEUES=4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1657