vngcloud · thangquang09 · Jun 30, 2026
diff --git a/.claude/skills/agentic-replay-run/SKILL.md b/.claude/skills/agentic-replay-run/SKILL.md
@@ -0,0 +1,61 @@
+---
+name: agentic-replay-run
+description: Run the supported InferenceX Weka coding benchmarks through agentic-replay/AIPerf.
+---
+
+# Agentic-replay Weka
+
+Only one path is supported: the `agentic-replay` scenario with `custom-dataset-type: weka_trace` and `benchmark-client: [aiperf]`, run on `h200-greennode_01`.
+
+Before running:
+
+1. Ask the user to choose the runner. Recommend `h200-greennode_01` (the supported runner) unless they have already named a different one.
+2. Ask whether to enable DCGM/GPU metrics. Recommend enabled.
+3. Ask for the run duration. Offer a smoke run (`90s`) or a full run (`>=900s`); recommend the full run for real benchmark results.
+4. Ask the user to choose the GitHub Actions run title. Recommend a date-first title like `YYYY/MM/DD GLM-5.2 FP8 8xH200 sglang0.5.14 Agentic Replay Weka CCU4-8-12-16 hicache128`.
+5. Ask the user to confirm the model run settings before dispatch. Show the selected config key, image, runner, benchmark script, dataset source, `tp`, `ep` if present, concurrency, duration, `max-model-len`, DCGM setting, and any cache flags.
+6. Do not dispatch or start a model run until the user confirms the title and settings.
+
+Dispatch hygiene:
+
+- Create a temporary `exp/...` branch for benchmark-specific config edits, push it, and dispatch against that branch.
+- Keep `dev` clean: do not commit benchmark-specific config edits to it. If `main` later becomes the base branch, treat it the same way.
+
+Known-good configs:
+
+- `minimaxm2.5-weka-fp8-h200-greennode-sglang-smoke`
+- `glm5.2-weka-fp8-h200-greennode-sglang-smoke`
+
+Dataset source:
+
+- Omit `input-file` and `public-dataset` for the default public SemiAnalysis Weka dataset: `semianalysis_cc_traces_weka_with_subagents_060826`.
+- Set `public-dataset` for another public Weka dataset.
+- Set `input-file` for internal MiniMax Weka-v4/local files.
+- Do not use `no-fixed-schedule`, warmup, request-count, think-time, or strip-delay fields.
+
+Required local pieces:
+
+- `utils/aiperf-mooncake` submodule
+- `benchmarks/single_node/minimaxm2.5-weka_fp8_h200_sglang.sh`
+- `benchmarks/single_node/glm5.2-ep8-deepep_fp8_h200_sglang.sh`
+- `runners/launch_h200-greennode.sh`
+
+Validate before dispatch:
+
+```bash
+python3 -c "import yaml; yaml.safe_load(open('.github/configs/nvidia-master.yaml'))"
+bash -n benchmarks/single_node/minimaxm2.5-weka_fp8_h200_sglang.sh
+bash -n benchmarks/single_node/glm5.2-ep8-deepep_fp8_h200_sglang.sh
+bash -n runners/launch_h200-greennode.sh
+uv run python utils/matrix_logic/generate_sweep_configs.py test-config \
+  --config-files .github/configs/nvidia-master.yaml \
+  --config-keys minimaxm2.5-weka-fp8-h200-greennode-sglang-smoke
+uv run python utils/matrix_logic/generate_sweep_configs.py test-config \
+  --config-files .github/configs/nvidia-master.yaml \
+  --config-keys glm5.2-weka-fp8-h200-greennode-sglang-smoke
+```
+
+Successful reference runs:
+
+- MiniMax-M2.5: https://github.com/vngcloud/InferenceX/actions/runs/28376099323
+- GLM-5.2-FP8: https://github.com/vngcloud/InferenceX/actions/runs/28279462408
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -9484,3 +9484,37 @@ qwen3.5-fp8-h100-sglang-mtp:
       osl: 1024
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+
+minimaxm2.5-weka-fp8-h200-greennode-sglang-smoke:  # MiniMax-M2.5 FP8 agentic-replay (Weka trace) entry for the h200-greennode_01 runner
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5-weka  # NOTE(review): presumably maps to benchmarks/single_node/minimaxm2.5-weka_fp8_h200_sglang.sh — confirm
+  runner: h200-greennode_01
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-replay:  # no input-file / public-dataset set; SKILL.md says that selects the default public SemiAnalysis Weka dataset
+    - custom-dataset-type: weka_trace  # passed through the workflow as AIPerf --custom-dataset-type
+      max-model-len: 196608
+      benchmark-client: [aiperf]  # non-native client; the workflow appends the client name to RESULT_FILENAME
+      duration: 300  # NOTE(review): presumably seconds — confirm against the benchmark script
+      search-space:
+      - { tp: 8, ep: 8, conc-list: [1] }  # single search point; conc-list presumably lists client concurrencies — confirm
+
+glm5.2-weka-fp8-h200-greennode-sglang-smoke:  # GLM-5.2 FP8 agentic-replay (Weka trace) entry for the h200-greennode_01 runner
+  image: lmsysorg/sglang:v0.5.14-cu130
+  model: zai-org/GLM-5.2-FP8
+  model-prefix: glm5.2-ep8-deepep  # NOTE(review): differs from the config key; presumably maps to benchmarks/single_node/glm5.2-ep8-deepep_fp8_h200_sglang.sh — confirm
+  runner: h200-greennode_01
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-replay:  # no input-file / public-dataset set; SKILL.md says that selects the default public SemiAnalysis Weka dataset
+    - custom-dataset-type: weka_trace  # passed through the workflow as AIPerf --custom-dataset-type
+      max-model-len: 196608
+      benchmark-client: [aiperf]  # non-native client; the workflow appends the client name to RESULT_FILENAME
+      duration: 300  # NOTE(review): presumably seconds — confirm against the benchmark script
+      search-space:
+      - { tp: 8, ep: 8, conc-list: [2] }  # single search point; conc-list presumably lists client concurrencies — confirm
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -20,6 +20,10 @@ on:
       framework:
         required: true
         type: string
+      benchmark-client:  # load-generator client; exported to the job as BENCHMARK_CLIENT
+        required: false
+        type: string
+        default: 'inferencex_native'  # the native client adds no client suffix to RESULT_FILENAME
       exp-name:
         required: true
         type: string
@@ -97,6 +101,26 @@ on:
         required: false
         type: string
         default: '1800'
+      input-file:  # exported as INPUT_FILE; per SKILL.md, leave empty (with public-dataset) for the default public Weka dataset
+        description: "Repo-relative trace JSONL for agentic-replay (format given by custom-dataset-type)"
+        required: false
+        type: string
+        default: ''
+      public-dataset:  # exported as PUBLIC_DATASET
+        description: "AIPerf --public-dataset for agentic-replay Weka"
+        required: false
+        type: string
+        default: ''
+      custom-dataset-type:  # exported as CUSTOM_DATASET_TYPE
+        description: "AIPerf --custom-dataset-type for agentic-replay (e.g. weka_trace, mooncake_trace)"
+        required: false
+        type: string
+        default: ''
+      tokenizer:  # exported as TOKENIZER
+        description: "AIPerf --tokenizer (HF id); empty => default to the served model"
+        required: false
+        type: string
+        default: ''
 env:
   RANDOM_RANGE_RATIO: 0.8
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -109,6 +133,7 @@ env:
   MAX_MODEL_LEN: ${{ inputs.max-model-len }}
   IMAGE: ${{ inputs.image }}
   FRAMEWORK: ${{ inputs.framework }}
+  BENCHMARK_CLIENT: ${{ inputs.benchmark-client || 'inferencex_native' }}  # callers may pass '' when the matrix entry has no benchmark-client; fall back to the declared default
   PRECISION: ${{ inputs.precision }}
   TP: ${{ inputs.tp }}
   EP_SIZE: ${{ inputs.ep }}
@@ -126,6 +151,10 @@ env:
   OFFLOADING: ${{ inputs.offloading }}
   TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
   DURATION: ${{ inputs.duration }}
+  INPUT_FILE: ${{ inputs.input-file }}
+  PUBLIC_DATASET: ${{ inputs.public-dataset }}
+  CUSTOM_DATASET_TYPE: ${{ inputs.custom-dataset-type }}
+  TOKENIZER: ${{ inputs.tokenizer }}
   RESULT_DIR: /workspace/results
   PYTHONDONTWRITEBYTECODE: '1'
   PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
@@ -188,12 +217,12 @@ jobs:
         env:
           RUNNER_NAME: ${{ runner.name }}
           RUNNER_TYPE: ${{ inputs.runner }}
-          # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_tp{}-ep{}-dpa{}_disagg-{}_spec-{}_n{}_mnbt{}_conc{}_{runner}
+          # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}[_CLIENT]_tp{}-ep{}-dpa{}_disagg-{}_spec-{}_n{}_mnbt{}_conc{}_{runner}; [_CLIENT] is emitted only for a non-empty, non-native benchmark-client (an empty value passed by a caller would otherwise add a stray '_')
           # n / mnbt segments hold the values of NUM_SPECULATIVE_TOKENS and
           # MAX_NUM_BATCHED_TOKENS (see benchmark-tmpl.yml inputs); when those
           # inputs are unset, the segments resolve to 'n' / 'mnbt' with no
           # numeric value, which is intentional and stable across runs.
-          RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}-ep${{ env.EP_SIZE }}-dpa${{ env.DP_ATTENTION }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_n${{ env.NUM_SPECULATIVE_TOKENS }}_mnbt${{ env.MAX_NUM_BATCHED_TOKENS }}_conc${{ env.CONC }}_${{ runner.name }}
+          RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}${{ inputs.benchmark-client != '' && inputs.benchmark-client != 'inferencex_native' && format('_{0}', inputs.benchmark-client) || '' }}_tp${{ env.TP }}-ep${{ env.EP_SIZE }}-dpa${{ env.DP_ATTENTION }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_n${{ env.NUM_SPECULATIVE_TOKENS }}_mnbt${{ env.MAX_NUM_BATCHED_TOKENS }}_conc${{ env.CONC }}_${{ runner.name }}
           # Suppress per-job eval markdown from being appended to the step summary.
           # We'll publish a single combined eval table in the collection job instead.
           GITHUB_STEP_SUMMARY: ''
@@ -249,7 +278,7 @@ jobs:
           path: ${{ env.RESULT_FILENAME }}.json
 
       - name: Upload agentic raw results
-        if: ${{ always() && inputs.scenario-type == 'agentic-coding' }}
+        if: ${{ always() && (inputs.scenario-type == 'agentic-coding' || inputs.scenario-type == 'agentic-replay') }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: agentic_${{ env.RESULT_FILENAME }}
@@ -277,12 +306,26 @@ jobs:
             results/trace_replay/gpu_telemetry_export.jsonl
             results/trace_replay/logs/aiperf.log
             results/trace_replay/logs/*.log
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export.jsonl
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf.json
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf.csv
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf_timeslices.json
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf_timeslices.csv
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf_aggregate.json
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf_aggregate.csv
+            ${{ env.RESULT_FILENAME }}_aiperf/profile_export_aiperf_collated.json
+            ${{ env.RESULT_FILENAME }}_aiperf/server_metrics_export.json
+            ${{ env.RESULT_FILENAME }}_aiperf/server_metrics_export.jsonl
+            ${{ env.RESULT_FILENAME }}_aiperf/server_metrics_export.csv
+            ${{ env.RESULT_FILENAME }}_aiperf/server_metrics_export.parquet
+            ${{ env.RESULT_FILENAME }}_aiperf/gpu_telemetry_export.jsonl
+            ${{ env.RESULT_FILENAME }}_aiperf/logs/aiperf.log
+            ${{ env.RESULT_FILENAME }}_aiperf/logs/*.log
           # Excluded by design (multi-GB debug artifacts, not consumed by
-          # post-processing): results/trace_replay/inputs.json (pre-formatted
-          # request bodies — the mmap'd binary equivalent is rebuilt from
-          # --public-dataset + --random-seed) and
-          # results/trace_replay/profile_export_raw.jsonl (full HTTP bodies
-          # per request — recoverable by re-running the same trace).
+          # post-processing): inputs.json (pre-formatted request bodies — the
+          # mmap'd binary equivalent is rebuilt from --public-dataset +
+          # --random-seed) and profile_export_raw.jsonl (full HTTP bodies per
+          # request — recoverable by re-running the same trace).
           if-no-files-found: ignore
 
       - name: Upload server logs

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -51,6 +51,7 @@ jobs:
             multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
             agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
             multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
+            agentic-replay-config: ${{ steps.get-jobs.outputs.agentic-replay-config }}
         steps:
             - name: Checkout code (ref)
               if: ${{ inputs.ref && inputs.ref != '' }}
@@ -71,12 +72,14 @@ jobs:
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
                   AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
                   MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
-                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
+                  AGENTIC_REPLAY=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-replay' and 'prefill' not in x]))")
+                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'agentic-replay') and not x.get('eval-only', False)]))")
                   MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
-                  EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
+                  EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'agentic-replay') and x.get('run-eval', False)]))")
                   MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
                   echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
                   echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
+                  echo "agentic-replay-config=$AGENTIC_REPLAY" >> $GITHUB_OUTPUT
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
                   echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
@@ -179,6 +182,7 @@ jobs:
             model: ${{ matrix.config.model }}
             model-prefix: ${{ matrix.config.model-prefix }}
             framework: ${{ matrix.config.framework }}
+            benchmark-client: ${{ matrix.config.benchmark-client }}
             precision: ${{ matrix.config.precision }}
             tp: ${{ matrix.config.tp }}
             ep: ${{ matrix.config.ep }}
@@ -255,6 +259,7 @@ jobs:
             model: ${{ matrix.config.model }}
             model-prefix: ${{ matrix.config.model-prefix }}
             framework: ${{ matrix.config.framework }}
+            benchmark-client: ${{ matrix.config.benchmark-client }}
             precision: ${{ matrix.config.precision }}
             tp: ${{ matrix.config.tp }}
             ep: ${{ matrix.config.ep }}
@@ -267,6 +272,43 @@ jobs:
             run-eval: false
             ref: ${{ inputs.ref }}
 
+    test-sweep-agentic-replay:  # runs each agentic-replay matrix entry through the shared benchmark template
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.agentic-replay-config != '[]' }}  # skip the job when get-jobs produced no agentic-replay entries
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: agentic-replay /
+        strategy:
+            fail-fast: false  # one failing entry does not cancel the other matrix entries
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.agentic-replay-config) }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            benchmark-client: ${{ matrix.config.benchmark-client }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            input-file: ${{ matrix.config.input-file }}  # the four dataset inputs below default to '' in the template
+            public-dataset: ${{ matrix.config.public-dataset }}
+            custom-dataset-type: ${{ matrix.config.custom-dataset-type }}
+            tokenizer: ${{ matrix.config.tokenizer }}
+            duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}  # a non-empty duration-override wins over the per-config duration
+            spec-decoding: 'none'  # fixed for this job, not taken from the matrix entry
+            disagg: 'false'
+            run-eval: false
+            scenario-type: agentic-replay  # the template uploads agentic raw results for this scenario type
+            ref: ${{ inputs.ref }}
+
     test-sweep-evals:
         needs: get-jobs
         if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
@@ -287,6 +329,7 @@ jobs:
             model: ${{ matrix.config.model }}
             model-prefix: ${{ matrix.config.model-prefix }}
             framework: ${{ matrix.config.framework }}
+            benchmark-client: ${{ matrix.config.benchmark-client }}
             precision: ${{ matrix.config.precision }}
             tp: ${{ matrix.config.tp }}
             ep: ${{ matrix.config.ep }}
@@ -301,8 +344,8 @@ jobs:
             ref: ${{ inputs.ref }}
 
     collect-results:
-        needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
-        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
+        needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic, test-sweep-agentic-replay]
+        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped' || needs.test-sweep-agentic-replay.result != 'skipped') }}
         uses: ./.github/workflows/collect-results.yml
         secrets: inherit
         with: