From ecafdc02fc7ecf9535eace735f19a670949f23c5 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 19 Mar 2026 06:16:40 +0000 Subject: [PATCH 01/16] feat/nv ci test --- .ci/README.md | 171 ++++++++++++++++++++++++++++ .ci/build.py | 210 +++++++++++++++++++++++++++++++++++ .ci/config.yaml | 36 ++++++ .ci/images/ascend/Dockerfile | 31 ++++++ .ci/images/nvidia/Dockerfile | 26 +++++ .ci/run.py | 195 ++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 7 files changed, 670 insertions(+), 1 deletion(-) create mode 100644 .ci/README.md create mode 100644 .ci/build.py create mode 100644 .ci/config.yaml create mode 100644 .ci/images/ascend/Dockerfile create mode 100644 .ci/images/nvidia/Dockerfile create mode 100644 .ci/run.py diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..59ee101 --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,171 @@ +# .ci — CI 镜像与流水线 + +本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 + +## 目录结构 + +``` +.ci/ +├── config.yaml # 统一配置(registry、镜像、job 定义) +├── build.py # 镜像构建脚本 +├── run.py # CI 流水线执行脚本 +├── README.md +└── images/ + ├── nvidia/Dockerfile # NVIDIA 平台镜像 + └── ascend/Dockerfile # 昇腾平台镜像 +``` + +## 前置依赖 + +- Docker +- Python 3.10+ +- pyyaml (`pip install pyyaml`) + +## 配置文件 `config.yaml` + +```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # Harbor 地址,本地开发时留空 + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable # stable | latest | 具体 commit hash + platform: nvidia + resources: + gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + setup: pip install .[dev] + stages: + - name: test + run: pytest tests/ -v 
--tb=short --junitxml=/workspace/test-results.xml +``` + +- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 +- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 +- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 +- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 + +## 镜像构建 `build.py` + +```bash +python .ci/build.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | +| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | +| `--push` | — | 构建后推送到 registry | +| `--force` | — | 跳过变更检测,强制构建 | +| `--dry-run` | — | 仅打印命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +python .ci/build.py --platform nvidia + +# 强制构建 +python .ci/build.py --platform nvidia --force + +# 构建全部平台并推送到 registry +python .ci/build.py --push --force + +# 预览实际执行的 docker 命令 +python .ci/build.py --platform nvidia --force --dry-run +``` + +### 构建流程 + +1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) +2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag +3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 +4. 
若指定 `--push`,将两个 tag 推送到 registry + +### 产物 + +| Tag | 说明 | +|---|---| +| `infiniops-ci/:` | 精确追溯到某次构建 | +| `infiniops-ci/:latest` | 最近一次构建 | + +## 流水线执行 `run.py` + +```bash +python .ci/run.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--job` | 配置中第一个 job | 要执行的 job 名称 | +| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | +| `--stage` | 全部 | 仅运行指定 stage | +| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | +| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | +| `--dry-run` | — | 仅打印 docker 命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 运行默认 job +python .ci/run.py + +# 指定分支和镜像版本 +python .ci/run.py --branch feature-xxx --image-tag latest + +# 只用 GPU 0 运行 +python .ci/run.py --gpu-id 0 + +# 用 GPU 0 和 2 运行 +python .ci/run.py --gpu-id 0,2 + +# 使用全部 GPU +python .ci/run.py --gpu-id all + +# 只跑 test stage +python .ci/run.py --stage test + +# 预览 docker 命令 +python .ci/run.py --dry-run +``` + +### 执行流程 + +1. 解析 job 配置,拉取对应镜像 +2. `docker run` 启动容器(自动挂载 GPU、限制内存) +3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 +4. 依次执行各 stage,汇总结果 + +## 代理配置 + +如果网络环境需要代理,在宿主机设置环境变量后即可: + +```bash +export http_proxy=http://localhost:9991 +export https_proxy=http://localhost:9991 +``` + +- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 +- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 diff --git a/.ci/build.py b/.ci/build.py new file mode 100644 index 0000000..489ebf0 --- /dev/null +++ b/.ci/build.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) + sys.exit(1) + + return result.stdout.strip() + + +def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): + """Check if any file under `dockerfile_dir` changed since `base_ref`.""" + result = subprocess.run( + ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], + capture_output=True, + text=True, + ) + + return bool(result.stdout.strip()) + + +def build_image_tag(registry_url, project, platform, tag): + if registry_url: + return f"{registry_url}/{project}/{platform}:{tag}" + + return f"{project}-ci/{platform}:{tag}" + + +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): + """Build a single platform image. 
Returns True on success.""" + registry_url = registry_cfg.get("url", "") + project = registry_cfg.get("project", "infiniops") + dockerfile_dir = platform_cfg["dockerfile"] + + commit_tag = build_image_tag(registry_url, project, platform, commit) + latest_tag = build_image_tag(registry_url, project, platform, "latest") + + build_args_cfg = platform_cfg.get("build_args", {}) + build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + + for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + if proxy_val: + build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + + private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: + sdk_url = private_sdk.get("source", "") + if sdk_url.startswith("${") and sdk_url.endswith("}"): + env_var = sdk_url[2:-1] + sdk_url = os.environ.get(env_var, "") + if sdk_url: + build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) + + build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) + + if dry_run: + print(f"[dry-run] {' '.join(build_cmd)}") + if push: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") + + return True + + print(f"==> building {platform}: {commit_tag}", file=sys.stderr) + result = subprocess.run(build_cmd) + if result.returncode != 0: + error = { + "stage": "build", + "platform": platform, + "tag": commit_tag, + "exit_code": result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + if push: + for tag in (commit_tag, latest_tag): + print(f"==> pushing {tag}", file=sys.stderr) + push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: + error = { + "stage": "push", + "platform": platform, + "tag": tag, + "exit_code": push_result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + 
return True + + +def main(): + parser = argparse.ArgumentParser(description="Build CI Docker images") + parser.add_argument( + "--platform", + type=str, + default="all", + help="Platform to build: nvidia, ascend, or all (default: all)", + ) + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--commit", + type=str, + default="HEAD", + help="Git ref for tagging the image (default: HEAD)", + ) + parser.add_argument( + "--push", + action="store_true", + help="Push images to registry after building", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip change detection and force build", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing", + ) + args = parser.parse_args() + + config = load_config(args.config) + registry_cfg = config.get("registry", {}) + images_cfg = config.get("images", {}) + + if not images_cfg: + print("error: no `images` section in config", file=sys.stderr) + sys.exit(1) + + if args.platform == "all": + platforms = list(images_cfg.keys()) + else: + if args.platform not in images_cfg: + print( + f"error: platform `{args.platform}` not found in config", + file=sys.stderr, + ) + sys.exit(1) + platforms = [args.platform] + + commit = get_git_commit(args.commit) + failed = False + + for platform in platforms: + platform_cfg = images_cfg[platform] + dockerfile_dir = platform_cfg["dockerfile"] + + if not Path(dockerfile_dir).is_dir(): + print( + f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + file=sys.stderr, + ) + continue + + if not args.force and not has_dockerfile_changed(dockerfile_dir): + print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) + continue + + ok = build_image( + platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + ) + if not ok: + failed = True + + if failed: + 
sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/config.yaml b/.ci/config.yaml new file mode 100644 index 0000000..fea3f7c --- /dev/null +++ b/.ci/config.yaml @@ -0,0 +1,36 @@ +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # TODO: Harbor not ready yet + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: # TODO: Ascend image is not ready yet + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable + platform: nvidia + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + + setup: pip install .[dev] + + stages: + - name: test + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile new file mode 100644 index 0000000..87f7c91 --- /dev/null +++ b/.ci/images/ascend/Dockerfile @@ -0,0 +1,31 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + curl \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PRIVATE_SDK_URL +RUN if [ -n "$PRIVATE_SDK_URL" ]; then \ + curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \ + chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \ + rm /tmp/sdk.run; \ + fi + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile new file mode 100644 index 0000000..d89ea91 --- /dev/null +++ b/.ci/images/nvidia/Dockerfile @@ -0,0 +1,26 @@ +ARG BASE_IMAGE +FROM 
${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG http_proxy +ARG https_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py new file mode 100644 index 0000000..0421a56 --- /dev/null +++ b/.ci/run.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" + +import argparse +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def resolve_image(config, platform, image_tag): + """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + registry = config.get("registry", {}) + registry_url = registry.get("url", "") + project = registry.get("project", "infiniops") + + if not registry_url: + return f"{project}-ci/{platform}:{image_tag}" + + return f"{registry_url}/{project}/{platform}:{image_tag}" + + +def build_runner_script(): + return r""" +export https_proxy=http://localhost:9991 +set -e +cd /workspace +git clone "$REPO_URL" repo +cd repo +git checkout "$BRANCH" +echo "========== Setup ==========" +eval "$SETUP_CMD" +set +e +failed=0 +for i in $(seq 1 "$NUM_STAGES"); do + name_var="STAGE_${i}_NAME" + cmd_var="STAGE_${i}_CMD" + name="${!name_var}" + cmd="${!cmd_var}" + echo "========== Stage: $name ==========" + eval "$cmd" || failed=1 +done +echo "========== Summary ==========" +exit $failed +""" + + +def build_docker_args( + config, job_name, repo_url, branch, stages, workdir, image_tag_override, + 
gpu_id_override=None, +): + job = config["jobs"][job_name] + platform = job.get("platform", "nvidia") + image_tag = image_tag_override or job.get("image", "stable") + image = resolve_image(config, platform, image_tag) + resources = job.get("resources", {}) + setup_cmd = job.get("setup", "pip install .[dev]") + + args = [ + "docker", + "run", + "--rm", + "--network", + "host", + "-i", + "-w", + workdir, + "-e", + f"REPO_URL={repo_url}", + "-e", + f"BRANCH={branch}", + "-e", + f"SETUP_CMD={setup_cmd}", + "-e", + f"NUM_STAGES={len(stages)}", + ] + for i, s in enumerate(stages): + args.append("-e") + args.append(f"STAGE_{i + 1}_NAME={s['name']}") + args.append("-e") + args.append(f"STAGE_{i + 1}_CMD={s['run']}") + + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) + gpu_count = resources.get("gpu_count", 0) + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + + memory = resources.get("memory") + if memory: + mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + if not mem.endswith("g") and not mem.endswith("m"): + mem = f"{mem}g" + args.extend(["--memory", mem]) + + timeout_sec = resources.get("timeout") + if timeout_sec: + args.extend(["--stop-timeout", str(timeout_sec)]) + + args.append(image) + args.append("bash") + args.append("-c") + args.append(build_runner_script().strip()) + + return args + + +def main(): + parser = argparse.ArgumentParser(description="Run Docker CI pipeline") + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument("--branch", type=str, help="Override repo branch") + parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument( + "--stage", + type=str, + help="Run only this stage name (still runs setup first)", + ) 
+ parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "dev-infra") + + jobs = config.get("jobs", {}) + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: + print(f"error: job {job_name!r} not in config", file=sys.stderr) + sys.exit(1) + + job = jobs[job_name] + all_stages = job.get("stages", []) + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: + print(f"error: stage {args.stage!r} not found", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + workdir = "/workspace" + docker_args = build_docker_args( + config, job_name, repo_url, branch, stages, workdir, args.image_tag, + gpu_id_override=args.gpu_id, + ) + + if args.dry_run: + print(" ".join(docker_args)) + + return + + sys.exit(subprocess.run(docker_args).returncode) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 765b90a..3dbc186 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] install-dir = "infini" From f8a60644f8a46a41718e7d1fd42bab1988ed9ecc Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 20 Mar 2026 07:24:55 +0000 Subject: [PATCH 02/16] feat: ci sys for nv platform --- 
.ci/README.md | 155 +++++------------- .ci/build.py | 103 ++++++++++-- .ci/config.yaml | 17 +- .ci/images/ascend/Dockerfile | 8 + .ci/images/nvidia/Dockerfile | 5 + .ci/run.py | 117 ++++++++++++-- .ci/tests/__init__.py | 0 .ci/tests/conftest.py | 42 +++++ .ci/tests/test_build.py | 186 ++++++++++++++++++++++ .ci/tests/test_run.py | 298 +++++++++++++++++++++++++++++++++++ 10 files changed, 775 insertions(+), 156 deletions(-) create mode 100644 .ci/tests/__init__.py create mode 100644 .ci/tests/conftest.py create mode 100644 .ci/tests/test_build.py create mode 100644 .ci/tests/test_run.py diff --git a/.ci/README.md b/.ci/README.md index 59ee101..0bd59bd 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,25 +1,18 @@ # .ci — CI 镜像与流水线 -本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 - -## 目录结构 - ``` .ci/ -├── config.yaml # 统一配置(registry、镜像、job 定义) -├── build.py # 镜像构建脚本 -├── run.py # CI 流水线执行脚本 -├── README.md +├── config.yaml # 统一配置(镜像、job 定义) +├── build.py # 镜像构建 +├── run.py # CI 流水线执行 └── images/ - ├── nvidia/Dockerfile # NVIDIA 平台镜像 - └── ascend/Dockerfile # 昇腾平台镜像 + ├── nvidia/Dockerfile + └── ascend/Dockerfile ``` -## 前置依赖 +**前置依赖**:Docker、Python 3.10+、`pip install pyyaml` -- Docker -- Python 3.10+ -- pyyaml (`pip install pyyaml`) +--- ## 配置文件 `config.yaml` @@ -28,144 +21,72 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -registry: - url: "" # Harbor 地址,本地开发时留空 - project: infiniops - credentials_env: REGISTRY_TOKEN - images: nvidia: dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - ascend: - dockerfile: .ci/images/ascend/ - build_args: - BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 - private_sdk: - source: "${PRIVATE_SDK_URL}" jobs: nvidia_gpu: - image: stable # stable | latest | 具体 commit hash + image: latest # latest | platform: nvidia resources: - gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" - gpu_type: A100 + gpu_ids: "0" # "0" | "0,2" | "all" memory: 32GB - timeout: 3600 + 
shm_size: 16g # 避免 PyTorch SHMEM 不足 + timeout: 3600 # 容器内脚本最大运行秒数 setup: pip install .[dev] + env: # 可选,注入容器环境变量 + MY_VAR: value stages: - name: test - run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml ``` -- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 -- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 -- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 -- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 +--- ## 镜像构建 `build.py` -```bash -python .ci/build.py [options] -``` - -| 参数 | 默认值 | 说明 | -|---|---|---| -| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | -| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | -| `--push` | — | 构建后推送到 registry | -| `--force` | — | 跳过变更检测,强制构建 | -| `--dry-run` | — | 仅打印命令,不执行 | -| `--config` | `.ci/config.yaml` | 配置文件路径 | - -### 示例 +| 参数 | 说明 | +|---|---| +| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--force` | 跳过 Dockerfile 变更检测 | +| `--dry-run` | 打印命令不执行 | ```bash -# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +# 检测变更后构建(无变更自动跳过) python .ci/build.py --platform nvidia # 强制构建 python .ci/build.py --platform nvidia --force - -# 构建全部平台并推送到 registry -python .ci/build.py --push --force - -# 预览实际执行的 docker 命令 -python .ci/build.py --platform nvidia --force --dry-run ``` -### 构建流程 +构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 +代理、`no_proxy` 自动从宿主机环境变量透传到 `docker build`。 -1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) -2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag -3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 -4. 
若指定 `--push`,将两个 tag 推送到 registry +> `--push` 为预留功能,需在 `config.yaml` 中配置 `registry` 段后方可使用。 -### 产物 - -| Tag | 说明 | -|---|---| -| `infiniops-ci/:` | 精确追溯到某次构建 | -| `infiniops-ci/:latest` | 最近一次构建 | +--- ## 流水线执行 `run.py` -```bash -python .ci/run.py [options] -``` - -| 参数 | 默认值 | 说明 | -|---|---|---| -| `--job` | 配置中第一个 job | 要执行的 job 名称 | -| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | -| `--stage` | 全部 | 仅运行指定 stage | -| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | -| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | -| `--dry-run` | — | 仅打印 docker 命令,不执行 | -| `--config` | `.ci/config.yaml` | 配置文件路径 | - -### 示例 +| 参数 | 说明 | +|---|---| +| `--branch` | 覆盖克隆分支 | +| `--stage` | 只运行指定 stage | +| `--image-tag` | 覆盖镜像 tag | +| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | +| `--dry-run` | 打印 docker 命令不执行 | ```bash # 运行默认 job -python .ci/run.py - -# 指定分支和镜像版本 -python .ci/run.py --branch feature-xxx --image-tag latest - -# 只用 GPU 0 运行 -python .ci/run.py --gpu-id 0 - -# 用 GPU 0 和 2 运行 -python .ci/run.py --gpu-id 0,2 - -# 使用全部 GPU -python .ci/run.py --gpu-id all - -# 只跑 test stage -python .ci/run.py --stage test +python .ci/run.py --branch feat/my-feature --results-dir ./ci-results -# 预览 docker 命令 -python .ci/run.py --dry-run -``` - -### 执行流程 - -1. 解析 job 配置,拉取对应镜像 -2. `docker run` 启动容器(自动挂载 GPU、限制内存) -3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 -4. 
依次执行各 stage,汇总结果 - -## 代理配置 - -如果网络环境需要代理,在宿主机设置环境变量后即可: - -```bash -export http_proxy=http://localhost:9991 -export https_proxy=http://localhost:9991 +# 只跑 test stage,预览命令 +python .ci/run.py --stage test --dry-run ``` -- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 -- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 +容器内执行流程:`git clone` → `checkout` → `setup` → stages。 +代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 diff --git a/.ci/build.py b/.ci/build.py index 489ebf0..2339319 100644 --- a/.ci/build.py +++ b/.ci/build.py @@ -4,6 +4,7 @@ import argparse import json import os +import shlex import subprocess import sys from pathlib import Path @@ -28,6 +29,7 @@ def get_git_commit(ref="HEAD"): capture_output=True, text=True, ) + if result.returncode != 0: print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) sys.exit(1) @@ -43,9 +45,61 @@ def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): text=True, ) + if result.returncode != 0: + print( + "warning: git diff failed (shallow clone or initial commit?);" + " assuming Dockerfile changed", + file=sys.stderr, + ) + return True + return bool(result.stdout.strip()) +def docker_login(registry_cfg, dry_run): + """Log in to the registry using `credentials_env` token. + + Returns True on success. + + NOTE: Registry support is currently unused (`config.yaml` has no registry + section). Retained for future integration with an external image management + system. 
+ """ + credentials_env = registry_cfg.get("credentials_env") + registry_url = registry_cfg.get("url", "") + + if not credentials_env or not registry_url: + return True + + token = os.environ.get(credentials_env) + + if not token: + print( + f"error: {credentials_env} not set, cannot login", + file=sys.stderr, + ) + return False + + if dry_run: + print( + f"[dry-run] echo | docker login {registry_url}" + " --username token --password-stdin" + ) + return True + + result = subprocess.run( + ["docker", "login", registry_url, "--username", "token", "--password-stdin"], + input=token, + text=True, + ) + + if result.returncode != 0: + print("error: docker login failed", file=sys.stderr) + return False + + return True + + def build_image_tag(registry_url, project, platform, tag): if registry_url: return f"{registry_url}/{project}/{platform}:{tag}" @@ -53,46 +107,53 @@ def build_image_tag(registry_url, project, platform, tag): return f"{project}-ci/{platform}:{tag}" -def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): """Build a single platform image. 
Returns True on success.""" registry_url = registry_cfg.get("url", "") project = registry_cfg.get("project", "infiniops") dockerfile_dir = platform_cfg["dockerfile"] - commit_tag = build_image_tag(registry_url, project, platform, commit) latest_tag = build_image_tag(registry_url, project, platform, "latest") build_args_cfg = platform_cfg.get("build_args", {}) build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): build_cmd.extend(["--build-arg", f"{key}={value}"]) - for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + if proxy_val: build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: - sdk_url = private_sdk.get("source", "") - if sdk_url.startswith("${") and sdk_url.endswith("}"): - env_var = sdk_url[2:-1] - sdk_url = os.environ.get(env_var, "") + source_env = private_sdk.get("source_env", "") + sdk_url = os.environ.get(source_env, "") if source_env else "" + if sdk_url: build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) if dry_run: - print(f"[dry-run] {' '.join(build_cmd)}") + print(f"[dry-run] {shlex.join(build_cmd)}") + if push: - print(f"[dry-run] docker push {commit_tag}") - print(f"[dry-run] docker push {latest_tag}") + if not logged_in: + print("[dry-run] (skipping push: docker login failed)") + else: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") return True print(f"==> building {platform}: {commit_tag}", file=sys.stderr) result = subprocess.run(build_cmd) + if result.returncode != 0: error = { "stage": "build", @@ 
-105,9 +166,14 @@ def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): return False if push: + if not logged_in: + print("error: docker login failed, cannot push", file=sys.stderr) + return False + for tag in (commit_tag, latest_tag): print(f"==> pushing {tag}", file=sys.stderr) push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: error = { "stage": "push", @@ -145,7 +211,7 @@ def main(): parser.add_argument( "--push", action="store_true", - help="Push images to registry after building", + help="Push images to registry after building (requires registry in config)", ) parser.add_argument( "--force", @@ -179,6 +245,7 @@ def main(): platforms = [args.platform] commit = get_git_commit(args.commit) + logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True failed = False for platform in platforms: @@ -187,7 +254,8 @@ def main(): if not Path(dockerfile_dir).is_dir(): print( - f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + f"warning: dockerfile directory `{dockerfile_dir}` does not exist," + f" skipping {platform}", file=sys.stderr, ) continue @@ -197,8 +265,15 @@ def main(): continue ok = build_image( - platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + platform, + platform_cfg, + registry_cfg, + commit, + args.push, + args.dry_run, + logged_in=logged_in, ) + if not ok: failed = True diff --git a/.ci/config.yaml b/.ci/config.yaml index fea3f7c..c80c47d 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -2,12 +2,7 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -registry: - url: "" # TODO: Harbor not ready yet - project: infiniops - credentials_env: REGISTRY_TOKEN - -images: +images: nvidia: dockerfile: .ci/images/nvidia/ build_args: @@ -17,20 +12,22 @@ images: build_args: BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 private_sdk: - source: "${PRIVATE_SDK_URL}" + source_env: 
PRIVATE_SDK_URL jobs: nvidia_gpu: - image: stable + image: latest platform: nvidia resources: gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" - gpu_type: A100 memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 timeout: 3600 setup: pip install .[dev] + # env: # 可选,注入容器环境变量 + # MY_VAR: value stages: - name: test - run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile index 87f7c91..66392eb 100644 --- a/.ci/images/ascend/Dockerfile +++ b/.ci/images/ascend/Dockerfile @@ -3,11 +3,19 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ ninja-build \ + coreutils \ curl \ libclang-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index d89ea91..74ccfd1 100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -3,14 +3,19 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY ARG http_proxy ARG https_proxy +ARG no_proxy RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ ninja-build \ + coreutils \ libclang-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/.ci/run.py b/.ci/run.py index 0421a56..3f25afa 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -2,8 +2,11 @@ """Standalone Docker CI runner: clone repo, setup, run stages. 
Output to stdout.""" import argparse +import os +import shlex import subprocess import sys +from datetime import datetime from pathlib import Path try: @@ -20,8 +23,35 @@ def load_config(path): return yaml.safe_load(f) +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() + + +def build_results_dir(base, platform, stages, commit): + """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" + stage_names = "+".join(s["name"] for s in stages) + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + dirname = f"{platform}_{stage_names}_{commit}_{timestamp}" + + return Path(base) / dirname + + def resolve_image(config, platform, image_tag): - """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + """Resolve an image reference to a full image name. + + Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config + contains a registry section, returns a registry-prefixed URL. Otherwise + returns a local tag (current default). 
+ """ registry = config.get("registry", {}) registry_url = registry.get("url", "") project = registry.get("project", "infiniops") @@ -34,9 +64,9 @@ def resolve_image(config, platform, image_tag): def build_runner_script(): return r""" -export https_proxy=http://localhost:9991 set -e cd /workspace +mkdir -p /workspace/results git clone "$REPO_URL" repo cd repo git checkout "$BRANCH" @@ -58,15 +88,27 @@ def build_runner_script(): def build_docker_args( - config, job_name, repo_url, branch, stages, workdir, image_tag_override, + config, + job_name, + repo_url, + branch, + stages, + workdir, + image_tag_override, gpu_id_override=None, + results_dir=None, ): job = config["jobs"][job_name] platform = job.get("platform", "nvidia") - image_tag = image_tag_override or job.get("image", "stable") + image_tag = image_tag_override or job.get("image", "latest") image = resolve_image(config, platform, image_tag) resources = job.get("resources", {}) - setup_cmd = job.get("setup", "pip install .[dev]") + setup_raw = job.get("setup", "pip install .[dev]") + + if isinstance(setup_raw, list): + setup_cmd = "\n".join(setup_raw) + else: + setup_cmd = setup_raw args = [ "docker", @@ -86,6 +128,20 @@ def build_docker_args( "-e", f"NUM_STAGES={len(stages)}", ] + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + args.extend(["-e", f"{proxy_var}={proxy_val}"]) + args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) + + for key, value in job.get("env", {}).items(): + args.extend(["-e", f"{key}={value}"]) + + if results_dir: + args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) + for i, s in enumerate(stages): args.append("-e") args.append(f"STAGE_{i + 1}_NAME={s['name']}") @@ -94,6 +150,7 @@ def build_docker_args( gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) + if gpu_id: if gpu_id == "all": 
args.extend(["--gpus", "all"]) @@ -103,20 +160,28 @@ def build_docker_args( args.extend(["--gpus", f"count={gpu_count}"]) memory = resources.get("memory") + if memory: - mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + mem = str(memory).lower().replace("gb", "g").replace("mb", "m") + if not mem.endswith("g") and not mem.endswith("m"): mem = f"{mem}g" + args.extend(["--memory", mem]) + shm_size = resources.get("shm_size") + + if shm_size: + args.extend(["--shm-size", str(shm_size)]) + timeout_sec = resources.get("timeout") + args.append(image) + if timeout_sec: - args.extend(["--stop-timeout", str(timeout_sec)]) + # Requires coreutils `timeout` inside the container image. + args.extend(["timeout", str(timeout_sec)]) - args.append(image) - args.append("bash") - args.append("-c") - args.append(build_runner_script().strip()) + args.extend(["bash", "-c", build_runner_script().strip()]) return args @@ -146,6 +211,12 @@ def main(): type=str, help='GPU device IDs to use, e.g. "0", "0,2", "all"', ) + parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + help="Base directory for test results (default: ./ci-results)", + ) parser.add_argument( "--dry-run", action="store_true", @@ -156,38 +227,54 @@ def main(): config = load_config(args.config) repo = config.get("repo", {}) repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") - branch = args.branch or repo.get("branch", "dev-infra") + branch = args.branch or repo.get("branch", "master") jobs = config.get("jobs", {}) + if not jobs: print("error: no jobs in config", file=sys.stderr) sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: print(f"error: job {job_name!r} not in config", file=sys.stderr) sys.exit(1) job = jobs[job_name] all_stages = job.get("stages", []) + if args.stage: stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: print(f"error: stage {args.stage!r} not found", file=sys.stderr) sys.exit(1) 
else: stages = all_stages + platform = job.get("platform", "nvidia") + commit = get_git_commit() + results_dir = build_results_dir(args.results_dir, platform, stages, commit) + workdir = "/workspace" docker_args = build_docker_args( - config, job_name, repo_url, branch, stages, workdir, args.image_tag, + config, + job_name, + repo_url, + branch, + stages, + workdir, + args.image_tag, gpu_id_override=args.gpu_id, + results_dir=results_dir, ) if args.dry_run: - print(" ".join(docker_args)) - + print(shlex.join(docker_args)) return + results_dir.mkdir(parents=True, exist_ok=True) sys.exit(subprocess.run(docker_args).returncode) diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py new file mode 100644 index 0000000..98079cd --- /dev/null +++ b/.ci/tests/conftest.py @@ -0,0 +1,42 @@ +import sys +from pathlib import Path + +# Allow `import run` and `import build` directly. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import pytest + + +@pytest.fixture +def minimal_config(): + return { + "repo": { + "url": "https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "images": { + "nvidia": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + }, + "jobs": { + "nvidia_gpu": { + "image": "latest", + "platform": "nvidia", + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "setup": "pip install .[dev]", + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], + } + }, + } diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py new file mode 100644 index 0000000..fa2f292 --- /dev/null +++ b/.ci/tests/test_build.py @@ -0,0 +1,186 @@ +import build + + +# --------------------------------------------------------------------------- +# build_image_tag +# 
--------------------------------------------------------------------------- + + +def test_build_image_tag_with_registry(): + tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") + assert tag == "localhost:5000/infiniops/nvidia:latest" + + +def test_build_image_tag_without_registry(): + tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") + assert tag == "infiniops-ci/nvidia:abc1234" + + +def test_build_image_tag_commit_hash(): + tag = build.build_image_tag( + "registry.example.com:5000", "proj", "ascend", "deadbeef" + ) + assert tag == "registry.example.com:5000/proj/ascend:deadbeef" + + +# --------------------------------------------------------------------------- +# has_dockerfile_changed +# --------------------------------------------------------------------------- + + +def test_has_dockerfile_changed_true_when_stdout_nonempty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout="Dockerfile\n"), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +def test_has_dockerfile_changed_false_when_stdout_empty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is False + + +def test_has_dockerfile_changed_true_on_git_error(mocker): + # Shallow clone or initial commit: `git diff` returns non-zero. 
+ mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=128, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +# --------------------------------------------------------------------------- +# docker_login +# --------------------------------------------------------------------------- + + +def test_docker_login_no_credentials_env(mocker): + run_mock = mocker.patch("subprocess.run") + result = build.docker_login({"url": "localhost:5000"}, dry_run=False) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_token_not_set(mocker, monkeypatch, capsys): + monkeypatch.delenv("REGISTRY_TOKEN", raising=False) + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is False + run_mock.assert_not_called() + + +def test_docker_login_dry_run_does_not_call_subprocess(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=True) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_success(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is True + run_mock.assert_called_once() + cmd = run_mock.call_args[0][0] + assert "docker" in cmd + assert "login" in cmd + + +# --------------------------------------------------------------------------- +# build_image — dry_run and proxy +# --------------------------------------------------------------------------- + + +def _platform_cfg(): + return { + "dockerfile": ".ci/images/nvidia/", + 
"build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + + +def _registry_cfg(): + return {"url": "localhost:5000", "project": "infiniops"} + + +def test_build_image_dry_run_no_subprocess(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + run_mock = mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + run_mock.assert_not_called() + captured = capsys.readouterr() + assert "[dry-run]" in captured.out + + +def test_build_image_dry_run_output_contains_image_tag(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + captured = capsys.readouterr() + assert "abc1234" in captured.out + + +def test_build_image_proxy_in_build_args(mocker, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + called_cmd = run_mock.call_args[0][0] + joined = " ".join(called_cmd) + assert "HTTP_PROXY=http://proxy.test:3128" in joined + assert "http_proxy=http://proxy.test:3128" in joined + + +def test_build_image_returns_false_on_docker_error(mocker, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=1), + ) + result = build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + assert result is False diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py new file mode 100644 index 0000000..075546e --- /dev/null +++ 
b/.ci/tests/test_run.py @@ -0,0 +1,298 @@ +from pathlib import Path + +import pytest + +import run + + +# --------------------------------------------------------------------------- +# resolve_image +# --------------------------------------------------------------------------- + + +def test_resolve_image_with_registry(): + cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} + img = run.resolve_image(cfg, "nvidia", "latest") + assert img == "localhost:5000/infiniops/nvidia:latest" + + +def test_resolve_image_without_registry(minimal_config): + img = run.resolve_image(minimal_config, "nvidia", "abc1234") + assert img == "infiniops-ci/nvidia:abc1234" + + +# --------------------------------------------------------------------------- +# build_runner_script +# --------------------------------------------------------------------------- + + +def test_runner_script_contains_git_clone(): + script = run.build_runner_script() + assert "git clone" in script + + +def test_runner_script_contains_setup_cmd(): + script = run.build_runner_script() + assert "SETUP_CMD" in script + + +def test_runner_script_exits_on_failure(): + script = run.build_runner_script() + assert "exit $failed" in script + + +def test_runner_script_creates_results_dir(): + script = run.build_runner_script() + assert "mkdir -p /workspace/results" in script + + +# --------------------------------------------------------------------------- +# build_docker_args — basic structure +# --------------------------------------------------------------------------- + + +def test_docker_args_basic_structure(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert args[0] == "docker" + assert "run" in args + assert "--rm" in args + + +def test_docker_args_correct_image(minimal_config): + args = run.build_docker_args( + minimal_config, + 
"nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "infiniops-ci/nvidia:latest" in args + + +def test_docker_args_image_tag_override(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + "abc1234", + ) + assert "infiniops-ci/nvidia:abc1234" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — proxy passthrough +# --------------------------------------------------------------------------- + + +def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "-e" in args + assert "HTTP_PROXY=http://proxy.example.com:8080" in args + assert "http_proxy=http://proxy.example.com:8080" in args + + +def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("http_proxy", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.delenv("https_proxy", raising=False) + monkeypatch.delenv("NO_PROXY", raising=False) + monkeypatch.delenv("no_proxy", raising=False) + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + + for arg in args: + assert not arg.startswith("HTTP_PROXY=") + assert not arg.startswith("http_proxy=") + assert not arg.startswith("HTTPS_PROXY=") + assert not arg.startswith("https_proxy=") + assert not 
arg.startswith("NO_PROXY=") + assert not arg.startswith("no_proxy=") + + +def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "HTTP_PROXY=http://lowercase.proxy:3128" in args + assert "http_proxy=http://lowercase.proxy:3128" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — GPU flags +# --------------------------------------------------------------------------- + + +def _make_args(config, gpu_id_override=None): + return run.build_docker_args( + config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + gpu_id_override=gpu_id_override, + ) + + +def test_docker_args_gpu_device(minimal_config): + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert "device=0" in args[idx + 1] + + +def test_docker_args_gpu_all(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert args[idx + 1] == "all" + + +def test_docker_args_no_gpu(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "" + minimal_config["jobs"]["nvidia_gpu"]["resources"].pop("gpu_count", None) + args = _make_args(minimal_config) + assert "--gpus" not in args + + +def test_docker_args_gpu_override(minimal_config): + args = _make_args(minimal_config, gpu_id_override="2,3") + idx = args.index("--gpus") + assert "2,3" in args[idx + 1] + + +# --------------------------------------------------------------------------- +# build_docker_args — memory format +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("32GB", "32g"), + ("512MB", "512m"), + ("8", "8g"), + ("16gb", "16g"), + ("256mb", "256m"), + ], +) +def test_docker_args_memory_format(minimal_config, raw, expected): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw + args = _make_args(minimal_config) + idx = args.index("--memory") + assert args[idx + 1] == expected + + +# --------------------------------------------------------------------------- +# build_docker_args — stages encoding +# --------------------------------------------------------------------------- + + +def test_docker_args_num_stages(minimal_config): + args = _make_args(minimal_config) + assert "NUM_STAGES=1" in args + + +def test_docker_args_stage_name_cmd(minimal_config): + args = _make_args(minimal_config) + assert "STAGE_1_NAME=test" in args + assert any(a.startswith("STAGE_1_CMD=") for a in args) + + +def test_docker_args_multiple_stages(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ + {"name": "lint", "run": "ruff check ."}, + {"name": "test", "run": "pytest tests/"}, + ] + args = _make_args(minimal_config) + assert "NUM_STAGES=2" in args + assert "STAGE_1_NAME=lint" in args + assert "STAGE_2_NAME=test" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — results_dir mount +# --------------------------------------------------------------------------- + + +def test_docker_args_results_dir(minimal_config, tmp_path): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + results_dir=tmp_path, + ) + joined = " ".join(str(a) for a in args) + assert "-v" in args + assert "/workspace/results" in joined + + +# 
--------------------------------------------------------------------------- +# build_results_dir +# --------------------------------------------------------------------------- + + +def test_build_results_dir_contains_platform(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "nvidia" in d.name + + +def test_build_results_dir_contains_commit(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "abc1234" in d.name + + +def test_build_results_dir_contains_stage_names(): + stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "lint+test" in d.name + + +def test_build_results_dir_under_base(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") + assert d.parent == Path("/tmp/my-results") From e2d2c21cc560692c99800a33064825f2621066b2 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 20 Mar 2026 08:00:22 +0000 Subject: [PATCH 03/16] fix(ci): fix results dir permissions and reduce parallel workers - Pass host UID/GID into container and `chown` results after tests, so mounted `ci-results/` is accessible by the host user. - Limit `pytest-xdist` workers from `-n auto` to `-n 8` to prevent OOM worker crashes on high-core-count machines. 
Co-Authored-By: Claude Opus 4.6 --- .ci/config.yaml | 2 +- .ci/run.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.ci/config.yaml b/.ci/config.yaml index c80c47d..a86174a 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -30,4 +30,4 @@ jobs: stages: - name: test - run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/run.py b/.ci/run.py index 3f25afa..0c8d648 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -83,6 +83,9 @@ def build_runner_script(): eval "$cmd" || failed=1 done echo "========== Summary ==========" +if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then + chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true +fi exit $failed """ @@ -127,6 +130,10 @@ def build_docker_args( f"SETUP_CMD={setup_cmd}", "-e", f"NUM_STAGES={len(stages)}", + "-e", + f"HOST_UID={os.getuid()}", + "-e", + f"HOST_GID={os.getgid()}", ] for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): From 41c76c9f5f76812958d7cc68bc1ebfa161acb06e Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 03:27:06 +0000 Subject: [PATCH 04/16] refactor(ci): Refactor code structure for improved readability and maintainability --- .ci/README.md | 207 ++++++- .ci/agent.py | 971 ++++++++++++++++++++++++++++++++ .ci/build.py | 27 +- .ci/ci_resource.py | 241 ++++++++ .ci/config.yaml | 89 ++- .ci/github_status.py | 98 ++++ .ci/images/iluvatar/Dockerfile | 53 ++ .ci/images/nvidia/Dockerfile | 21 +- .ci/run.py | 56 +- .ci/tests/conftest.py | 44 +- .ci/tests/test_agent.py | 503 +++++++++++++++++ .ci/tests/test_github_status.py | 144 +++++ .ci/tests/test_resource.py | 324 +++++++++++ .ci/tests/test_utils.py | 90 +++ .ci/utils.py | 101 ++++ 15 files changed, 2833 insertions(+), 136 deletions(-) create mode 100644 .ci/agent.py create mode 100644 .ci/ci_resource.py create mode 100644 .ci/github_status.py create mode 100644 
.ci/images/iluvatar/Dockerfile create mode 100644 .ci/tests/test_agent.py create mode 100644 .ci/tests/test_github_status.py create mode 100644 .ci/tests/test_resource.py create mode 100644 .ci/tests/test_utils.py create mode 100644 .ci/utils.py diff --git a/.ci/README.md b/.ci/README.md index 0bd59bd..33841ca 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -2,11 +2,16 @@ ``` .ci/ -├── config.yaml # 统一配置(镜像、job 定义) +├── config.yaml # 统一配置(镜像、job、Agent 定义) +├── utils.py # 共享工具(load_config、get_git_commit) +├── agent.py # Runner Agent(调度、Webhook、远程触发) ├── build.py # 镜像构建 -├── run.py # CI 流水线执行 +├── run.py # CI 流水线执行(Docker 层) +├── ci_resource.py # GPU/内存资源检测与分配 +├── github_status.py # GitHub Commit Status 上报 └── images/ ├── nvidia/Dockerfile + ├── iluvatar/Dockerfile └── ascend/Dockerfile ``` @@ -16,41 +21,88 @@ ## 配置文件 `config.yaml` +配置以 **platform** 为顶级结构,每个平台包含镜像定义、平台级默认值和 job 列表。 +加载时自动展平为 `{platform}_{job}` 格式(如 `nvidia_gpu`)。 + ```yaml repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -images: +platforms: nvidia: - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - -jobs: - nvidia_gpu: - image: latest # latest | - platform: nvidia - resources: - gpu_ids: "0" # "0" | "0,2" | "all" - memory: 32GB - shm_size: 16g # 避免 PyTorch SHMEM 不足 - timeout: 3600 # 容器内脚本最大运行秒数 + image: # 镜像定义 + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] # 平台级默认值,job 可覆盖 + jobs: + gpu: # 展平后为 nvidia_gpu + resources: + gpu_ids: "0" # "0" | "0,2" | "all" + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: # 平台级 docker 参数,所有 job 继承 + - 
"--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules setup: pip install .[dev] - env: # 可选,注入容器环境变量 - MY_VAR: value - stages: - - name: test - run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml + jobs: + gpu: # 展平后为 iluvatar_gpu + resources: + gpu_ids: "0" + gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml ``` +### 配置层级说明 + +| 层级 | 字段 | 说明 | +|---|---|---| +| **平台级** | `image` | 镜像定义(dockerfile、build_args) | +| | `image_tag` | 默认镜像 tag(默认 `latest`) | +| | `docker_args` | 额外 docker run 参数(如 `--privileged`) | +| | `volumes` | 额外挂载卷 | +| | `setup` | 容器内 setup 命令 | +| | `env` | 注入容器环境变量 | +| **Job 级** | `resources.gpu_ids` | GPU 设备 ID | +| | `resources.gpu_style` | GPU 透传方式:`nvidia`(默认)或 `none` | +| | `resources.memory` | 容器内存限制 | +| | `resources.shm_size` | 共享内存大小 | +| | `resources.timeout` | 容器内脚本最大运行秒数 | +| | `stages` | 执行阶段列表 | +| | 以上平台级字段 | Job 可覆盖任意平台级默认值 | + --- ## 镜像构建 `build.py` | 参数 | 说明 | |---|---| -| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -58,8 +110,11 @@ jobs: # 检测变更后构建(无变更自动跳过) python .ci/build.py --platform nvidia -# 强制构建 -python .ci/build.py --platform nvidia --force +# 构建 Iluvatar 镜像 +python .ci/build.py --platform iluvatar --force + +# 强制构建全部 +python .ci/build.py --force ``` 构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 @@ -73,20 +128,116 @@ python .ci/build.py --platform nvidia --force | 参数 | 说明 | |---|---| +| `--job` | 指定 job 名称(默认第一个) | | `--branch` | 覆盖克隆分支 | | `--stage` | 只运行指定 stage | | `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--gpu-id` | 覆盖 GPU 设备 ID(仅 nvidia 
gpu_style) | | `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | | `--dry-run` | 打印 docker 命令不执行 | ```bash -# 运行默认 job -python .ci/run.py --branch feat/my-feature --results-dir ./ci-results +# 运行 NVIDIA job +python .ci/run.py --job nvidia_gpu --branch master + +# 运行 Iluvatar job +python .ci/run.py --job iluvatar_gpu --branch feat/ci-nvidia # 只跑 test stage,预览命令 -python .ci/run.py --stage test --dry-run +python .ci/run.py --job iluvatar_gpu --stage test --dry-run ``` 容器内执行流程:`git clone` → `checkout` → `setup` → stages。 代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 + +--- + +## 平台差异 + +| 平台 | GPU 透传方式 | 基础镜像 | 备注 | +|---|---|---|---| +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | +| Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | +| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善 | + +--- + +## Runner Agent `agent.py` + +Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感知的动态调度,以及跨机器远程触发。 + +### CLI 手动执行 + +```bash +# 运行所有 job(本地 + 远程 Agent) +python .ci/agent.py run --branch master + +# 运行指定 job +python .ci/agent.py run --branch master --job nvidia_gpu + +# 按平台运行 +python .ci/agent.py run --branch master --platform nvidia + +# 预览命令 +python .ci/agent.py run --branch master --dry-run --no-status +``` + +| 参数 | 说明 | +|---|---| +| `--branch` | 测试分支(必填) | +| `--job` | 指定 job 名称 | +| `--platform` | 按平台过滤 job | +| `--commit` | 覆盖 commit SHA | +| `--image-tag` | 覆盖镜像 tag | +| `--results-dir` | 结果目录(默认 `ci-results`) | +| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | +| `--no-status` | 跳过 GitHub Status 上报 | +| `--dry-run` | 预览模式 | + +### Webhook 服务 + +每台平台机器部署一个 Agent 实例: + +```bash +# NVIDIA 机器 +python .ci/agent.py serve --platform nvidia --port 8080 + +# Iluvatar 机器 +python .ci/agent.py serve --platform iluvatar --port 8080 +``` + +| 端点 | 方法 | 说明 | +|---|---|---| +| `/webhook` | POST | GitHub Webhook(push/pull_request) | +| `/api/run` | POST | 远程触发 job | +| `/api/job/{id}` | GET 
| 查询 job 状态 | +| `/health` | GET | 健康检查 | +| `/status` | GET | 队列 + 资源状态 | + +Webhook 支持 `X-Hub-Signature-256` 签名验证,通过 `--webhook-secret` 或 `WEBHOOK_SECRET` 环境变量配置。 + +### 远程 Agent 配置 + +在 `config.yaml` 中配置各平台 Agent 地址,CLI 执行时自动将远程 job 分发到对应 Agent: + +```yaml +agents: + nvidia: + url: http://nvidia-host:8080 + iluvatar: + url: http://iluvatar-host:8080 +``` + +### 资源调度 + +Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: +- GPU 利用率 < 阈值(默认 10%)且未被 Agent 分配 → 可用 +- 资源不足时 job 自动排队,已完成 job 释放资源后自动调度排队任务 + +### GitHub Status + +设置 `GITHUB_TOKEN` 环境变量后,Agent 会自动上报 commit status: +- `pending` — job 开始执行 +- `success` / `failure` — job 执行完成 + +Status context 格式:`ci/infiniops/{job_name}` diff --git a/.ci/agent.py b/.ci/agent.py new file mode 100644 index 0000000..3696ce2 --- /dev/null +++ b/.ci/agent.py @@ -0,0 +1,971 @@ +#!/usr/bin/env python3 +"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. + +Usage: + # Run jobs locally (or dispatch to remote agents) + python .ci/agent.py run --branch master + python .ci/agent.py run --branch master --job nvidia_gpu --dry-run + + # Start webhook server + python .ci/agent.py serve --platform nvidia --port 8080 +""" + +import argparse +import collections +import hashlib +import hmac +import json +import os +import shlex +import subprocess +import sys +import threading +import time +import urllib.error +import urllib.request +import uuid +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + +import ci_resource as res +import github_status as gh +import run + +# Maximum POST body size (1 MB) to prevent memory exhaustion +MAX_CONTENT_LENGTH = 1 * 1024 * 1024 + +# Job states +STATE_QUEUED = "queued" +STATE_RUNNING = "running" +STATE_PENDING = "pending" +STATE_SUCCESS = "success" +STATE_FAILURE = "failure" +STATE_ERROR = "error" + +# urllib helpers (module-level for easier mocking in tests) +urllib_request = urllib.request.Request +urllib_urlopen = urllib.request.urlopen + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +class JobRequest: + """Describes a CI job to be executed.""" + + def __init__(self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None): + self.job_id = str(uuid.uuid4())[:8] + self.job_name = job_name + self.branch = branch + self.commit_sha = commit_sha + self.config = config + self.image_tag = image_tag + self.results_dir = results_dir or Path("ci-results") + self.created_at = datetime.now().isoformat() + + job = config["jobs"][job_name] + self.platform = job.get("platform", "nvidia") + + def to_dict(self): + return { + "job_id": self.job_id, + "job_name": self.job_name, + "branch": self.branch, + "commit_sha": self.commit_sha, + "platform": self.platform, + "created_at": self.created_at, + } + + +class JobResult: + """Outcome of a completed job.""" + + def __init__(self, job_id, job_name, commit_sha, returncode, results_dir, duration): + self.job_id = job_id + self.job_name = job_name + self.commit_sha = commit_sha + self.returncode = returncode + self.results_dir = results_dir + self.duration = duration + + self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE + + def to_dict(self): + return { + "job_id": self.job_id, + "job_name": self.job_name, + "commit_sha": self.commit_sha, + "state": self.state, + 
"returncode": self.returncode, + "results_dir": str(self.results_dir), + "duration_seconds": round(self.duration, 1), + } + + +# --------------------------------------------------------------------------- +# Job selection and routing +# --------------------------------------------------------------------------- + + +def select_jobs(config, platform=None, job_name=None): + """Return list of job names to run.""" + jobs = config.get("jobs", {}) + + if job_name: + if job_name not in jobs: + raise ValueError(f"job {job_name!r} not in config") + + return [job_name] + + if platform: + return [ + name for name, job in jobs.items() if job.get("platform") == platform + ] + + return list(jobs.keys()) + + +def route_jobs(config, job_names, local_platform=None): + """Split jobs into local and remote. + + Returns (local_jobs, remote_jobs) where remote_jobs is a list of + (job_name, agent_url) tuples. + """ + agents = config.get("agents", {}) + jobs = config.get("jobs", {}) + local = [] + remote = [] + + for name in job_names: + job = jobs.get(name, {}) + platform = job.get("platform", "") + + if not local_platform: + local.append(name) + elif platform == local_platform: + local.append(name) + elif platform in agents: + remote.append((name, agents[platform].get("url", ""))) + else: + local.append(name) + + return local, remote + + +# --------------------------------------------------------------------------- +# Scheduler +# --------------------------------------------------------------------------- + + +class Scheduler: + """Resource-aware job scheduler with dynamic parallelism.""" + + def __init__( + self, + config, + platform, + resource_pool, + results_dir=None, + max_workers=4, + no_status=False, + dry_run=False, + ): + self._config = config + self._platform = platform + self._resource_pool = resource_pool + self._results_dir = results_dir or Path("ci-results") + self._no_status = no_status + self._dry_run = dry_run + self._queue = collections.deque() + self._jobs: dict[str, 
dict] = {} # job_id -> {request, result, state, gpu_ids} + self._executor = ThreadPoolExecutor(max_workers=max_workers) + self._lock = threading.Lock() + self._done_event = threading.Event() + + # GitHub config + github_cfg = config.get("github", {}) + self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops") + repo = config.get("repo", {}) + repo_url = repo.get("url", "") + self._owner, self._repo = gh.parse_repo_url(repo_url) + + def submit(self, job_request): + """Add a job to the queue and attempt to schedule it. + + Returns the job_id. + """ + with self._lock: + self._jobs[job_request.job_id] = { + "request": job_request, + "result": None, + "state": STATE_QUEUED, + "gpu_ids": [], + } + self._queue.append(job_request) + + self._try_schedule() + return job_request.job_id + + def get_job(self, job_id): + """Get job info by ID.""" + with self._lock: + entry = self._jobs.get(job_id) + + if not entry: + return None + + info = entry["request"].to_dict() + info["state"] = entry["state"] + + if entry["result"]: + info.update(entry["result"].to_dict()) + + return info + + def get_status(self): + """Return scheduler status for the /status endpoint.""" + with self._lock: + queued = [ + self._jobs[r.job_id]["request"].to_dict() + for r in self._queue + ] + running = [] + completed = [] + + for entry in self._jobs.values(): + state = entry["state"] + + if state == STATE_RUNNING: + running.append({**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]}) + elif state in (STATE_SUCCESS, STATE_FAILURE): + completed.append(entry["result"].to_dict()) + + return { + "queued": queued, + "running": running, + "completed": completed[-20:], # Last 20 + "resources": self._resource_pool.get_status(), + } + + def wait_all(self): + """Block until all submitted jobs are done. 
Returns list of JobResult.""" + while True: + with self._lock: + pending = any( + e["state"] in (STATE_QUEUED, STATE_RUNNING) for e in self._jobs.values() + ) + + if not pending: + break + + self._done_event.wait(timeout=2.0) + self._done_event.clear() + + with self._lock: + return [ + e["result"] + for e in self._jobs.values() + if e["result"] is not None + ] + + def _try_schedule(self): + """Try to run queued jobs that have enough resources. + + Resource allocation and job submission are split: allocation decisions + are made under the lock, but executor.submit() happens outside to + prevent deadlock when the thread pool is saturated. + """ + to_launch = [] # [(req, gpu_ids), ...] + + with self._lock: + remaining = collections.deque() + + while self._queue: + req = self._queue.popleft() + job_cfg = self._config["jobs"].get(req.job_name, {}) + gpu_count = res.parse_gpu_requirement(job_cfg) + memory_mb = res.parse_memory_requirement(job_cfg) + + if self._dry_run: + # In dry-run mode, skip resource checks + gpu_ids, ok = [], True + else: + gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb) + + if ok: + self._jobs[req.job_id]["state"] = STATE_RUNNING + self._jobs[req.job_id]["gpu_ids"] = gpu_ids + to_launch.append((req, gpu_ids)) + else: + remaining.append(req) + + self._queue = remaining + + # Submit outside the lock to avoid deadlock with ThreadPoolExecutor + for req, gpu_ids in to_launch: + self._executor.submit(self._run_job, req, gpu_ids) + + def _run_job(self, req, gpu_ids): + """Execute a single job in a worker thread. + + Wrapped in try/finally to guarantee GPU resources are always released + and job state is updated even on unexpected exceptions. 
+ """ + context = gh.build_status_context(self._status_prefix, req.job_name) + result = None + + try: + # Post pending status + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + STATE_PENDING, + context, + f"Running {req.job_name}...", + ) + + job_cfg = self._config["jobs"][req.job_name] + all_stages = job_cfg.get("stages", []) + repo_url = self._config.get("repo", {}).get("url", "") + commit_short = req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + results_dir = run.build_results_dir( + req.results_dir, req.platform, all_stages, commit_short + ) + + gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None + docker_args = run.build_docker_args( + self._config, + req.job_name, + repo_url, + req.branch, + all_stages, + "/workspace", + req.image_tag, + gpu_id_override=gpu_id_str, + results_dir=results_dir, + ) + + start = time.monotonic() + + if self._dry_run: + print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") + returncode = 0 + else: + results_dir.mkdir(parents=True, exist_ok=True) + proc = subprocess.run(docker_args) + returncode = proc.returncode + + duration = time.monotonic() - start + + result = JobResult( + job_id=req.job_id, + job_name=req.job_name, + commit_sha=req.commit_sha, + returncode=returncode, + results_dir=results_dir, + duration=duration, + ) + + # Post final status + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + result.state, + context, + f"{req.job_name}: {result.state} in {duration:.0f}s", + ) + except Exception as e: + print(f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr) + + if result is None: + result = JobResult( + job_id=req.job_id, + job_name=req.job_name, + commit_sha=req.commit_sha, + returncode=-1, + results_dir=req.results_dir, + duration=0, + ) + + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + STATE_ERROR, + context, + 
f"{req.job_name}: internal error", + ) + finally: + # Always release resources and update state + self._resource_pool.release(gpu_ids) + + with self._lock: + self._jobs[req.job_id]["result"] = result + self._jobs[req.job_id]["state"] = result.state if result else STATE_FAILURE + + self._done_event.set() + self._try_schedule() + + return result + + +# --------------------------------------------------------------------------- +# Webhook server +# --------------------------------------------------------------------------- + + +def verify_signature(secret, body, signature_header): + """Verify GitHub webhook HMAC-SHA256 signature.""" + if not signature_header: + return False + + expected = "sha256=" + hmac.new( + secret.encode("utf-8"), body, hashlib.sha256 + ).hexdigest() + return hmac.compare_digest(expected, signature_header) + + +def _verify_api_token(handler): + """Check Bearer token for /api/run authentication. + + Returns True if authenticated, False (and sends 401) if not. + When no api_token is configured on the server, all requests are allowed. 
+ """ + api_token = getattr(handler.server, "api_token", None) + + if not api_token: + return True + + auth_header = handler.headers.get("Authorization", "") + + if auth_header == f"Bearer {api_token}": + return True + + handler._respond_json(401, {"error": "unauthorized"}) + return False + + +class WebhookHandler(BaseHTTPRequestHandler): + """HTTP handler for GitHub webhooks and API endpoints.""" + + def log_message(self, format, *args): + print(f"[agent] {args[0]}", file=sys.stderr) + + def do_GET(self): + if self.path == "/health": + self._respond_json(200, {"status": "ok", "platform": self.server.platform}) + elif self.path == "/status": + status = self.server.scheduler.get_status() + self._respond_json(200, status) + elif self.path.startswith("/api/job/"): + self._handle_api_job() + else: + self._respond_json(404, {"error": "not found"}) + + def do_POST(self): + content_length = int(self.headers.get("Content-Length", 0)) + + if content_length > MAX_CONTENT_LENGTH: + self._respond_json(413, {"error": "payload too large"}) + return + + body = self.rfile.read(content_length) + + if self.path == "/webhook": + self._handle_webhook(body) + elif self.path == "/api/run": + self._handle_api_run(body) + else: + self._respond_json(404, {"error": "not found"}) + + def _handle_webhook(self, body): + # Verify signature if secret is configured + if self.server.webhook_secret: + sig = self.headers.get("X-Hub-Signature-256", "") + + if not verify_signature(self.server.webhook_secret, body, sig): + self._respond_json(401, {"error": "invalid signature"}) + return + + event_type = self.headers.get("X-GitHub-Event", "") + + if event_type == "ping": + self._respond_json(200, {"msg": "pong"}) + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + if event_type == "push": + branch, sha = self._parse_push(payload) + elif event_type == "pull_request": + action = payload.get("action", "") + + if 
action not in ("opened", "synchronize"): + self._respond_json(200, {"msg": f"ignored PR action: {action}"}) + return + + branch, sha = self._parse_pull_request(payload) + else: + self._respond_json(200, {"msg": f"ignored event: {event_type}"}) + return + + if not branch or not sha: + self._respond_json(400, {"error": "could not extract branch/sha"}) + return + + job_ids = self._submit_jobs(branch, sha) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_run(self, body): + """Handle /api/run: remote job trigger (requires Bearer token auth).""" + if not _verify_api_token(self): + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + branch = payload.get("branch", "") + sha = payload.get("commit_sha", "") + job_name = payload.get("job") + image_tag = payload.get("image_tag") + + if not branch: + self._respond_json(400, {"error": "branch is required"}) + return + + if not sha: + sha = run.get_git_commit() + + job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_job(self): + """Handle GET /api/job/{id}.""" + parts = self.path.split("/") + + if len(parts) < 4: + self._respond_json(400, {"error": "missing job_id"}) + return + + job_id = parts[3] + info = self.server.scheduler.get_job(job_id) + + if info is None: + self._respond_json(404, {"error": f"job {job_id} not found"}) + else: + self._respond_json(200, info) + + def _parse_push(self, payload): + branch = payload.get("ref", "").removeprefix("refs/heads/") + sha = payload.get("after", "") + return branch, sha + + def _parse_pull_request(self, payload): + pr = payload.get("pull_request", {}) + head = pr.get("head", {}) + branch = head.get("ref", "") + sha = head.get("sha", "") + return branch, sha + + def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): + config = 
self.server.config + job_names = select_jobs(config, platform=self.server.platform, job_name=job_name) + job_ids = [] + + for name in job_names: + req = JobRequest( + job_name=name, + branch=branch, + commit_sha=sha, + config=config, + image_tag=image_tag, + results_dir=self.server.results_dir, + ) + jid = self.server.scheduler.submit(req) + job_ids.append(jid) + + return job_ids + + def _respond_json(self, status_code, data): + body = json.dumps(data, indent=2).encode("utf-8") + self.send_response(status_code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class AgentServer(HTTPServer): + """HTTP server with scheduler and config context.""" + + def __init__( + self, + host, + port, + config, + scheduler, + platform, + webhook_secret=None, + api_token=None, + results_dir=None, + ): + super().__init__((host, port), WebhookHandler) + self.config = config + self.scheduler = scheduler + self.platform = platform + self.webhook_secret = webhook_secret + self.api_token = api_token + self.results_dir = results_dir or Path("ci-results") + + +# --------------------------------------------------------------------------- +# Remote job dispatch (for CLI triggering remote agents) +# --------------------------------------------------------------------------- + + +def dispatch_remote_job(agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None): + """Send a job to a remote agent via HTTP API. 
Returns job_id or None.""" + url = f"{agent_url.rstrip('/')}/api/run" + body = { + "branch": branch, + "commit_sha": commit_sha, + "job": job_name, + } + + if image_tag: + body["image_tag"] = image_tag + + data = json.dumps(body).encode("utf-8") + headers = {"Content-Type": "application/json"} + + if api_token: + headers["Authorization"] = f"Bearer {api_token}" + + req = urllib_request(url, data=data, headers=headers, method="POST") + + try: + with urllib_urlopen(req, timeout=30) as resp: + result = json.loads(resp.read()) + job_ids = result.get("job_ids", []) + return job_ids[0] if job_ids else None + except Exception as e: + print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr) + return None + + +def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200): + """Poll a remote agent for job completion. Returns final state dict or None.""" + url = f"{agent_url.rstrip('/')}/api/job/{job_id}" + deadline = time.monotonic() + timeout + + while time.monotonic() < deadline: + try: + req = urllib_request(url) + + with urllib_urlopen(req, timeout=10) as resp: + info = json.loads(resp.read()) + + state = info.get("state", "") + + if state in (STATE_SUCCESS, STATE_FAILURE): + return info + except Exception: + pass + + time.sleep(interval) + + return None + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def cmd_run(args): + """Handle 'run' subcommand: execute jobs locally and/or remotely.""" + config = run.load_config(args.config) + commit_sha = args.commit or run.get_git_commit(short=False) + + # Determine which jobs to run + try: + job_names = select_jobs(config, platform=args.platform, job_name=args.job) + except ValueError as e: + print(f"error: {e}", file=sys.stderr) + sys.exit(1) + + if not job_names: + print("error: no matching jobs found", file=sys.stderr) + sys.exit(1) + + # Detect local platform (if running serve on 
this machine, use that; otherwise guess) + local_platform = args.platform + local_jobs, remote_jobs = route_jobs(config, job_names, local_platform) + + # Run local jobs + local_results = [] + + if local_jobs: + pool = res.ResourcePool( + local_platform or "unknown", + utilization_threshold=args.utilization_threshold, + ) + scheduler = Scheduler( + config, + local_platform or "unknown", + pool, + results_dir=args.results_dir, + no_status=args.no_status, + dry_run=args.dry_run, + ) + + for name in local_jobs: + req = JobRequest( + job_name=name, + branch=args.branch, + commit_sha=commit_sha, + config=config, + image_tag=args.image_tag, + results_dir=args.results_dir, + ) + scheduler.submit(req) + + local_results = scheduler.wait_all() + + # Dispatch remote jobs + remote_results = [] + api_token = os.environ.get("AGENT_API_TOKEN", "") + + if remote_jobs and not args.dry_run: + # Dispatch all remote jobs first, then poll concurrently + dispatched = [] # [(name, agent_url, job_id)] + + for name, agent_url in remote_jobs: + if not agent_url: + print(f"warning: no agent URL for {name}, skipping", file=sys.stderr) + remote_results.append({"job_name": name, "state": "error"}) + continue + + print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) + job_id = dispatch_remote_job( + agent_url, name, args.branch, commit_sha, args.image_tag, + api_token=api_token or None, + ) + + if job_id: + print(f" job_id: {job_id}", file=sys.stderr) + dispatched.append((name, agent_url, job_id)) + else: + print(f" failed to dispatch {name}", file=sys.stderr) + remote_results.append({"job_name": name, "state": "error"}) + + # Poll all dispatched jobs concurrently + if dispatched: + with ThreadPoolExecutor(max_workers=len(dispatched)) as executor: + futures = { + executor.submit(poll_remote_job, url, jid): (name, url, jid) + for name, url, jid in dispatched + } + + for future in futures: + name, _, _ = futures[future] + result = future.result() + + if result: + 
remote_results.append(result) + else: + print(f" timeout waiting for {name}", file=sys.stderr) + remote_results.append({"job_name": name, "state": "timeout"}) + + elif remote_jobs and args.dry_run: + for name, agent_url in remote_jobs: + print(f"[dry-run] dispatch {name} to {agent_url}") + + # Summary + print("\n========== Results ==========") + all_ok = True + + for r in local_results: + status = "PASS" if r.returncode == 0 else "FAIL" + + if r.returncode != 0: + all_ok = False + + print(f" {status} {r.job_name} ({r.duration:.0f}s) {r.results_dir}") + + for r in remote_results: + state = r.get("state", "unknown") + name = r.get("job_name", "?") + status = "PASS" if state == STATE_SUCCESS else "FAIL" + + if state != STATE_SUCCESS: + all_ok = False + + duration = r.get("duration_seconds", 0) + print(f" {status} {name} ({duration:.0f}s) [remote]") + + if not all_ok: + sys.exit(1) + + +def cmd_serve(args): + """Handle 'serve' subcommand: start webhook server.""" + config = run.load_config(args.config) + + pool = res.ResourcePool( + args.platform, + utilization_threshold=args.utilization_threshold, + ) + scheduler = Scheduler( + config, + args.platform, + pool, + results_dir=args.results_dir, + ) + + webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "") + api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "") + + if not webhook_secret: + print( + "WARNING: No webhook secret configured. Webhook endpoint accepts " + "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.", + file=sys.stderr, + ) + + if not api_token: + print( + "WARNING: No API token configured. /api/run endpoint is unauthenticated. 
" + "Set --api-token or AGENT_API_TOKEN for production.", + file=sys.stderr, + ) + + server = AgentServer( + args.host, + args.port, + config, + scheduler, + args.platform, + webhook_secret=webhook_secret or None, + api_token=api_token or None, + results_dir=args.results_dir, + ) + + print( + f"Agent serving on {args.host}:{args.port} (platform={args.platform})", + file=sys.stderr, + ) + print(f" POST /webhook — GitHub webhook", file=sys.stderr) + print(f" POST /api/run — remote job trigger", file=sys.stderr) + print(f" GET /health — health check", file=sys.stderr) + print(f" GET /status — queue & resource status", file=sys.stderr) + print(f" GET /api/job/{{id}} — job status", file=sys.stderr) + + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down...", file=sys.stderr) + server.shutdown() + + +def main(): + parser = argparse.ArgumentParser( + description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", + ) + subparsers = parser.add_subparsers(dest="command") + + # --- run subcommand --- + run_parser = subparsers.add_parser("run", help="Run CI jobs") + run_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + run_parser.add_argument("--branch", type=str, required=True, help="Branch to test") + run_parser.add_argument("--job", type=str, help="Specific job name") + run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") + run_parser.add_argument("--image-tag", type=str, help="Override image tag") + run_parser.add_argument("--commit", type=str, help="Override commit SHA") + run_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + run_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + help="GPU utilization threshold (%%) to consider free (default: 10)", + ) + run_parser.add_argument("--no-status", action="store_true", help="Skip GitHub status") + 
run_parser.add_argument("--dry-run", action="store_true") + + # --- serve subcommand --- + serve_parser = subparsers.add_parser("serve", help="Start webhook server") + serve_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + serve_parser.add_argument( + "--platform", + type=str, + required=True, + help="Platform this agent handles (nvidia, iluvatar, etc.)", + ) + serve_parser.add_argument("--port", type=int, default=8080) + serve_parser.add_argument("--host", type=str, default="0.0.0.0") + serve_parser.add_argument("--webhook-secret", type=str) + serve_parser.add_argument( + "--api-token", + type=str, + help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", + ) + serve_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + serve_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + ) + + args = parser.parse_args() + + if args.command == "run": + cmd_run(args) + elif args.command == "serve": + cmd_serve(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/build.py b/.ci/build.py index 2339319..7953209 100644 --- a/.ci/build.py +++ b/.ci/build.py @@ -9,32 +9,7 @@ import sys from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def load_config(path): - with open(path, encoding="utf-8") as f: - return yaml.safe_load(f) - - -def get_git_commit(ref="HEAD"): - result = subprocess.run( - ["git", "rev-parse", "--short", ref], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) - sys.exit(1) - - return result.stdout.strip() +from utils import get_git_commit, load_config def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py new file mode 100644 index 0000000..f3dbfb1 --- /dev/null +++ b/.ci/ci_resource.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +"""Resource detection and allocation for CI Runner Agent.""" + +import os +import subprocess +import threading +from dataclasses import dataclass, field + +# GPU passthrough styles +GPU_STYLE_NVIDIA = "nvidia" +GPU_STYLE_NONE = "none" + + +@dataclass +class GpuInfo: + index: int + memory_used_mb: float + memory_total_mb: float + utilization_pct: float + + +@dataclass +class SystemResources: + total_memory_mb: float + available_memory_mb: float + cpu_count: int + + +class ResourcePool: + """Thread-safe GPU and system resource manager. + + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi) + and tracks allocations to enable dynamic parallel scheduling. 
+ """ + + GPU_QUERY_TOOLS = { + "nvidia": "nvidia-smi", + "iluvatar": "ixsmi", + } + + def __init__(self, platform, utilization_threshold=10): + self._platform = platform + self._utilization_threshold = utilization_threshold + self._allocated: set[int] = set() + self._lock = threading.Lock() + + @property + def platform(self): + return self._platform + + @property + def allocated(self): + with self._lock: + return set(self._allocated) + + def detect_gpus(self) -> list[GpuInfo]: + """Query GPU status via platform-specific CLI tool.""" + tool = self.GPU_QUERY_TOOLS.get(self._platform) + + if not tool: + return [] + + try: + result = subprocess.run( + [ + tool, + "--query-gpu=index,memory.used,memory.total,utilization.gpu", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + + if len(parts) < 4: + continue + + try: + gpus.append( + GpuInfo( + index=int(parts[0]), + memory_used_mb=float(parts[1]), + memory_total_mb=float(parts[2]), + utilization_pct=float(parts[3]), + ) + ) + except (ValueError, IndexError): + continue + + return gpus + + def detect_system_resources(self) -> SystemResources: + """Read system memory from /proc/meminfo and CPU count.""" + total_mb = 0.0 + available_mb = 0.0 + + try: + with open("/proc/meminfo", encoding="utf-8") as f: + for line in f: + if line.startswith("MemTotal:"): + total_mb = float(line.split()[1]) / 1024 + elif line.startswith("MemAvailable:"): + available_mb = float(line.split()[1]) / 1024 + except OSError: + pass + + return SystemResources( + total_memory_mb=total_mb, + available_memory_mb=available_mb, + cpu_count=os.cpu_count() or 1, + ) + + def get_free_gpus(self) -> list[int]: + """Return GPU indices with utilization below threshold.""" + gpus = 
self.detect_gpus() + return [ + g.index + for g in gpus + if g.utilization_pct < self._utilization_threshold + ] + + def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: + """Try to allocate GPUs and check memory. + + Returns (allocated_gpu_ids, success). On failure returns ([], False). + GPU detection and memory checks run outside the lock to avoid blocking + other threads while subprocess.run (nvidia-smi) executes. + """ + if gpu_count <= 0: + if memory_mb > 0: + sys_res = self.detect_system_resources() + + if sys_res.available_memory_mb < memory_mb: + return ([], False) + + return ([], True) + + # Detect GPUs and memory outside the lock (subprocess.run can block) + free_gpus = set(self.get_free_gpus()) + sys_res = self.detect_system_resources() if memory_mb > 0 else None + + with self._lock: + available = free_gpus - self._allocated + + if len(available) < gpu_count: + return ([], False) + + if sys_res is not None and sys_res.available_memory_mb < memory_mb: + return ([], False) + + selected = sorted(available)[:gpu_count] + self._allocated.update(selected) + return (selected, True) + + def release(self, gpu_ids): + """Return GPUs to the free pool.""" + with self._lock: + self._allocated -= set(gpu_ids) + + def get_status(self) -> dict: + """Return current resource status for API endpoints.""" + gpus = self.detect_gpus() + sys_res = self.detect_system_resources() + + with self._lock: + allocated = sorted(self._allocated) + + return { + "platform": self._platform, + "gpus": [ + { + "index": g.index, + "memory_used_mb": g.memory_used_mb, + "memory_total_mb": g.memory_total_mb, + "utilization_pct": g.utilization_pct, + "allocated_by_agent": g.index in allocated, + } + for g in gpus + ], + "allocated_gpu_ids": allocated, + "system": { + "total_memory_mb": round(sys_res.total_memory_mb, 1), + "available_memory_mb": round(sys_res.available_memory_mb, 1), + "cpu_count": sys_res.cpu_count, + }, + "utilization_threshold": self._utilization_threshold, + } 
+ + +def parse_gpu_requirement(job_config) -> int: + """Extract GPU count requirement from a job config.""" + resources = job_config.get("resources", {}) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NONE: + return 0 + + gpu_ids = str(resources.get("gpu_ids", "")) + + if not gpu_ids: + return resources.get("gpu_count", 0) + + if gpu_ids == "all": + return 0 # "all" means use all available, don't reserve specific count + + return len(gpu_ids.split(",")) + + +def parse_memory_requirement(job_config) -> float: + """Extract memory requirement in MB from a job config.""" + resources = job_config.get("resources", {}) + memory = str(resources.get("memory", "")) + + if not memory: + return 0 + + memory = memory.lower().strip() + + if memory.endswith("gb"): + return float(memory[:-2]) * 1024 + elif memory.endswith("g"): + return float(memory[:-1]) * 1024 + elif memory.endswith("mb"): + return float(memory[:-2]) + elif memory.endswith("m"): + return float(memory[:-1]) + + try: + return float(memory) * 1024 # Default: GB + except ValueError: + return 0 diff --git a/.ci/config.yaml b/.ci/config.yaml index a86174a..e62bc07 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -2,32 +2,69 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -images: - nvidia: - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - ascend: # TODO: Ascend image is not ready yet - dockerfile: .ci/images/ascend/ - build_args: - BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 - private_sdk: - source_env: PRIVATE_SDK_URL +github: + status_context_prefix: "ci/infiniops" # GitHub Commit Status context 前缀 + +# agents: # 远程 Agent 地址(CLI 跨机器触发用) +# nvidia: +# url: http://nvidia-host:8080 +# iluvatar: +# url: http://iluvatar-host:8080 -jobs: - nvidia_gpu: - image: latest - platform: nvidia - resources: - gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" - memory: 32GB - shm_size: 16g # 
避免 PyTorch 默认 64MB SHMEM 不足 - timeout: 3600 +platforms: + nvidia: + image: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 + timeout: 3600 + # env: # 可选,注入容器环境变量 + # MY_VAR: value + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - setup: pip install .[dev] - # env: # 可选,注入容器环境变量 - # MY_VAR: value + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # 通过 CUDA_VISIBLE_DEVICES 控制可见 GPU + gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载透传 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet + image: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source_env: PRIVATE_SDK_URL diff --git a/.ci/github_status.py b/.ci/github_status.py new file mode 100644 index 0000000..a7abb8f --- /dev/null +++ b/.ci/github_status.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""GitHub Commit Status API wrapper using urllib (zero external dependencies).""" + +import json +import os +import re +import sys +import urllib.error +import 
urllib.request + + +def parse_repo_url(url): + """Extract (owner, repo) from a GitHub URL. + + Handles: + - https://github.com/Owner/Repo.git + - git@github.com:Owner/Repo.git + """ + # HTTPS format + m = re.match(r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$", url) + + if m: + return m.group(1), m.group(2) + + # SSH format + m = re.match(r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$", url) + + if m: + return m.group(1), m.group(2) + + return "", "" + + +def build_status_context(prefix, job_name): + """Build status context string, e.g. 'ci/infiniops/nvidia_gpu'.""" + return f"{prefix}/{job_name}" + + +def post_commit_status( + owner, + repo, + sha, + state, + context, + description, + target_url=None, + token=None, +): + """Post a commit status to GitHub. + + Args: + state: One of 'pending', 'success', 'failure', 'error'. + Returns True on success, False on failure. + """ + token = token or os.environ.get("GITHUB_TOKEN", "") + + if not token: + print("warning: GITHUB_TOKEN not set, skipping status update", file=sys.stderr) + return False + + if not owner or not repo or not sha: + print("warning: missing owner/repo/sha, skipping status update", file=sys.stderr) + return False + + url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" + body = { + "state": state, + "context": context, + "description": description[:140], + } + + if target_url: + body["target_url"] = target_url + + data = json.dumps(body).encode("utf-8") + req = urllib.request.Request( + url, + data=data, + headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + "Content-Type": "application/json", + }, + method="POST", + ) + + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return 200 <= resp.status < 300 + except urllib.error.HTTPError as e: + print( + f"warning: GitHub status API returned {e.code}: {e.reason}", + file=sys.stderr, + ) + return False + except urllib.error.URLError as e: + print(f"warning: GitHub status API error: {e.reason}", 
file=sys.stderr) + return False diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile new file mode 100644 index 0000000..f098e5f --- /dev/null +++ b/.ci/images/iluvatar/Dockerfile @@ -0,0 +1,53 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# CoreX runtime environment (base image sets these in /etc/bash.bashrc, +# but docker build RUN uses /bin/sh which doesn't source it) +ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages +ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + ninja-build \ + coreutils \ + && rm -rf /var/lib/apt/lists/* + +RUN ln -sf $(which python3) /usr/local/bin/python 2>/dev/null || true + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +RUN pip config set global.index-url https://pypi.org/simple + +# Pin pre-installed CoreX torch to prevent pip from replacing it with upstream version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index 74ccfd1..05da963 
100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -10,7 +10,11 @@ ARG http_proxy ARG https_proxy ARG no_proxy -RUN apt-get update && \ +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ @@ -19,13 +23,24 @@ RUN apt-get update && \ libclang-dev \ && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir \ + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ scikit-build-core \ pybind11 \ libclang \ pytest \ pytest-cov \ pytest-xdist \ - pyyaml + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed torch to prevent pip from replacing it with a different version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py index 0c8d648..2575781 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,31 +9,8 @@ from datetime import datetime from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def load_config(path): - with open(path, encoding="utf-8") as f: - return yaml.safe_load(f) - - -def get_git_commit(ref="HEAD"): - result = subprocess.run( - ["git", "rev-parse", "--short", ref], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - return "unknown" - - return result.stdout.strip() +from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE +from utils import get_git_commit, load_config def build_results_dir(base, platform, stages, commit): @@ -155,16 +132,29 @@ def build_docker_args( args.append("-e") args.append(f"STAGE_{i + 1}_CMD={s['run']}") + # Platform-specific device access + for flag in job.get("docker_args", []): + args.append(flag) + + for vol in job.get("volumes", []): + args.extend(["-v", vol]) + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) - - if gpu_id: - if gpu_id == "all": - args.extend(["--gpus", "all"]) - else: - args.extend(["--gpus", f'"device={gpu_id}"']) - elif gpu_count and gpu_count > 0: - args.extend(["--gpus", f"count={gpu_count}"]) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NVIDIA: + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + elif gpu_style == GPU_STYLE_NONE and gpu_id and gpu_id != "all": + # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, + # control visible GPUs via CUDA_VISIBLE_DEVICES. 
+ args.extend(["-e", f"CUDA_VISIBLE_DEVICES={gpu_id}"]) memory = resources.get("memory") diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py index 98079cd..38ed716 100644 --- a/.ci/tests/conftest.py +++ b/.ci/tests/conftest.py @@ -6,37 +6,41 @@ import pytest +from utils import normalize_config + @pytest.fixture def minimal_config(): - return { + """Minimal platform-centric config, normalized to flat format.""" + raw = { "repo": { "url": "https://github.com/InfiniTensor/InfiniOps.git", "branch": "master", }, - "images": { + "platforms": { "nvidia": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - } - }, - "jobs": { - "nvidia_gpu": { - "image": "latest", - "platform": "nvidia", - "resources": { - "gpu_ids": "0", - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, }, "setup": "pip install .[dev]", - "stages": [ - { - "name": "test", - "run": "pytest tests/ -v", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], } - ], + }, } }, } + return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py new file mode 100644 index 0000000..5741385 --- /dev/null +++ b/.ci/tests/test_agent.py @@ -0,0 +1,503 @@ +import hashlib +import hmac +import json +import threading +import time +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +import agent +import ci_resource as res +from utils import normalize_config + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def agent_config(): + raw = { + "repo": { + "url": 
"https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "github": { + "status_context_prefix": "ci/infiniops", + }, + "agents": { + "nvidia": {"url": "http://nvidia-host:8080"}, + "iluvatar": {"url": "http://iluvatar-host:8080"}, + }, + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + "iluvatar": { + "image": { + "dockerfile": ".ci/images/iluvatar/", + "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "gpu_style": "none", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + }, + } + return normalize_config(raw) + + +@pytest.fixture +def mock_resource_pool(): + pool = MagicMock(spec=res.ResourcePool) + pool.platform = "nvidia" + pool.allocate.return_value = ([0], True) + pool.release.return_value = None + pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + return pool + + +# --------------------------------------------------------------------------- +# select_jobs +# --------------------------------------------------------------------------- + + +def test_select_jobs_by_name(agent_config): + jobs = agent.select_jobs(agent_config, job_name="nvidia_gpu") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform(agent_config): + jobs = agent.select_jobs(agent_config, platform="nvidia") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform_iluvatar(agent_config): + jobs = agent.select_jobs(agent_config, platform="iluvatar") + assert jobs == 
["iluvatar_gpu"] + + +def test_select_jobs_all(agent_config): + jobs = agent.select_jobs(agent_config) + assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} + + +def test_select_jobs_invalid_name(agent_config): + with pytest.raises(ValueError, match="not_exist"): + agent.select_jobs(agent_config, job_name="not_exist") + + +# --------------------------------------------------------------------------- +# route_jobs +# --------------------------------------------------------------------------- + + +def test_route_jobs_local(agent_config): + local, remote = agent.route_jobs(agent_config, ["nvidia_gpu"], local_platform="nvidia") + assert local == ["nvidia_gpu"] + assert remote == [] + + +def test_route_jobs_remote(agent_config): + local, remote = agent.route_jobs(agent_config, ["iluvatar_gpu"], local_platform="nvidia") + assert local == [] + assert len(remote) == 1 + assert remote[0][0] == "iluvatar_gpu" + assert remote[0][1] == "http://iluvatar-host:8080" + + +def test_route_jobs_mixed(agent_config): + local, remote = agent.route_jobs( + agent_config, ["nvidia_gpu", "iluvatar_gpu"], local_platform="nvidia" + ) + assert local == ["nvidia_gpu"] + assert len(remote) == 1 + + +def test_route_jobs_no_platform(agent_config): + local, remote = agent.route_jobs(agent_config, ["nvidia_gpu", "iluvatar_gpu"]) + assert len(local) == 2 + assert remote == [] + + +# --------------------------------------------------------------------------- +# verify_signature +# --------------------------------------------------------------------------- + + +def test_verify_signature_valid(): + secret = "my-secret" + body = b'{"action": "push"}' + sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + assert agent.verify_signature(secret, body, sig) is True + + +def test_verify_signature_invalid(): + assert agent.verify_signature("secret", b"body", "sha256=wrong") is False + + +def test_verify_signature_empty(): + assert agent.verify_signature("secret", b"body", "") is False 
+ + +# --------------------------------------------------------------------------- +# JobRequest / JobResult +# --------------------------------------------------------------------------- + + +def test_job_request_fields(agent_config): + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + assert req.job_name == "nvidia_gpu" + assert req.platform == "nvidia" + assert req.commit_sha == "abc123" + assert len(req.job_id) == 8 + d = req.to_dict() + assert d["job_name"] == "nvidia_gpu" + + +def test_job_result_success(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) + assert r.state == "success" + + +def test_job_result_failure(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) + assert r.state == "failure" + + +# --------------------------------------------------------------------------- +# Scheduler +# --------------------------------------------------------------------------- + + +def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + results_dir=Path("/tmp/test-results"), + no_status=True, dry_run=True, + ) + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config, + results_dir=Path("/tmp/test-results")) + jid = scheduler.submit(req) + results = scheduler.wait_all() + assert len(results) == 1 + assert results[0].state == "success" + + +def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): + pool = MagicMock(spec=res.ResourcePool) + pool.allocate.return_value = ([], False) + pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + + scheduler = agent.Scheduler( + agent_config, "nvidia", pool, + no_status=True, dry_run=False, + ) + + req = 
agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + scheduler.submit(req) + + info = scheduler.get_job(req.job_id) + assert info["state"] == "queued" + + +def test_scheduler_get_status(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + + status = scheduler.get_status() + assert "queued" in status + assert "running" in status + assert "completed" in status + assert "resources" in status + + +# --------------------------------------------------------------------------- +# WebhookHandler — push event parsing +# --------------------------------------------------------------------------- + + +def test_webhook_parse_push(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} + branch, sha = handler._parse_push(payload) + assert branch == "feat/test" + assert sha == "abc123def456" + + +def test_webhook_parse_pr(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = { + "pull_request": { + "head": { + "ref": "feat/pr-branch", + "sha": "def789", + } + } + } + branch, sha = handler._parse_pull_request(payload) + assert branch == "feat/pr-branch" + assert sha == "def789" + + +# --------------------------------------------------------------------------- +# Integration-style: webhook HTTP test +# --------------------------------------------------------------------------- + + +def _urlopen_no_proxy(url_or_req, **kwargs): + """urlopen that bypasses any HTTP_PROXY.""" + import urllib.request + + opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) + return opener.open(url_or_req, **kwargs) + + +def test_health_endpoint(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + ) + port = 
server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + try: + resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) + data = json.loads(resp.read()) + assert data["status"] == "ok" + assert data["platform"] == "nvidia" + finally: + server.server_close() + + +def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + assert len(data["job_ids"]) >= 1 + finally: + server.server_close() + + +def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + secret = "test-secret" + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + webhook_secret=secret, + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + payload = json.dumps({ + "ref": "refs/heads/master", + "after": "abc123def456", + }).encode() + sig = "sha256=" + 
hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() + + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": sig, + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() + + +def test_webhook_invalid_signature(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + webhook_secret="real-secret", + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + payload = b'{"ref": "refs/heads/master", "after": "abc"}' + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": "sha256=invalid", + }, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +# --------------------------------------------------------------------------- +# API token authentication +# --------------------------------------------------------------------------- + + +def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): + """When api_token is set, /api/run rejects requests without valid token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + api_token="my-secret-token", + 
results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): + """When api_token is set, /api/run accepts requests with correct Bearer token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + api_token="my-secret-token", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer my-secret-token", + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py new file mode 100644 index 0000000..0efa36e --- /dev/null +++ b/.ci/tests/test_github_status.py @@ -0,0 +1,144 @@ +import json +from unittest.mock import MagicMock, patch + +import pytest + +import github_status as gh + + +# 
--------------------------------------------------------------------------- +# parse_repo_url +# --------------------------------------------------------------------------- + + +def test_parse_repo_url_https(): + owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") + assert owner == "InfiniTensor" + assert repo == "InfiniOps" + + +def test_parse_repo_url_https_no_git(): + owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_ssh(): + owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_invalid(): + owner, repo = gh.parse_repo_url("not-a-url") + assert owner == "" + assert repo == "" + + +# --------------------------------------------------------------------------- +# build_status_context +# --------------------------------------------------------------------------- + + +def test_build_status_context(): + ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") + assert ctx == "ci/infiniops/nvidia_gpu" + + +# --------------------------------------------------------------------------- +# post_commit_status +# --------------------------------------------------------------------------- + + +def test_post_status_no_token(monkeypatch): + monkeypatch.delenv("GITHUB_TOKEN", raising=False) + result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") + assert result is False + + +def test_post_status_missing_owner(): + result = gh.post_commit_status("", "repo", "abc123", "success", "ctx", "desc", token="tok") + assert result is False + + +def test_post_status_success(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured_req = {} + + def mock_urlopen(req, **kwargs): + captured_req["url"] = 
req.full_url + captured_req["data"] = json.loads(req.data) + captured_req["headers"] = dict(req.headers) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "InfiniTensor", + "InfiniOps", + "abc123def", + "success", + "ci/infiniops/nvidia_gpu", + "Tests passed", + token="ghp_test_token", + ) + + assert result is True + assert "abc123def" in captured_req["url"] + assert captured_req["data"]["state"] == "success" + assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" + assert "ghp_test_token" in captured_req["headers"]["Authorization"] + + +def test_post_status_http_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.HTTPError( + url="", code=422, msg="Unprocessable", hdrs=None, fp=None + ) + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_url_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.URLError("connection refused") + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_truncates_description(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured = {} + + def mock_urlopen(req, **kwargs): + captured["data"] = json.loads(req.data) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + long_desc = "x" * 200 + gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") + + assert len(captured["data"]["description"]) == 140 diff --git 
a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py new file mode 100644 index 0000000..b75043c --- /dev/null +++ b/.ci/tests/test_resource.py @@ -0,0 +1,324 @@ +import threading + +import pytest + +import ci_resource as res + + +# --------------------------------------------------------------------------- +# GpuInfo / SystemResources +# --------------------------------------------------------------------------- + + +def test_gpu_info_fields(): + g = res.GpuInfo(index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50) + assert g.index == 0 + assert g.memory_total_mb == 8000 + + +def test_system_resources_fields(): + s = res.SystemResources(total_memory_mb=32000, available_memory_mb=16000, cpu_count=8) + assert s.cpu_count == 8 + + +# --------------------------------------------------------------------------- +# detect_gpus +# --------------------------------------------------------------------------- + + +def test_detect_gpus_nvidia_parses_csv(monkeypatch): + csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + gpus = pool.detect_gpus() + assert len(gpus) == 2 + assert gpus[0].index == 0 + assert gpus[0].memory_used_mb == 512 + assert gpus[0].utilization_pct == 5 + assert gpus[1].index == 1 + assert gpus[1].utilization_pct == 80 + + +def test_detect_gpus_empty_on_failure(monkeypatch): + def mock_run(cmd, **kwargs): + class R: + returncode = 1 + stdout = "" + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_unknown_platform(): + pool = res.ResourcePool("unknown_platform") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_file_not_found(monkeypatch): + def mock_run(cmd, **kwargs): + raise FileNotFoundError("nvidia-smi not found") + + 
monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +# --------------------------------------------------------------------------- +# detect_system_resources +# --------------------------------------------------------------------------- + + +def test_detect_system_resources(monkeypatch, tmp_path): + meminfo = tmp_path / "meminfo" + meminfo.write_text( + "MemTotal: 32000000 kB\n" + "MemFree: 10000000 kB\n" + "MemAvailable: 20000000 kB\n" + ) + + import io + _real_open = open + + def fake_open(path, **kw): + if str(path) == "/proc/meminfo": + return _real_open(str(meminfo), **kw) + return _real_open(path, **kw) + + monkeypatch.setattr("builtins.open", fake_open) + + pool = res.ResourcePool("nvidia") + sys_res = pool.detect_system_resources() + assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 + assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 + assert sys_res.cpu_count > 0 + + +# --------------------------------------------------------------------------- +# get_free_gpus +# --------------------------------------------------------------------------- + + +def test_get_free_gpus_filters_by_utilization(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + free = pool.get_free_gpus() + assert 0 in free + assert 2 in free + assert 1 not in free + + +# --------------------------------------------------------------------------- +# allocate / release +# --------------------------------------------------------------------------- + + +def test_allocate_success(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + 
monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(1) + assert ok is True + assert len(gpu_ids) == 1 + assert gpu_ids[0] in (0, 1) + + +def test_allocate_insufficient_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(3) + assert ok is False + assert gpu_ids == [] + + +def test_allocate_zero_gpus(): + pool = res.ResourcePool("unknown") + gpu_ids, ok = pool.allocate(0) + assert ok is True + assert gpu_ids == [] + + +def test_release_frees_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(2) + assert ok is True + assert len(gpu_ids) == 2 + + # All GPUs allocated, next allocation should fail + _, ok2 = pool.allocate(1) + assert ok2 is False + + # Release one + pool.release([gpu_ids[0]]) + gpu_ids2, ok3 = pool.allocate(1) + assert ok3 is True + assert gpu_ids2 == [gpu_ids[0]] + + +def test_allocate_excludes_allocated(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids1, _ = pool.allocate(1) + gpu_ids2, _ = pool.allocate(1) + + assert gpu_ids1 != gpu_ids2 + assert set(gpu_ids1 + gpu_ids2) == {0, 1} + + +def test_thread_safety(monkeypatch): + csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" + + def mock_run(cmd, 
**kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=50) + allocated_all = [] + lock = threading.Lock() + + def allocate_one(): + ids, ok = pool.allocate(1) + + if ok: + with lock: + allocated_all.extend(ids) + + threads = [threading.Thread(target=allocate_one) for _ in range(4)] + + for t in threads: + t.start() + + for t in threads: + t.join() + + assert len(allocated_all) == 4 + assert len(set(allocated_all)) == 4 + + +# --------------------------------------------------------------------------- +# get_status +# --------------------------------------------------------------------------- + + +def test_get_status(monkeypatch): + csv_output = "0, 512, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + status = pool.get_status() + assert status["platform"] == "nvidia" + assert len(status["gpus"]) == 1 + assert "system" in status + + +# --------------------------------------------------------------------------- +# parse_gpu_requirement / parse_memory_requirement +# --------------------------------------------------------------------------- + + +def test_parse_gpu_requirement_nvidia(): + job = {"resources": {"gpu_ids": "0,1", "gpu_style": "nvidia"}} + assert res.parse_gpu_requirement(job) == 2 + + +def test_parse_gpu_requirement_none(): + job = {"resources": {"gpu_style": "none"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_all(): + job = {"resources": {"gpu_ids": "all"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_default(): + job = {"resources": {"gpu_ids": "0"}} + assert res.parse_gpu_requirement(job) == 1 + + +def test_parse_memory_requirement_gb(): + assert res.parse_memory_requirement({"resources": {"memory": 
"32GB"}}) == 32 * 1024 + + +def test_parse_memory_requirement_mb(): + assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 + + +def test_parse_memory_requirement_empty(): + assert res.parse_memory_requirement({"resources": {}}) == 0 diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py new file mode 100644 index 0000000..2a930d3 --- /dev/null +++ b/.ci/tests/test_utils.py @@ -0,0 +1,90 @@ +from utils import normalize_config + + +def test_normalize_creates_flat_jobs(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "platforms": { + "nvidia": { + "image": {"dockerfile": ".ci/images/nvidia/"}, + "setup": "pip install .", + "docker_args": ["--gpus", "all"], + "jobs": { + "gpu": { + "resources": {"gpu_ids": "0"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + "multi_gpu": { + "resources": {"gpu_ids": "0,1"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + }, + }, + }, + } + config = normalize_config(raw) + + assert "nvidia_gpu" in config["jobs"] + assert "nvidia_multi_gpu" in config["jobs"] + assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" + assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." 
+ assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] + assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" + assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" + + +def test_normalize_extracts_images(): + raw = { + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "pytorch:latest"}, + }, + "jobs": {}, + }, + }, + } + config = normalize_config(raw) + assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" + assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" + + +def test_normalize_job_overrides_platform_defaults(): + raw = { + "platforms": { + "nvidia": { + "setup": "default setup", + "jobs": { + "special": { + "setup": "custom setup", + "stages": [], + }, + }, + }, + }, + } + config = normalize_config(raw) + assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" + + +def test_normalize_preserves_top_level_keys(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "github": {"status_context_prefix": "ci/test"}, + "agents": {"nvidia": {"url": "http://host:8080"}}, + "platforms": {}, + } + config = normalize_config(raw) + assert config["repo"]["url"] == "https://github.com/org/repo.git" + assert config["github"]["status_context_prefix"] == "ci/test" + assert config["agents"]["nvidia"]["url"] == "http://host:8080" + + +def test_normalize_passthrough_flat_config(): + """Old flat format without 'platforms' key is returned as-is.""" + flat = { + "images": {"nvidia": {}}, + "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, + } + assert normalize_config(flat) is flat diff --git a/.ci/utils.py b/.ci/utils.py new file mode 100644 index 0000000..7932ba6 --- /dev/null +++ b/.ci/utils.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Shared utilities for the CI toolchain.""" + +import subprocess +import sys + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def normalize_config(raw): + """Convert platform-centric config to flat images/jobs format. + + Input (new format): + platforms: + nvidia: + image: {dockerfile: ..., build_args: ...} + setup: pip install .[dev] + jobs: + gpu: {resources: ..., stages: ...} + + Output (flat format consumed by run.py / build.py / agent.py): + images: + nvidia: {dockerfile: ..., build_args: ...} + jobs: + nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} + + If the config already uses the flat format (no 'platforms' key), returns as-is. + """ + if "platforms" not in raw: + return raw + + config = {} + + for key in ("repo", "github", "agents"): + if key in raw: + config[key] = raw[key] + + config["images"] = {} + config["jobs"] = {} + + for platform, pcfg in raw.get("platforms", {}).items(): + # Image config + if "image" in pcfg: + config["images"][platform] = pcfg["image"] + + # Platform-level defaults inherited by jobs + defaults = {} + + for key in ("image_tag", "docker_args", "volumes", "setup", "env"): + if key in pcfg: + defaults[key] = pcfg[key] + + # Flatten jobs: {platform}_{job_name} + for job_name, job_cfg in pcfg.get("jobs", {}).items(): + full_name = f"{platform}_{job_name}" + flat = { + "platform": platform, + "image": defaults.get("image_tag", "latest"), + } + + # Apply platform defaults + for key in ("docker_args", "volumes", "setup", "env"): + if key in defaults: + flat[key] = defaults[key] + + # Job-level overrides + flat.update(job_cfg) + + config["jobs"][full_name] = flat + + return config + + +def load_config(path): + """Load a YAML config file and normalize to flat format.""" + with open(path, encoding="utf-8") as f: + raw = yaml.safe_load(f) + + return normalize_config(raw) + + +def get_git_commit(ref="HEAD", short=True): + """Get git commit SHA. 
Returns 'unknown' on failure.""" + cmd = ["git", "rev-parse"] + + if short: + cmd.append("--short") + + cmd.append(ref) + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() From 5292415c3c5d939fac96788ae55c9d21a6963021 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 06:03:23 +0000 Subject: [PATCH 05/16] docs: add multi-machine deployment guide for NVIDIA and Iluvatar platform --- .ci/README.md | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/.ci/README.md b/.ci/README.md index 33841ca..4e826e8 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -241,3 +241,154 @@ Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: - `success` / `failure` — job 执行完成 Status context 格式:`ci/infiniops/{job_name}` + +--- + +## 多机部署指南 + +以 NVIDIA + Iluvatar 双平台为例,说明如何在两台机器上部署 Agent 并实现跨平台并行测试。 + +### 前置条件(两台机器共同) + +```bash +# 1. Python 3.10+ 和依赖 +pip install pyyaml + +# 2. Docker 已安装 +docker --version + +# 3. 克隆仓库 +git clone https://github.com/InfiniTensor/InfiniOps.git +cd InfiniOps +``` + +### NVIDIA 机器配置 + +```bash +# 1. 安装 NVIDIA Container Toolkit +# 参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html + +# 2. 验证 GPU 可见 +nvidia-smi + +# 3. 构建 CI 镜像 +python .ci/build.py --platform nvidia +``` + +### Iluvatar 机器配置 + +```bash +# 1. 确认 CoreX 运行时已安装 +ixsmi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep corex # 应有 corex:qs_pj20250825 + +# 3. 
构建 CI 镜像 +python .ci/build.py --platform iluvatar +``` + +### 启动 Agent 服务 + +在各自机器上启动 Agent: + +```bash +# NVIDIA 机器 +python .ci/agent.py serve --platform nvidia --port 8080 + +# Iluvatar 机器 +python .ci/agent.py serve --platform iluvatar --port 8080 +``` + +验证连通性: + +```bash +curl http://:8080/health +curl http://:8080/health +``` + +### 配置远程 Agent 地址 + +在触发端的 `config.yaml` 中添加 `agents` 段: + +```yaml +agents: + nvidia: + url: http://:8080 + iluvatar: + url: http://:8080 +``` + +### 触发跨平台测试 + +```bash +# 一键运行所有平台的 job +python .ci/agent.py run --branch master + +# 预览模式(不实际执行) +python .ci/agent.py run --branch master --dry-run --no-status + +# 只运行指定平台 +python .ci/agent.py run --branch master --platform nvidia +``` + +### 可选配置 + +#### GitHub Status 上报 + +两台机器均设置环境变量,各自上报所属平台的测试状态: + +```bash +export GITHUB_TOKEN=ghp_xxxxxxxxxxxx +``` + +#### API Token 认证 + +Agent 暴露在非可信网络时,建议启用 Token 认证: + +```bash +# 启动 Agent 时指定 token +python .ci/agent.py serve --platform nvidia --port 8080 --api-token + +# 或通过环境变量 +export API_TOKEN= +``` + +#### GitHub Webhook 自动触发 + +在 GitHub repo → Settings → Webhooks 中为每台机器添加 Webhook: + +| 字段 | 值 | +|---|---| +| Payload URL | `http://<机器IP>:8080/webhook` | +| Content type | `application/json` | +| Secret | 与 `--webhook-secret` 一致 | +| Events | `push` 和 `pull_request` | + +启动时配置 secret: + +```bash +python .ci/agent.py serve --platform nvidia --port 8080 --webhook-secret + +# 或通过环境变量 +export WEBHOOK_SECRET= +``` + +### 验证清单 + +```bash +# 1. 各机器单独 dry-run +python .ci/agent.py run --branch master --platform nvidia --dry-run --no-status +python .ci/agent.py run --branch master --platform iluvatar --dry-run --no-status + +# 2. 健康检查 +curl http://:8080/health +curl http://:8080/health + +# 3. 查看资源状态 +curl http://:8080/status +curl http://:8080/status + +# 4. 
跨平台一键测试 +python .ci/agent.py run --branch master +``` From 5eb8fdcaf3742989847501b90f8f17c53e849a14 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 09:30:36 +0000 Subject: [PATCH 06/16] feat(ci): enhance CI configuration and agent functionality with platform detection and job resolution --- .ci/README.md | 117 ++++++++++++++++++---------- .ci/agent.py | 164 +++++++++++++--------------------------- .ci/ci_resource.py | 10 +++ .ci/run.py | 143 ++++++++++++++++++++++++----------- .ci/tests/test_agent.py | 32 -------- .ci/utils.py | 11 +++ 6 files changed, 248 insertions(+), 229 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index 4e826e8..1926c66 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -3,16 +3,24 @@ ``` .ci/ ├── config.yaml # 统一配置(镜像、job、Agent 定义) -├── utils.py # 共享工具(load_config、get_git_commit) +├── utils.py # 共享工具(load_config、normalize_config、get_git_commit) ├── agent.py # Runner Agent(调度、Webhook、远程触发) ├── build.py # 镜像构建 ├── run.py # CI 流水线执行(Docker 层) ├── ci_resource.py # GPU/内存资源检测与分配 ├── github_status.py # GitHub Commit Status 上报 -└── images/ - ├── nvidia/Dockerfile - ├── iluvatar/Dockerfile - └── ascend/Dockerfile +├── images/ +│ ├── nvidia/Dockerfile +│ ├── iluvatar/Dockerfile +│ └── ascend/Dockerfile +└── tests/ # 单元测试 + ├── conftest.py + ├── test_agent.py + ├── test_build.py + ├── test_run.py + ├── test_resource.py + ├── test_github_status.py + └── test_utils.py ``` **前置依赖**:Docker、Python 3.10+、`pip install pyyaml` @@ -29,13 +37,22 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master +github: + status_context_prefix: "ci/infiniops" + +agents: # 远程 Agent 地址(CLI 跨机器触发用) + nvidia: + url: http://nvidia-host:8080 + iluvatar: + url: http://iluvatar-host:8080 + platforms: nvidia: image: # 镜像定义 dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - setup: pip install .[dev] # 平台级默认值,job 可覆盖 + setup: pip install .[dev] --no-build-isolation jobs: gpu: # 展平后为 nvidia_gpu 
resources: @@ -64,7 +81,7 @@ platforms: - /lib/firmware:/lib/firmware - /usr/src:/usr/src - /lib/modules:/lib/modules - setup: pip install .[dev] + setup: pip install .[dev] --no-build-isolation jobs: gpu: # 展平后为 iluvatar_gpu resources: @@ -103,6 +120,7 @@ platforms: | 参数 | 说明 | |---|---| | `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | +| `--commit` | 指定 commit ref 作为镜像 tag(默认 HEAD) | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -126,25 +144,31 @@ python .ci/build.py --force ## 流水线执行 `run.py` +平台自动发现(通过检测 `nvidia-smi`/`ixsmi`),无需手动指定。 + | 参数 | 说明 | |---|---| -| `--job` | 指定 job 名称(默认第一个) | -| `--branch` | 覆盖克隆分支 | +| `--config` | 配置文件路径(默认 `.ci/config.yaml`) | +| `--job` | job 名称:短名(`gpu`)或完整名(`nvidia_gpu`)。缺省运行当前平台所有 job | +| `--branch` | 覆盖克隆分支(默认读 config `repo.branch`) | | `--stage` | 只运行指定 stage | | `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID(仅 nvidia gpu_style) | +| `--gpu-id` | 覆盖 GPU 设备 ID(nvidia 通过 `--gpus`,其他平台通过 `CUDA_VISIBLE_DEVICES`) | | `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | | `--dry-run` | 打印 docker 命令不执行 | ```bash -# 运行 NVIDIA job -python .ci/run.py --job nvidia_gpu --branch master +# 最简用法:自动检测平台,运行所有 job,使用 config 默认分支 +python .ci/run.py + +# 指定 job 短名 +python .ci/run.py --job gpu -# 运行 Iluvatar job -python .ci/run.py --job iluvatar_gpu --branch feat/ci-nvidia +# 完整 job 名(向后兼容) +python .ci/run.py --job nvidia_gpu # 只跑 test stage,预览命令 -python .ci/run.py --job iluvatar_gpu --stage test --dry-run +python .ci/run.py --job gpu --stage test --dry-run ``` 容器内执行流程:`git clone` → `checkout` → `setup` → stages。 @@ -158,7 +182,7 @@ python .ci/run.py --job iluvatar_gpu --stage test --dry-run |---|---|---|---| | NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | | Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | -| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善 | +| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善,镜像和 job 尚未就绪 | 
--- @@ -169,43 +193,54 @@ Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感 ### CLI 手动执行 ```bash -# 运行所有 job(本地 + 远程 Agent) -python .ci/agent.py run --branch master +# 运行所有 job(分发到远程 Agent,使用 config 默认分支) +python .ci/agent.py run + +# 指定分支 +python .ci/agent.py run --branch feat/xxx # 运行指定 job -python .ci/agent.py run --branch master --job nvidia_gpu +python .ci/agent.py run --job nvidia_gpu # 按平台运行 -python .ci/agent.py run --branch master --platform nvidia +python .ci/agent.py run --platform nvidia # 预览命令 -python .ci/agent.py run --branch master --dry-run --no-status +python .ci/agent.py run --dry-run ``` | 参数 | 说明 | |---|---| -| `--branch` | 测试分支(必填) | +| `--branch` | 测试分支(默认读 config `repo.branch`) | | `--job` | 指定 job 名称 | | `--platform` | 按平台过滤 job | | `--commit` | 覆盖 commit SHA | | `--image-tag` | 覆盖镜像 tag | -| `--results-dir` | 结果目录(默认 `ci-results`) | -| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | -| `--no-status` | 跳过 GitHub Status 上报 | | `--dry-run` | 预览模式 | ### Webhook 服务 -每台平台机器部署一个 Agent 实例: +每台平台机器部署一个 Agent 实例(平台自动发现): ```bash # NVIDIA 机器 -python .ci/agent.py serve --platform nvidia --port 8080 +python .ci/agent.py serve --port 8080 # Iluvatar 机器 -python .ci/agent.py serve --platform iluvatar --port 8080 +python .ci/agent.py serve --port 8080 ``` +`serve` 子命令额外参数: + +| 参数 | 说明 | +|---|---| +| `--port` | 监听端口(默认 8080) | +| `--host` | 监听地址(默认 `0.0.0.0`) | +| `--webhook-secret` | GitHub Webhook 签名密钥(或 `WEBHOOK_SECRET` 环境变量) | +| `--api-token` | `/api/run` Bearer 认证令牌(或 `AGENT_API_TOKEN` 环境变量) | +| `--results-dir` | 结果目录(默认 `ci-results`) | +| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | + | 端点 | 方法 | 说明 | |---|---|---| | `/webhook` | POST | GitHub Webhook(push/pull_request) | @@ -293,11 +328,11 @@ python .ci/build.py --platform iluvatar 在各自机器上启动 Agent: ```bash -# NVIDIA 机器 -python .ci/agent.py serve --platform nvidia --port 8080 +# NVIDIA 机器(平台自动发现) +python .ci/agent.py serve --port 8080 -# Iluvatar 机器 -python .ci/agent.py serve --platform iluvatar 
--port 8080 +# Iluvatar 机器(平台自动发现) +python .ci/agent.py serve --port 8080 ``` 验证连通性: @@ -322,14 +357,14 @@ agents: ### 触发跨平台测试 ```bash -# 一键运行所有平台的 job -python .ci/agent.py run --branch master +# 一键运行所有平台的 job(使用 config 默认分支) +python .ci/agent.py run # 预览模式(不实际执行) -python .ci/agent.py run --branch master --dry-run --no-status +python .ci/agent.py run --dry-run # 只运行指定平台 -python .ci/agent.py run --branch master --platform nvidia +python .ci/agent.py run --platform nvidia ``` ### 可选配置 @@ -348,10 +383,10 @@ Agent 暴露在非可信网络时,建议启用 Token 认证: ```bash # 启动 Agent 时指定 token -python .ci/agent.py serve --platform nvidia --port 8080 --api-token +python .ci/agent.py serve --port 8080 --api-token # 或通过环境变量 -export API_TOKEN= +export AGENT_API_TOKEN= ``` #### GitHub Webhook 自动触发 @@ -368,7 +403,7 @@ export API_TOKEN= 启动时配置 secret: ```bash -python .ci/agent.py serve --platform nvidia --port 8080 --webhook-secret +python .ci/agent.py serve --port 8080 --webhook-secret # 或通过环境变量 export WEBHOOK_SECRET= @@ -378,8 +413,8 @@ export WEBHOOK_SECRET= ```bash # 1. 各机器单独 dry-run -python .ci/agent.py run --branch master --platform nvidia --dry-run --no-status -python .ci/agent.py run --branch master --platform iluvatar --dry-run --no-status +python .ci/agent.py run --platform nvidia --dry-run +python .ci/agent.py run --platform iluvatar --dry-run # 2. 
健康检查 curl http://:8080/health diff --git a/.ci/agent.py b/.ci/agent.py index 3696ce2..8c53814 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -3,11 +3,11 @@ Usage: # Run jobs locally (or dispatch to remote agents) - python .ci/agent.py run --branch master + python .ci/agent.py run python .ci/agent.py run --branch master --job nvidia_gpu --dry-run - # Start webhook server - python .ci/agent.py serve --platform nvidia --port 8080 + # Start webhook server (auto-detects platform) + python .ci/agent.py serve --port 8080 """ import argparse @@ -137,32 +137,6 @@ def select_jobs(config, platform=None, job_name=None): return list(jobs.keys()) -def route_jobs(config, job_names, local_platform=None): - """Split jobs into local and remote. - - Returns (local_jobs, remote_jobs) where remote_jobs is a list of - (job_name, agent_url) tuples. - """ - agents = config.get("agents", {}) - jobs = config.get("jobs", {}) - local = [] - remote = [] - - for name in job_names: - job = jobs.get(name, {}) - platform = job.get("platform", "") - - if not local_platform: - local.append(name) - elif platform == local_platform: - local.append(name) - elif platform in agents: - remote.append((name, agents[platform].get("url", ""))) - else: - local.append(name) - - return local, remote - # --------------------------------------------------------------------------- # Scheduler @@ -707,8 +681,10 @@ def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200): def cmd_run(args): - """Handle 'run' subcommand: execute jobs locally and/or remotely.""" + """Handle 'run' subcommand: dispatch jobs to platform agents via HTTP.""" config = run.load_config(args.config) + agents = config.get("agents", {}) + branch = args.branch or config.get("repo", {}).get("branch", "master") commit_sha = args.commit or run.get_git_commit(short=False) # Determine which jobs to run @@ -722,57 +698,34 @@ def cmd_run(args): print("error: no matching jobs found", file=sys.stderr) sys.exit(1) - # Detect local platform (if 
running serve on this machine, use that; otherwise guess) - local_platform = args.platform - local_jobs, remote_jobs = route_jobs(config, job_names, local_platform) + # Resolve agent URL for each job + jobs_to_dispatch = [] # [(name, agent_url)] - # Run local jobs - local_results = [] - - if local_jobs: - pool = res.ResourcePool( - local_platform or "unknown", - utilization_threshold=args.utilization_threshold, - ) - scheduler = Scheduler( - config, - local_platform or "unknown", - pool, - results_dir=args.results_dir, - no_status=args.no_status, - dry_run=args.dry_run, - ) + for name in job_names: + job = config.get("jobs", {}).get(name, {}) + platform = job.get("platform", "") + agent_url = agents.get(platform, {}).get("url", "") - for name in local_jobs: - req = JobRequest( - job_name=name, - branch=args.branch, - commit_sha=commit_sha, - config=config, - image_tag=args.image_tag, - results_dir=args.results_dir, - ) - scheduler.submit(req) + if not agent_url: + print(f"error: no agent URL configured for platform {platform!r} (job {name})", file=sys.stderr) + sys.exit(1) - local_results = scheduler.wait_all() + jobs_to_dispatch.append((name, agent_url)) - # Dispatch remote jobs - remote_results = [] api_token = os.environ.get("AGENT_API_TOKEN", "") + results = [] - if remote_jobs and not args.dry_run: - # Dispatch all remote jobs first, then poll concurrently + if args.dry_run: + for name, agent_url in jobs_to_dispatch: + print(f"[dry-run] dispatch {name} to {agent_url}") + else: + # Dispatch all jobs, then poll concurrently dispatched = [] # [(name, agent_url, job_id)] - for name, agent_url in remote_jobs: - if not agent_url: - print(f"warning: no agent URL for {name}, skipping", file=sys.stderr) - remote_results.append({"job_name": name, "state": "error"}) - continue - + for name, agent_url in jobs_to_dispatch: print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) job_id = dispatch_remote_job( - agent_url, name, args.branch, commit_sha, 
args.image_tag, + agent_url, name, branch, commit_sha, args.image_tag, api_token=api_token or None, ) @@ -781,9 +734,8 @@ def cmd_run(args): dispatched.append((name, agent_url, job_id)) else: print(f" failed to dispatch {name}", file=sys.stderr) - remote_results.append({"job_name": name, "state": "error"}) + results.append({"job_name": name, "state": "error"}) - # Poll all dispatched jobs concurrently if dispatched: with ThreadPoolExecutor(max_workers=len(dispatched)) as executor: futures = { @@ -796,28 +748,16 @@ def cmd_run(args): result = future.result() if result: - remote_results.append(result) + results.append(result) else: print(f" timeout waiting for {name}", file=sys.stderr) - remote_results.append({"job_name": name, "state": "timeout"}) - - elif remote_jobs and args.dry_run: - for name, agent_url in remote_jobs: - print(f"[dry-run] dispatch {name} to {agent_url}") + results.append({"job_name": name, "state": "timeout"}) # Summary print("\n========== Results ==========") all_ok = True - for r in local_results: - status = "PASS" if r.returncode == 0 else "FAIL" - - if r.returncode != 0: - all_ok = False - - print(f" {status} {r.job_name} ({r.duration:.0f}s) {r.results_dir}") - - for r in remote_results: + for r in results: state = r.get("state", "unknown") name = r.get("job_name", "?") status = "PASS" if state == STATE_SUCCESS else "FAIL" @@ -826,7 +766,7 @@ def cmd_run(args): all_ok = False duration = r.get("duration_seconds", 0) - print(f" {status} {name} ({duration:.0f}s) [remote]") + print(f" {status} {name} ({duration:.0f}s)") if not all_ok: sys.exit(1) @@ -836,13 +776,31 @@ def cmd_serve(args): """Handle 'serve' subcommand: start webhook server.""" config = run.load_config(args.config) + platform = res.detect_platform() + + if not platform: + print( + "error: could not detect platform (no nvidia-smi or ixsmi found)", + file=sys.stderr, + ) + sys.exit(1) + + platform_jobs = select_jobs(config, platform=platform) + + if not platform_jobs: + print( + 
f"error: platform {platform!r} detected but no jobs defined in config", + file=sys.stderr, + ) + sys.exit(1) + pool = res.ResourcePool( - args.platform, + platform, utilization_threshold=args.utilization_threshold, ) scheduler = Scheduler( config, - args.platform, + platform, pool, results_dir=args.results_dir, ) @@ -869,14 +827,14 @@ def cmd_serve(args): args.port, config, scheduler, - args.platform, + platform, webhook_secret=webhook_secret or None, api_token=api_token or None, results_dir=args.results_dir, ) print( - f"Agent serving on {args.host}:{args.port} (platform={args.platform})", + f"Agent serving on {args.host}:{args.port} (platform={platform})", file=sys.stderr, ) print(f" POST /webhook — GitHub webhook", file=sys.stderr) @@ -905,23 +863,11 @@ def main(): type=Path, default=Path(__file__).resolve().parent / "config.yaml", ) - run_parser.add_argument("--branch", type=str, required=True, help="Branch to test") + run_parser.add_argument("--branch", type=str, help="Branch to test (default: config repo.branch)") run_parser.add_argument("--job", type=str, help="Specific job name") run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") run_parser.add_argument("--image-tag", type=str, help="Override image tag") run_parser.add_argument("--commit", type=str, help="Override commit SHA") - run_parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - ) - run_parser.add_argument( - "--utilization-threshold", - type=int, - default=10, - help="GPU utilization threshold (%%) to consider free (default: 10)", - ) - run_parser.add_argument("--no-status", action="store_true", help="Skip GitHub status") run_parser.add_argument("--dry-run", action="store_true") # --- serve subcommand --- @@ -931,12 +877,6 @@ def main(): type=Path, default=Path(__file__).resolve().parent / "config.yaml", ) - serve_parser.add_argument( - "--platform", - type=str, - required=True, - help="Platform this agent handles (nvidia, iluvatar, etc.)", 
- ) serve_parser.add_argument("--port", type=int, default=8080) serve_parser.add_argument("--host", type=str, default="0.0.0.0") serve_parser.add_argument("--webhook-secret", type=str) diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index f3dbfb1..47b9737 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -2,6 +2,7 @@ """Resource detection and allocation for CI Runner Agent.""" import os +import shutil import subprocess import threading from dataclasses import dataclass, field @@ -239,3 +240,12 @@ def parse_memory_requirement(job_config) -> float: return float(memory) * 1024 # Default: GB except ValueError: return 0 + + +def detect_platform(): + """Auto-detect the current platform by probing GPU query tools on PATH.""" + for platform, tool in ResourcePool.GPU_QUERY_TOOLS.items(): + if shutil.which(tool): + return platform + + return None diff --git a/.ci/run.py b/.ci/run.py index 2575781..6c108e4 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,7 +9,7 @@ from datetime import datetime from pathlib import Path -from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE +from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE, detect_platform from utils import get_git_commit, load_config @@ -183,6 +183,42 @@ def build_docker_args( return args +def resolve_job_names(jobs, platform, job=None): + """Resolve job names for a platform. + + - ``job=None`` — all jobs for the platform. + - ``job="gpu"`` (short name) — matched via ``short_name`` field. + - ``job="nvidia_gpu"`` (full name) — direct lookup. 
+ """ + if job and job in jobs: + return [job] + + if job: + matches = [ + name for name, cfg in jobs.items() + if cfg.get("platform") == platform and cfg.get("short_name") == job + ] + + if not matches: + print( + f"error: job {job!r} not found for platform {platform!r}", + file=sys.stderr, + ) + sys.exit(1) + + return matches + + matches = [ + name for name, cfg in jobs.items() if cfg.get("platform") == platform + ] + + if not matches: + print(f"error: no jobs for platform {platform!r}", file=sys.stderr) + sys.exit(1) + + return matches + + def main(): parser = argparse.ArgumentParser(description="Run Docker CI pipeline") parser.add_argument( @@ -191,8 +227,12 @@ def main(): default=Path(__file__).resolve().parent / "config.yaml", help="Path to config.yaml", ) - parser.add_argument("--branch", type=str, help="Override repo branch") - parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument("--branch", type=str, help="Override repo branch (default: config repo.branch)") + parser.add_argument( + "--job", + type=str, + help="Job name: short name (gpu) or full name (nvidia_gpu). 
Default: all jobs", + ) parser.add_argument( "--stage", type=str, @@ -226,53 +266,68 @@ def main(): repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") branch = args.branch or repo.get("branch", "master") - jobs = config.get("jobs", {}) + platform = detect_platform() - if not jobs: - print("error: no jobs in config", file=sys.stderr) - sys.exit(1) - - job_name = args.job or next(iter(jobs)) - - if job_name not in jobs: - print(f"error: job {job_name!r} not in config", file=sys.stderr) + if not platform: + print( + "error: could not detect platform (no nvidia-smi or ixsmi found)", + file=sys.stderr, + ) sys.exit(1) - job = jobs[job_name] - all_stages = job.get("stages", []) - - if args.stage: - stages = [s for s in all_stages if s["name"] == args.stage] - - if not stages: - print(f"error: stage {args.stage!r} not found", file=sys.stderr) - sys.exit(1) - else: - stages = all_stages + print(f"platform: {platform}", file=sys.stderr) - platform = job.get("platform", "nvidia") - commit = get_git_commit() - results_dir = build_results_dir(args.results_dir, platform, stages, commit) - - workdir = "/workspace" - docker_args = build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - workdir, - args.image_tag, - gpu_id_override=args.gpu_id, - results_dir=results_dir, - ) + jobs = config.get("jobs", {}) - if args.dry_run: - print(shlex.join(docker_args)) - return + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) - results_dir.mkdir(parents=True, exist_ok=True) - sys.exit(subprocess.run(docker_args).returncode) + job_names = resolve_job_names(jobs, platform, job=args.job) + failed = 0 + + for job_name in job_names: + job = jobs[job_name] + all_stages = job.get("stages", []) + + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + + if not stages: + print(f"error: stage {args.stage!r} not found in {job_name}", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + 
job_platform = job.get("platform", platform) + commit = get_git_commit() + results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) + + docker_args = build_docker_args( + config, + job_name, + repo_url, + branch, + stages, + "/workspace", + args.image_tag, + gpu_id_override=args.gpu_id, + results_dir=results_dir, + ) + + if args.dry_run: + print(shlex.join(docker_args)) + continue + + print(f"==> running job: {job_name}", file=sys.stderr) + results_dir.mkdir(parents=True, exist_ok=True) + returncode = subprocess.run(docker_args).returncode + + if returncode != 0: + print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr) + failed += 1 + + sys.exit(1 if failed else 0) if __name__ == "__main__": diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py index 5741385..aa181c4 100644 --- a/.ci/tests/test_agent.py +++ b/.ci/tests/test_agent.py @@ -115,38 +115,6 @@ def test_select_jobs_invalid_name(agent_config): agent.select_jobs(agent_config, job_name="not_exist") -# --------------------------------------------------------------------------- -# route_jobs -# --------------------------------------------------------------------------- - - -def test_route_jobs_local(agent_config): - local, remote = agent.route_jobs(agent_config, ["nvidia_gpu"], local_platform="nvidia") - assert local == ["nvidia_gpu"] - assert remote == [] - - -def test_route_jobs_remote(agent_config): - local, remote = agent.route_jobs(agent_config, ["iluvatar_gpu"], local_platform="nvidia") - assert local == [] - assert len(remote) == 1 - assert remote[0][0] == "iluvatar_gpu" - assert remote[0][1] == "http://iluvatar-host:8080" - - -def test_route_jobs_mixed(agent_config): - local, remote = agent.route_jobs( - agent_config, ["nvidia_gpu", "iluvatar_gpu"], local_platform="nvidia" - ) - assert local == ["nvidia_gpu"] - assert len(remote) == 1 - - -def test_route_jobs_no_platform(agent_config): - local, remote = agent.route_jobs(agent_config, ["nvidia_gpu", 
"iluvatar_gpu"]) - assert len(local) == 2 - assert remote == [] - # --------------------------------------------------------------------------- # verify_signature diff --git a/.ci/utils.py b/.ci/utils.py index 7932ba6..07dec87 100644 --- a/.ci/utils.py +++ b/.ci/utils.py @@ -61,6 +61,7 @@ def normalize_config(raw): full_name = f"{platform}_{job_name}" flat = { "platform": platform, + "short_name": job_name, "image": defaults.get("image_tag", "latest"), } @@ -74,6 +75,16 @@ def normalize_config(raw): config["jobs"][full_name] = flat + # Warn on mismatched agent/platform keys (catches typos like 'nvdia'). + agent_keys = set(config.get("agents", {}).keys()) + platform_keys = set(raw.get("platforms", {}).keys()) + + for key in agent_keys - platform_keys: + print( + f"warning: agents.{key} has no matching platform in platforms.*", + file=sys.stderr, + ) + return config From 038f884e1873908ffb61f64f9b4d73e851f6d0b9 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 15:55:27 +0000 Subject: [PATCH 07/16] feat(ci): add MetaX platform CI support Add Dockerfile, config, and mx-smi GPU detection for MetaX (MACA) platform. Co-Authored-By: Claude Opus 4.6 --- .ci/ci_resource.py | 83 ++++++++++++++++++++++++++++++++++++- .ci/config.yaml | 24 +++++++++++ .ci/images/metax/Dockerfile | 46 ++++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 .ci/images/metax/Dockerfile diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index 47b9737..a24041f 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -30,13 +30,14 @@ class SystemResources: class ResourcePool: """Thread-safe GPU and system resource manager. - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi) + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi) and tracks allocations to enable dynamic parallel scheduling. 
""" GPU_QUERY_TOOLS = { "nvidia": "nvidia-smi", "iluvatar": "ixsmi", + "metax": "mx-smi", } def __init__(self, platform, utilization_threshold=10): @@ -56,6 +57,9 @@ def allocated(self): def detect_gpus(self) -> list[GpuInfo]: """Query GPU status via platform-specific CLI tool.""" + if self._platform == "metax": + return self._detect_gpus_metax() + tool = self.GPU_QUERY_TOOLS.get(self._platform) if not tool: @@ -100,6 +104,83 @@ def detect_gpus(self) -> list[GpuInfo]: return gpus + def _detect_gpus_metax(self) -> list[GpuInfo]: + """Parse mx-smi output for MetaX GPUs. + + Runs --show-memory and --show-usage separately and merges results. + Output format example: + GPU#0 MXC550 0000:1a:00.0 + Memory + vis_vram total : 67108864 KB + vis_vram used : 879032 KB + Utilization + GPU : 0 % + """ + import re + + def run_mxsmi(flag): + try: + r = subprocess.run( + ["mx-smi", flag], + capture_output=True, text=True, timeout=10, + ) + return r.stdout if r.returncode == 0 else "" + except (FileNotFoundError, subprocess.TimeoutExpired): + return "" + + mem_out = run_mxsmi("--show-memory") + util_out = run_mxsmi("--show-usage") + + # Parse memory: collect {index: (used_kb, total_kb)} + mem = {} + current = None + for line in mem_out.splitlines(): + m = re.match(r"GPU#(\d+)", line.strip()) + if m: + current = int(m.group(1)) + mem[current] = [0.0, 0.0] + continue + if current is None: + continue + m = re.search(r"vis_vram total\s*:\s*([\d.]+)\s*KB", line) + if m: + mem[current][1] = float(m.group(1)) / 1024 # KB -> MB + m = re.search(r"vis_vram used\s*:\s*([\d.]+)\s*KB", line) + if m: + mem[current][0] = float(m.group(1)) / 1024 # KB -> MB + + # Parse utilization: collect {index: utilization_pct} + util = {} + current = None + in_util = False + for line in util_out.splitlines(): + m = re.match(r"GPU#(\d+)", line.strip()) + if m: + current = int(m.group(1)) + in_util = False + continue + if current is None: + continue + if "Utilization" in line: + in_util = True + continue + if 
in_util: + m = re.match(r"\s*GPU\s*:\s*([\d.]+)\s*%", line) + if m: + util[current] = float(m.group(1)) + in_util = False + + gpus = [] + for idx in sorted(mem): + used_mb, total_mb = mem[idx] + gpus.append(GpuInfo( + index=idx, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util.get(idx, 0.0), + )) + return gpus + def detect_system_resources(self) -> SystemResources: """Read system memory from /proc/meminfo and CPU count.""" total_mb = 0.0 diff --git a/.ci/config.yaml b/.ci/config.yaml index e62bc07..171c9e9 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -61,6 +61,30 @@ platforms: - name: test run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + metax: + image: + dockerfile: .ci/images/metax/ + build_args: + BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--ulimit=memlock=-1" + - "--ulimit=stack=67108864" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: none # MetaX 设备通过 --privileged 透传,无需 CUDA_VISIBLE_DEVICES + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet image: dockerfile: .ci/images/ascend/ diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile new file mode 100644 index 0000000..fda527c --- /dev/null +++ b/.ci/images/metax/Dockerfile @@ -0,0 +1,46 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# conda Python is used in this image +ENV PATH=/opt/conda/bin:${PATH} + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" 
/etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed MetaX torch to prevent pip from replacing it with upstream version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace From 78deba2189d7e1cc17891b1ba55c20a9e89d403c Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 16:27:00 +0000 Subject: [PATCH 08/16] feat(ci): improve job dispatch logging and handle job results more effectively --- .ci/agent.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.ci/agent.py b/.ci/agent.py index 8c53814..0fa3715 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -24,7 +24,7 @@ import urllib.error import urllib.request import uuid -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path @@ -717,13 +717,18 @@ def cmd_run(args): if args.dry_run: for name, agent_url in jobs_to_dispatch: - print(f"[dry-run] dispatch {name} to {agent_url}") + platform, _, job = name.partition("_") + print(f"[dry-run] dispatch {platform} {job} job to {agent_url}") else: - # Dispatch all jobs, then poll concurrently + # Dispatch all jobs, then poll concurrently. 
dispatched = [] # [(name, agent_url, job_id)] for name, agent_url in jobs_to_dispatch: - print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) + platform, _, job = name.partition("_") + print( + f"==> dispatching {platform} {job} job to {agent_url}", + file=sys.stderr, + ) job_id = dispatch_remote_job( agent_url, name, branch, commit_sha, args.image_tag, api_token=api_token or None, @@ -743,14 +748,21 @@ def cmd_run(args): for name, url, jid in dispatched } - for future in futures: + for future in as_completed(futures): name, _, _ = futures[future] result = future.result() if result: + state = result.get("state", "unknown") + duration = result.get("duration_seconds", 0) + tag = "PASS" if state == STATE_SUCCESS else "FAIL" + print( + f"<== {tag} {name} ({duration:.0f}s)", + file=sys.stderr, + ) results.append(result) else: - print(f" timeout waiting for {name}", file=sys.stderr) + print(f"<== TIMEOUT {name}", file=sys.stderr) results.append({"job_name": name, "state": "timeout"}) # Summary From a599ba9615e08f5e730ec7745e1bc98c276edb1f Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 17:45:27 +0800 Subject: [PATCH 09/16] feat(ci): add Moore Threads (MUSA) platform CI support Add GPU detection via mthreads-gmi, Dockerfile, config, and update docs with Moore and MetaX platform deployment instructions. 
Co-Authored-By: Claude Opus 4.6 --- .ci/README.md | 65 ++++++++++++++++++++++++++++--- .ci/ci_resource.py | 76 ++++++++++++++++++++++++++++++++++++- .ci/config.yaml | 22 +++++++++++ .ci/images/moore/Dockerfile | 38 +++++++++++++++++++ 4 files changed, 194 insertions(+), 7 deletions(-) create mode 100644 .ci/images/moore/Dockerfile diff --git a/.ci/README.md b/.ci/README.md index 1926c66..12e8094 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -12,6 +12,8 @@ ├── images/ │ ├── nvidia/Dockerfile │ ├── iluvatar/Dockerfile +│ ├── metax/Dockerfile +│ ├── moore/Dockerfile │ └── ascend/Dockerfile └── tests/ # 单元测试 ├── conftest.py @@ -119,7 +121,7 @@ platforms: | 参数 | 说明 | |---|---| -| `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | +| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | 构建平台,默认 `all` | | `--commit` | 指定 commit ref 作为镜像 tag(默认 HEAD) | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -144,7 +146,7 @@ python .ci/build.py --force ## 流水线执行 `run.py` -平台自动发现(通过检测 `nvidia-smi`/`ixsmi`),无需手动指定。 +平台自动发现(通过检测 `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`),无需手动指定。 | 参数 | 说明 | |---|---| @@ -182,6 +184,8 @@ python .ci/run.py --job gpu --stage test --dry-run |---|---|---|---| | NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | | Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | +| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA 运行时,通过 `mx-smi` 检测 | +| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA 运行时,通过 `mthreads-gmi` 检测 | | Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善,镜像和 job 尚未就绪 | --- @@ -228,6 +232,9 @@ python .ci/agent.py serve --port 8080 # Iluvatar 机器 python .ci/agent.py serve --port 8080 + +# MetaX 机器 +python .ci/agent.py serve --port 8080 ``` `serve` 子命令额外参数: @@ -261,6 +268,10 @@ agents: url: http://nvidia-host:8080 iluvatar: url: http://iluvatar-host:8080 + metax: + url: http://metax-host:8080 + moore: + url: 
http://moore-host:8080 ``` ### 资源调度 @@ -281,9 +292,9 @@ Status context 格式:`ci/infiniops/{job_name}` ## 多机部署指南 -以 NVIDIA + Iluvatar 双平台为例,说明如何在两台机器上部署 Agent 并实现跨平台并行测试。 +以 NVIDIA + Iluvatar + MetaX + Moore 多平台为例,说明如何在多台机器上部署 Agent 并实现跨平台并行测试。 -### 前置条件(两台机器共同) +### 前置条件(所有机器共同) ```bash # 1. Python 3.10+ 和依赖 @@ -323,6 +334,32 @@ docker images | grep corex # 应有 corex:qs_pj20250825 python .ci/build.py --platform iluvatar ``` +### MetaX 机器配置 + +```bash +# 1. 确认 MACA 运行时已安装 +mx-smi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep maca-pytorch # 应有 maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 + +# 3. 构建 CI 镜像 +python .ci/build.py --platform metax +``` + +### Moore 机器配置 + +```bash +# 1. 确认 MUSA 运行时已安装 +mthreads-gmi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep vllm_musa # 应有 vllm_musa:20251112_hygon + +# 3. 构建 CI 镜像 +python .ci/build.py --platform moore +``` + ### 启动 Agent 服务 在各自机器上启动 Agent: @@ -333,6 +370,12 @@ python .ci/agent.py serve --port 8080 # Iluvatar 机器(平台自动发现) python .ci/agent.py serve --port 8080 + +# MetaX 机器(平台自动发现) +python .ci/agent.py serve --port 8080 + +# Moore 机器(平台自动发现) +python .ci/agent.py serve --port 8080 ``` 验证连通性: @@ -340,6 +383,8 @@ python .ci/agent.py serve --port 8080 ```bash curl http://:8080/health curl http://:8080/health +curl http://:8080/health +curl http://:8080/health ``` ### 配置远程 Agent 地址 @@ -352,6 +397,10 @@ agents: url: http://:8080 iluvatar: url: http://:8080 + metax: + url: http://:8080 + moore: + url: http://:8080 ``` ### 触发跨平台测试 @@ -371,7 +420,7 @@ python .ci/agent.py run --platform nvidia #### GitHub Status 上报 -两台机器均设置环境变量,各自上报所属平台的测试状态: +所有机器均设置环境变量,各自上报所属平台的测试状态: ```bash export GITHUB_TOKEN=ghp_xxxxxxxxxxxx @@ -415,14 +464,20 @@ export WEBHOOK_SECRET= # 1. 各机器单独 dry-run python .ci/agent.py run --platform nvidia --dry-run python .ci/agent.py run --platform iluvatar --dry-run +python .ci/agent.py run --platform metax --dry-run +python .ci/agent.py run --platform moore --dry-run # 2. 
健康检查 curl http://:8080/health curl http://:8080/health +curl http://:8080/health +curl http://:8080/health # 3. 查看资源状态 curl http://:8080/status curl http://:8080/status +curl http://:8080/status +curl http://:8080/status # 4. 跨平台一键测试 python .ci/agent.py run --branch master diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index a24041f..a49cbff 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 """Resource detection and allocation for CI Runner Agent.""" +import json import os +import re import shutil import subprocess import threading @@ -30,7 +32,7 @@ class SystemResources: class ResourcePool: """Thread-safe GPU and system resource manager. - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi) + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi, mthreads-gmi) and tracks allocations to enable dynamic parallel scheduling. """ @@ -38,6 +40,7 @@ class ResourcePool: "nvidia": "nvidia-smi", "iluvatar": "ixsmi", "metax": "mx-smi", + "moore": "mthreads-gmi", } def __init__(self, platform, utilization_threshold=10): @@ -60,6 +63,9 @@ def detect_gpus(self) -> list[GpuInfo]: if self._platform == "metax": return self._detect_gpus_metax() + if self._platform == "moore": + return self._detect_gpus_moore() + tool = self.GPU_QUERY_TOOLS.get(self._platform) if not tool: @@ -116,7 +122,6 @@ def _detect_gpus_metax(self) -> list[GpuInfo]: Utilization GPU : 0 % """ - import re def run_mxsmi(flag): try: @@ -181,6 +186,73 @@ def run_mxsmi(flag): )) return gpus + def _detect_gpus_moore(self) -> list[GpuInfo]: + """Parse mthreads-gmi JSON output for Moore Threads GPUs. 
+ + Uses: mthreads-gmi -q --json + Expected JSON structure: + { + "Attached GPUs": { + "GPU 00000000:3B:00.0": { + "Minor Number": "0", + "Memory Usage": { + "Total": "24576 MiB", + "Used": "512 MiB" + }, + "Utilization": { + "Gpu": "5 %" + } + } + } + } + """ + def extract_number(s): + m = re.search(r"([\d.]+)", str(s)) + return float(m.group(1)) if m else 0.0 + + try: + result = subprocess.run( + ["mthreads-gmi", "-q", "--json"], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + try: + data = json.loads(result.stdout) + except json.JSONDecodeError: + return [] + + gpus = [] + attached = data.get("Attached GPUs", {}) + + for gpu_data in attached.values(): + try: + index = int(gpu_data.get("Minor Number", len(gpus))) + + mem = gpu_data.get("Memory Usage", {}) + total_mb = extract_number(mem.get("Total", "0 MiB")) + used_mb = extract_number(mem.get("Used", "0 MiB")) + util_pct = extract_number( + gpu_data.get("Utilization", {}).get("Gpu", "0 %") + ) + + gpus.append(GpuInfo( + index=index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + )) + except (ValueError, AttributeError): + continue + + return sorted(gpus, key=lambda g: g.index) + def detect_system_resources(self) -> SystemResources: """Read system memory from /proc/meminfo and CPU count.""" total_mb = 0.0 diff --git a/.ci/config.yaml b/.ci/config.yaml index 171c9e9..24b4006 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -85,6 +85,28 @@ platforms: - name: test run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + moore: + image: + dockerfile: .ci/images/moore/ + build_args: + BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + setup: pip install .[dev] --no-build-isolation + jobs: 
+ gpu: + resources: + gpu_ids: "0" + gpu_style: none # Moore 设备通过 --privileged 透传,MTHREADS_VISIBLE_DEVICES 由基础镜像设置 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet image: dockerfile: .ci/images/ascend/ diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile new file mode 100644 index 0000000..9a073ba --- /dev/null +++ b/.ci/images/moore/Dockerfile @@ -0,0 +1,38 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# MUSA_HOME, PATH, LD_LIBRARY_PATH already set by base image + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + libclang \ + pytest-cov \ + pytest-xdist \ + ruff==0.15.7 + +# Pin pre-installed torch to prevent pip from replacing it with upstream version +RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace From 3166c87b224e8678651a0b69a4e9a13d1775e44e Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 11:23:23 +0000 Subject: [PATCH 10/16] feat(ci): capture Docker error output for remote job diagnostics --- .ci/agent.py | 129 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 98 insertions(+), 31 deletions(-) diff --git a/.ci/agent.py b/.ci/agent.py index 0fa3715..485221f 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -52,6 
+52,8 @@ STATE_FAILURE = "failure" STATE_ERROR = "error" +TAIL_LINES = 50 + # urllib helpers (module-level for easier mocking in tests) urllib_request = urllib.request.Request urllib_urlopen = urllib.request.urlopen @@ -65,7 +67,9 @@ class JobRequest: """Describes a CI job to be executed.""" - def __init__(self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None): + def __init__( + self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None + ): self.job_id = str(uuid.uuid4())[:8] self.job_name = job_name self.branch = branch @@ -92,18 +96,28 @@ def to_dict(self): class JobResult: """Outcome of a completed job.""" - def __init__(self, job_id, job_name, commit_sha, returncode, results_dir, duration): + def __init__( + self, + job_id, + job_name, + commit_sha, + returncode, + results_dir, + duration, + error_tail=None, + ): self.job_id = job_id self.job_name = job_name self.commit_sha = commit_sha self.returncode = returncode self.results_dir = results_dir self.duration = duration + self.error_tail = error_tail or [] self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE def to_dict(self): - return { + d = { "job_id": self.job_id, "job_name": self.job_name, "commit_sha": self.commit_sha, @@ -113,6 +127,11 @@ def to_dict(self): "duration_seconds": round(self.duration, 1), } + if self.error_tail: + d["error_tail"] = self.error_tail + + return d + # --------------------------------------------------------------------------- # Job selection and routing @@ -130,14 +149,11 @@ def select_jobs(config, platform=None, job_name=None): return [job_name] if platform: - return [ - name for name, job in jobs.items() if job.get("platform") == platform - ] + return [name for name, job in jobs.items() if job.get("platform") == platform] return list(jobs.keys()) - # --------------------------------------------------------------------------- # Scheduler # --------------------------------------------------------------------------- @@ -211,10 
+227,7 @@ def get_job(self, job_id): def get_status(self): """Return scheduler status for the /status endpoint.""" with self._lock: - queued = [ - self._jobs[r.job_id]["request"].to_dict() - for r in self._queue - ] + queued = [self._jobs[r.job_id]["request"].to_dict() for r in self._queue] running = [] completed = [] @@ -222,7 +235,9 @@ def get_status(self): state = entry["state"] if state == STATE_RUNNING: - running.append({**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]}) + running.append( + {**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]} + ) elif state in (STATE_SUCCESS, STATE_FAILURE): completed.append(entry["result"].to_dict()) @@ -238,7 +253,8 @@ def wait_all(self): while True: with self._lock: pending = any( - e["state"] in (STATE_QUEUED, STATE_RUNNING) for e in self._jobs.values() + e["state"] in (STATE_QUEUED, STATE_RUNNING) + for e in self._jobs.values() ) if not pending: @@ -248,11 +264,7 @@ def wait_all(self): self._done_event.clear() with self._lock: - return [ - e["result"] - for e in self._jobs.values() - if e["result"] is not None - ] + return [e["result"] for e in self._jobs.values() if e["result"] is not None] def _try_schedule(self): """Try to run queued jobs that have enough resources. 
@@ -315,7 +327,9 @@ def _run_job(self, req, gpu_ids): job_cfg = self._config["jobs"][req.job_name] all_stages = job_cfg.get("stages", []) repo_url = self._config.get("repo", {}).get("url", "") - commit_short = req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + commit_short = ( + req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + ) results_dir = run.build_results_dir( req.results_dir, req.platform, all_stages, commit_short ) @@ -338,10 +352,30 @@ def _run_job(self, req, gpu_ids): if self._dry_run: print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") returncode = 0 + error_tail = [] else: results_dir.mkdir(parents=True, exist_ok=True) - proc = subprocess.run(docker_args) - returncode = proc.returncode + proc = subprocess.Popen( + docker_args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + tail_buf = collections.deque(maxlen=TAIL_LINES) + + for line in proc.stdout: + sys.stdout.buffer.write(line) + tail_buf.append(line) + + proc.stdout.close() + returncode = proc.wait() + + if returncode != 0: + error_tail = [ + raw.decode("utf-8", errors="replace").rstrip("\n") + for raw in tail_buf + ] + else: + error_tail = [] duration = time.monotonic() - start @@ -352,6 +386,7 @@ def _run_job(self, req, gpu_ids): returncode=returncode, results_dir=results_dir, duration=duration, + error_tail=error_tail, ) # Post final status @@ -365,7 +400,9 @@ def _run_job(self, req, gpu_ids): f"{req.job_name}: {result.state} in {duration:.0f}s", ) except Exception as e: - print(f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr) + print( + f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr + ) if result is None: result = JobResult( @@ -375,6 +412,7 @@ def _run_job(self, req, gpu_ids): returncode=-1, results_dir=req.results_dir, duration=0, + error_tail=[str(e)], ) if not self._no_status: @@ -392,7 +430,9 @@ def _run_job(self, req, gpu_ids): with self._lock: self._jobs[req.job_id]["result"] = 
result - self._jobs[req.job_id]["state"] = result.state if result else STATE_FAILURE + self._jobs[req.job_id]["state"] = ( + result.state if result else STATE_FAILURE + ) self._done_event.set() self._try_schedule() @@ -410,9 +450,9 @@ def verify_signature(secret, body, signature_header): if not signature_header: return False - expected = "sha256=" + hmac.new( - secret.encode("utf-8"), body, hashlib.sha256 - ).hexdigest() + expected = ( + "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest() + ) return hmac.compare_digest(expected, signature_header) @@ -567,7 +607,9 @@ def _parse_pull_request(self, payload): def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): config = self.server.config - job_names = select_jobs(config, platform=self.server.platform, job_name=job_name) + job_names = select_jobs( + config, platform=self.server.platform, job_name=job_name + ) job_ids = [] for name in job_names: @@ -621,7 +663,9 @@ def __init__( # --------------------------------------------------------------------------- -def dispatch_remote_job(agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None): +def dispatch_remote_job( + agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None +): """Send a job to a remote agent via HTTP API. 
Returns job_id or None.""" url = f"{agent_url.rstrip('/')}/api/run" body = { @@ -707,7 +751,10 @@ def cmd_run(args): agent_url = agents.get(platform, {}).get("url", "") if not agent_url: - print(f"error: no agent URL configured for platform {platform!r} (job {name})", file=sys.stderr) + print( + f"error: no agent URL configured for platform {platform!r} (job {name})", + file=sys.stderr, + ) sys.exit(1) jobs_to_dispatch.append((name, agent_url)) @@ -730,7 +777,11 @@ def cmd_run(args): file=sys.stderr, ) job_id = dispatch_remote_job( - agent_url, name, branch, commit_sha, args.image_tag, + agent_url, + name, + branch, + commit_sha, + args.image_tag, api_token=api_token or None, ) @@ -760,6 +811,20 @@ def cmd_run(args): f"<== {tag} {name} ({duration:.0f}s)", file=sys.stderr, ) + + error_tail = result.get("error_tail", []) + + if error_tail: + print( + f"--- error output (last {len(error_tail)} lines) ---", + file=sys.stderr, + ) + + for line in error_tail: + print(f" {line}", file=sys.stderr) + + print("---", file=sys.stderr) + results.append(result) else: print(f"<== TIMEOUT {name}", file=sys.stderr) @@ -875,7 +940,9 @@ def main(): type=Path, default=Path(__file__).resolve().parent / "config.yaml", ) - run_parser.add_argument("--branch", type=str, help="Branch to test (default: config repo.branch)") + run_parser.add_argument( + "--branch", type=str, help="Branch to test (default: config repo.branch)" + ) run_parser.add_argument("--job", type=str, help="Specific job name") run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") run_parser.add_argument("--image-tag", type=str, help="Override image tag") From 04424eac9688c32b0272c316eeaeacc1a2529a58 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 11:40:56 +0000 Subject: [PATCH 11/16] feat(ci): capture error output and improve CLI result display - Capture last 50 lines of Docker output via ring buffer so failed jobs return diagnostic info to the CLI client. 
- Store raw bytes during execution; decode only on the failure path. - Align job name columns in `<==` result lines for readability. - Show summary only when jobs fail, removing redundant all-pass output. Co-Authored-By: Claude --- .ci/agent.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/.ci/agent.py b/.ci/agent.py index 485221f..2fa9971 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -799,6 +799,9 @@ def cmd_run(args): for name, url, jid in dispatched } + # Collect name lengths for column alignment. + name_width = max(len(n) for n, _, _ in dispatched) + for future in as_completed(futures): name, _, _ = futures[future] result = future.result() @@ -808,7 +811,7 @@ def cmd_run(args): duration = result.get("duration_seconds", 0) tag = "PASS" if state == STATE_SUCCESS else "FAIL" print( - f"<== {tag} {name} ({duration:.0f}s)", + f"<== {tag} {name:<{name_width}} ({duration:.0f}s)", file=sys.stderr, ) @@ -827,25 +830,28 @@ def cmd_run(args): results.append(result) else: - print(f"<== TIMEOUT {name}", file=sys.stderr) + print( + f"<== TIMEOUT {name:<{name_width}}", + file=sys.stderr, + ) results.append({"job_name": name, "state": "timeout"}) - # Summary - print("\n========== Results ==========") - all_ok = True + # Summary: only print when there are failures. 
+ failed = [r for r in results if r.get("state") != STATE_SUCCESS] - for r in results: - state = r.get("state", "unknown") - name = r.get("job_name", "?") - status = "PASS" if state == STATE_SUCCESS else "FAIL" + if failed: + print("\n========== Failed ==========", file=sys.stderr) + name_width = max(len(r.get("job_name", "?")) for r in failed) - if state != STATE_SUCCESS: - all_ok = False - - duration = r.get("duration_seconds", 0) - print(f" {status} {name} ({duration:.0f}s)") + for r in failed: + name = r.get("job_name", "?") + state = r.get("state", "unknown") + duration = r.get("duration_seconds", 0) + print( + f" FAIL {name:<{name_width}} {state} ({duration:.0f}s)", + file=sys.stderr, + ) - if not all_ok: sys.exit(1) From a7fa544a84f7aeb49a6320045374dac9ab89ae68 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 23:35:04 +0800 Subject: [PATCH 12/16] feat(ci): add Cambricon MLU platform CI support - Add .ci/images/cambricon/Dockerfile for AnolisOS-based Cambricon image - Add cambricon platform to config.yaml with MLU-style GPU passthrough - Add GPU_STYLE_MLU constant and MLU_VISIBLE_DEVICES support in run.py - Add cnmon-based GPU detection (_detect_gpus_cambricon) in ci_resource.py - Add --test CLI flag to override pytest test path at runtime - Skip empty stage run commands instead of erroring (compilation-only mode) - Fix _torch_gemm fallback for CPU float16/bfloat16 (upcast to float32) - Skip bfloat16 on MLU (cnnlBatchMatMulEx does not support it) - Hoist _PYTEST_VALUE_FLAGS to module level; add ValueError guard in cambricon parser - Remove redundant yaml import guard in agent.py (utils.py already handles it) Co-Authored-By: Claude Sonnet 4.6 --- .ci/agent.py | 18 ++--- .ci/ci_resource.py | 106 ++++++++++++++++++++----- .ci/config.yaml | 21 +++++ .ci/github_status.py | 4 +- .ci/images/cambricon/Dockerfile | 33 ++++++++ .ci/run.py | 86 +++++++++++++++++---- .ci/tests/test_agent.py | 132 ++++++++++++++++++++++++-------- 
.ci/tests/test_github_status.py | 7 +- .ci/tests/test_resource.py | 11 ++- tests/test_gemm.py | 12 ++- 10 files changed, 341 insertions(+), 89 deletions(-) create mode 100644 .ci/images/cambricon/Dockerfile diff --git a/.ci/agent.py b/.ci/agent.py index 2fa9971..3fb5d9e 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -29,14 +29,6 @@ from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - import ci_resource as res import github_status as gh import run @@ -920,11 +912,11 @@ def cmd_serve(args): f"Agent serving on {args.host}:{args.port} (platform={platform})", file=sys.stderr, ) - print(f" POST /webhook — GitHub webhook", file=sys.stderr) - print(f" POST /api/run — remote job trigger", file=sys.stderr) - print(f" GET /health — health check", file=sys.stderr) - print(f" GET /status — queue & resource status", file=sys.stderr) - print(f" GET /api/job/{{id}} — job status", file=sys.stderr) + print(" POST /webhook — GitHub webhook", file=sys.stderr) + print(" POST /api/run — remote job trigger", file=sys.stderr) + print(" GET /health — health check", file=sys.stderr) + print(" GET /status — queue & resource status", file=sys.stderr) + print(" GET /api/job/{id} — job status", file=sys.stderr) try: server.serve_forever() diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index a49cbff..bbf27ae 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -2,16 +2,18 @@ """Resource detection and allocation for CI Runner Agent.""" import json +import operator import os import re import shutil import subprocess import threading -from dataclasses import dataclass, field +from dataclasses import dataclass # GPU passthrough styles GPU_STYLE_NVIDIA = "nvidia" GPU_STYLE_NONE = "none" +GPU_STYLE_MLU = "mlu" @dataclass @@ -41,6 +43,7 @@ class ResourcePool: "iluvatar": "ixsmi", "metax": "mx-smi", "moore": 
"mthreads-gmi", + "cambricon": "cnmon", } def __init__(self, platform, utilization_threshold=10): @@ -66,6 +69,9 @@ def detect_gpus(self) -> list[GpuInfo]: if self._platform == "moore": return self._detect_gpus_moore() + if self._platform == "cambricon": + return self._detect_gpus_cambricon() + tool = self.GPU_QUERY_TOOLS.get(self._platform) if not tool: @@ -127,7 +133,9 @@ def run_mxsmi(flag): try: r = subprocess.run( ["mx-smi", flag], - capture_output=True, text=True, timeout=10, + capture_output=True, + text=True, + timeout=10, ) return r.stdout if r.returncode == 0 else "" except (FileNotFoundError, subprocess.TimeoutExpired): @@ -178,12 +186,14 @@ def run_mxsmi(flag): gpus = [] for idx in sorted(mem): used_mb, total_mb = mem[idx] - gpus.append(GpuInfo( - index=idx, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util.get(idx, 0.0), - )) + gpus.append( + GpuInfo( + index=idx, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util.get(idx, 0.0), + ) + ) return gpus def _detect_gpus_moore(self) -> list[GpuInfo]: @@ -206,6 +216,7 @@ def _detect_gpus_moore(self) -> list[GpuInfo]: } } """ + def extract_number(s): m = re.search(r"([\d.]+)", str(s)) return float(m.group(1)) if m else 0.0 @@ -242,16 +253,77 @@ def extract_number(s): gpu_data.get("Utilization", {}).get("Gpu", "0 %") ) - gpus.append(GpuInfo( - index=index, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util_pct, - )) + gpus.append( + GpuInfo( + index=index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + ) + ) except (ValueError, AttributeError): continue - return sorted(gpus, key=lambda g: g.index) + return sorted(gpus, key=operator.attrgetter("index")) + + def _detect_gpus_cambricon(self) -> list[GpuInfo]: + """Parse cnmon output for Cambricon MLU cards. 
+ + Each card appears as two consecutive data rows: + Row 1: | {card} {vf} {name} {fw} | {bus_id} | {util}% {ecc} | + Row 2: | {fan}% {temp} {pwr} | {mem_used} MiB/ {mem_total} MiB | ... | + """ + try: + result = subprocess.run( + ["cnmon"], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + lines = result.stdout.splitlines() + i = 0 + + while i < len(lines): + line = lines[i] + # Row 1: "| {index} ... | {bus_id} | {util}% {ecc} |" + m1 = re.match(r"^\|\s+(\d+)\s+.*\|\s*([\d.]+)%", line) + + if m1 and i + 1 < len(lines): + try: + card_index = int(m1.group(1)) + util_pct = float(m1.group(2)) + row2 = lines[i + 1] + mem_m = re.search(r"([\d.]+)\s+MiB/\s*([\d.]+)\s+MiB", row2) + + if mem_m: + used_mb = float(mem_m.group(1)) + total_mb = float(mem_m.group(2)) + else: + used_mb, total_mb = 0.0, 0.0 + + gpus.append( + GpuInfo( + index=card_index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + ) + ) + except (ValueError, AttributeError): + pass + i += 2 + continue + + i += 1 + + return sorted(gpus, key=operator.attrgetter("index")) def detect_system_resources(self) -> SystemResources: """Read system memory from /proc/meminfo and CPU count.""" @@ -278,9 +350,7 @@ def get_free_gpus(self) -> list[int]: """Return GPU indices with utilization below threshold.""" gpus = self.detect_gpus() return [ - g.index - for g in gpus - if g.utilization_pct < self._utilization_threshold + g.index for g in gpus if g.utilization_pct < self._utilization_threshold ] def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: diff --git a/.ci/config.yaml b/.ci/config.yaml index 24b4006..2509b40 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -107,6 +107,27 @@ platforms: - name: test run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short 
--junitxml=/workspace/results/test-results.xml + cambricon: + image: + dockerfile: .ci/images/cambricon/ + build_args: + BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: mlu # Cambricon MLU 通过 --privileged 透传,通过 MLU_VISIBLE_DEVICES 控制可见设备 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet image: dockerfile: .ci/images/ascend/ diff --git a/.ci/github_status.py b/.ci/github_status.py index a7abb8f..f8f017f 100644 --- a/.ci/github_status.py +++ b/.ci/github_status.py @@ -59,7 +59,9 @@ def post_commit_status( return False if not owner or not repo or not sha: - print("warning: missing owner/repo/sha, skipping status update", file=sys.stderr) + print( + "warning: missing owner/repo/sha, skipping status update", file=sys.stderr + ) return False url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile new file mode 100644 index 0000000..f1282d9 --- /dev/null +++ b/.ci/images/cambricon/Dockerfile @@ -0,0 +1,33 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Python 3.10 executables (pip-installed tools) live under /usr/local/python3.10/bin. +ENV PATH=/usr/local/python3.10/bin:${PATH} + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +# git and cmake are pre-installed; coreutils-single covers coreutils needs. 
+RUN dnf install -y ninja-build && dnf clean all + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + ruff==0.15.7 + +# Pin pre-installed Cambricon torch to prevent pip from replacing it with upstream version. +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py index 6c108e4..811ba2d 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,9 +9,52 @@ from datetime import datetime from pathlib import Path -from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE, detect_platform +from ci_resource import ( + GPU_STYLE_NVIDIA, + GPU_STYLE_NONE, + GPU_STYLE_MLU, + ResourcePool, + detect_platform, +) from utils import get_git_commit, load_config +# Flags that consume the next token as their value (e.g. -n 4, -k expr). +_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} + + +def apply_test_override(run_cmd, test_path): + """Replace positional test path(s) in a pytest stage command. + + For example: ``pytest tests/ -n 4 ...`` becomes + ``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is + ``tests/test_gemm.py``. + """ + parts = shlex.split(run_cmd) + + if not parts or parts[0] != "pytest": + return run_cmd + + result = ["pytest", test_path] + skip_next = False + + for p in parts[1:]: + if skip_next: + result.append(p) + skip_next = False + continue + + if p.startswith("-"): + result.append(p) + if p in _PYTEST_VALUE_FLAGS: + skip_next = True + continue + + # Skip existing test paths; the override is already in result[1]. 
+ if not ("/" in p or p.endswith(".py") or "::" in p): + result.append(p) + + return shlex.join(result) + def build_results_dir(base, platform, stages, commit): """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" @@ -57,7 +100,7 @@ def build_runner_script(): name="${!name_var}" cmd="${!cmd_var}" echo "========== Stage: $name ==========" - eval "$cmd" || failed=1 + [ -n "$cmd" ] && { eval "$cmd" || failed=1; } done echo "========== Summary ==========" if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then @@ -130,7 +173,7 @@ def build_docker_args( args.append("-e") args.append(f"STAGE_{i + 1}_NAME={s['name']}") args.append("-e") - args.append(f"STAGE_{i + 1}_CMD={s['run']}") + args.append(f"STAGE_{i + 1}_CMD={s.get('run', '')}") # Platform-specific device access for flag in job.get("docker_args", []): @@ -155,6 +198,10 @@ def build_docker_args( # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, # control visible GPUs via CUDA_VISIBLE_DEVICES. args.extend(["-e", f"CUDA_VISIBLE_DEVICES={gpu_id}"]) + elif gpu_style == GPU_STYLE_MLU and gpu_id and gpu_id != "all": + # For Cambricon MLU platforms that use --privileged, + # control visible devices via MLU_VISIBLE_DEVICES. 
+ args.extend(["-e", f"MLU_VISIBLE_DEVICES={gpu_id}"]) memory = resources.get("memory") @@ -195,7 +242,8 @@ def resolve_job_names(jobs, platform, job=None): if job: matches = [ - name for name, cfg in jobs.items() + name + for name, cfg in jobs.items() if cfg.get("platform") == platform and cfg.get("short_name") == job ] @@ -208,9 +256,7 @@ def resolve_job_names(jobs, platform, job=None): return matches - matches = [ - name for name, cfg in jobs.items() if cfg.get("platform") == platform - ] + matches = [name for name, cfg in jobs.items() if cfg.get("platform") == platform] if not matches: print(f"error: no jobs for platform {platform!r}", file=sys.stderr) @@ -227,7 +273,9 @@ def main(): default=Path(__file__).resolve().parent / "config.yaml", help="Path to config.yaml", ) - parser.add_argument("--branch", type=str, help="Override repo branch (default: config repo.branch)") + parser.add_argument( + "--branch", type=str, help="Override repo branch (default: config repo.branch)" + ) parser.add_argument( "--job", type=str, @@ -254,6 +302,11 @@ def main(): default=Path("ci-results"), help="Base directory for test results (default: ./ci-results)", ) + parser.add_argument( + "--test", + type=str, + help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', + ) parser.add_argument( "--dry-run", action="store_true", @@ -269,10 +322,8 @@ def main(): platform = detect_platform() if not platform: - print( - "error: could not detect platform (no nvidia-smi or ixsmi found)", - file=sys.stderr, - ) + tools = ", ".join(ResourcePool.GPU_QUERY_TOOLS.values()) + print(f"error: could not detect platform (no {tools} found)", file=sys.stderr) sys.exit(1) print(f"platform: {platform}", file=sys.stderr) @@ -294,11 +345,20 @@ def main(): stages = [s for s in all_stages if s["name"] == args.stage] if not stages: - print(f"error: stage {args.stage!r} not found in {job_name}", file=sys.stderr) + print( + f"error: stage {args.stage!r} not found in {job_name}", + file=sys.stderr, + ) sys.exit(1) else: stages = all_stages + if args.test: + stages = [ + {**s, "run": apply_test_override(s.get("run", ""), args.test)} + for s in stages + ] + job_platform = job.get("platform", platform) commit = get_git_commit() results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py index aa181c4..e51af2a 100644 --- a/.ci/tests/test_agent.py +++ b/.ci/tests/test_agent.py @@ -2,9 +2,8 @@ import hmac import json import threading -import time from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -81,7 +80,12 @@ def mock_resource_pool(): pool.platform = "nvidia" pool.allocate.return_value = ([0], True) pool.release.return_value = None - pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + pool.get_status.return_value = { + "platform": "nvidia", + "gpus": [], + "allocated_gpu_ids": [], + "system": {}, + } return pool @@ -115,7 +119,6 @@ def test_select_jobs_invalid_name(agent_config): agent.select_jobs(agent_config, job_name="not_exist") - # 
--------------------------------------------------------------------------- # verify_signature # --------------------------------------------------------------------------- @@ -171,13 +174,21 @@ def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch) monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, + agent_config, + "nvidia", + mock_resource_pool, results_dir=Path("/tmp/test-results"), - no_status=True, dry_run=True, + no_status=True, + dry_run=True, ) - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config, - results_dir=Path("/tmp/test-results")) - jid = scheduler.submit(req) + req = agent.JobRequest( + "nvidia_gpu", + "master", + "abc123", + agent_config, + results_dir=Path("/tmp/test-results"), + ) + scheduler.submit(req) results = scheduler.wait_all() assert len(results) == 1 assert results[0].state == "success" @@ -186,11 +197,19 @@ def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch) def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): pool = MagicMock(spec=res.ResourcePool) pool.allocate.return_value = ([], False) - pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + pool.get_status.return_value = { + "platform": "nvidia", + "gpus": [], + "allocated_gpu_ids": [], + "system": {}, + } scheduler = agent.Scheduler( - agent_config, "nvidia", pool, - no_status=True, dry_run=False, + agent_config, + "nvidia", + pool, + no_status=True, + dry_run=False, ) req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) @@ -202,8 +221,11 @@ def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): def test_scheduler_get_status(agent_config, mock_resource_pool): scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + 
mock_resource_pool, + no_status=True, + dry_run=True, ) status = scheduler.get_status() @@ -256,11 +278,17 @@ def _urlopen_no_proxy(url_or_req, **kwargs): def test_health_endpoint(agent_config, mock_resource_pool): scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, + agent_config, + "nvidia", + mock_resource_pool, no_status=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", ) port = server.server_address[1] @@ -280,11 +308,18 @@ def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", results_dir=Path("/tmp/test-results"), ) port = server.server_address[1] @@ -314,12 +349,19 @@ def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) secret = "test-secret" server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", webhook_secret=secret, results_dir=Path("/tmp/test-results"), ) @@ -330,10 +372,12 @@ def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): import urllib.request - payload = json.dumps({ - "ref": "refs/heads/master", - "after": "abc123def456", - }).encode() + payload = json.dumps( + { + "ref": "refs/heads/master", + "after": 
"abc123def456", + } + ).encode() sig = "sha256=" + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() req = urllib.request.Request( @@ -356,11 +400,17 @@ def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): def test_webhook_invalid_signature(agent_config, mock_resource_pool): scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, + agent_config, + "nvidia", + mock_resource_pool, no_status=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", webhook_secret="real-secret", ) port = server.server_address[1] @@ -401,11 +451,18 @@ def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", api_token="my-secret-token", results_dir=Path("/tmp/test-results"), ) @@ -438,11 +495,18 @@ def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypat monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", api_token="my-secret-token", results_dir=Path("/tmp/test-results"), ) diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py index 0efa36e..edb2915 100644 --- a/.ci/tests/test_github_status.py +++ 
b/.ci/tests/test_github_status.py @@ -1,7 +1,6 @@ import json -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock -import pytest import github_status as gh @@ -57,7 +56,9 @@ def test_post_status_no_token(monkeypatch): def test_post_status_missing_owner(): - result = gh.post_commit_status("", "repo", "abc123", "success", "ctx", "desc", token="tok") + result = gh.post_commit_status( + "", "repo", "abc123", "success", "ctx", "desc", token="tok" + ) assert result is False diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py index b75043c..ac15b7e 100644 --- a/.ci/tests/test_resource.py +++ b/.ci/tests/test_resource.py @@ -1,6 +1,5 @@ import threading -import pytest import ci_resource as res @@ -11,13 +10,17 @@ def test_gpu_info_fields(): - g = res.GpuInfo(index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50) + g = res.GpuInfo( + index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50 + ) assert g.index == 0 assert g.memory_total_mb == 8000 def test_system_resources_fields(): - s = res.SystemResources(total_memory_mb=32000, available_memory_mb=16000, cpu_count=8) + s = res.SystemResources( + total_memory_mb=32000, available_memory_mb=16000, cpu_count=8 + ) assert s.cpu_count == 8 @@ -90,7 +93,7 @@ def test_detect_system_resources(monkeypatch, tmp_path): "MemAvailable: 20000000 kB\n" ) - import io + _real_open = open def fake_open(path, **kw): diff --git a/tests/test_gemm.py b/tests/test_gemm.py index 43a47b6..491fb47 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -48,6 +48,10 @@ def test_gemm( if device == "mlu" and (trans_a or trans_b): pytest.skip("transposing is not currently supported on MLU") + # cnnlBatchMatMulEx does not accept bfloat16 inputs on MLU. 
+ if device == "mlu" and dtype == torch.bfloat16: + pytest.skip("bfloat16 is not supported by cnnlBatchMatMulEx") + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device) @@ -97,8 +101,10 @@ def _torch_gemm(a, b, alpha=1.0, beta=1.0, trans_a=False, trans_b=False, c=None) return torch.baddbmm(c, a, b, beta=beta, alpha=alpha, out=c) except RuntimeError: - c_original = c.clone() - torch.matmul(a, b, out=c) - c.mul_(alpha).add_(c_original, alpha=beta) + # Fallback for backends that don't support addmm/baddbmm (e.g. CPU float16/bfloat16): + # compute in float32 and cast back. + c_original = c.float() + result = torch.matmul(a.float(), b.float()) + c.copy_((alpha * result + beta * c_original).to(c.dtype)) return c From 7253bcd47871c359271a69385ac33589c40c0fe4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 02:53:51 +0000 Subject: [PATCH 13/16] docs(ci): translate README and comments to English, use ngpus for NVIDIA scheduler - Rewrite README.md entirely in English; add Cambricon to platform table and directory tree. - Translate all inline comments in config.yaml to English. - Replace `gpu_ids: "0"` with `ngpus: 1` for NVIDIA platform so the scheduler auto-picks a free GPU rather than pinning to device 0. - Add `ngpus` support to `parse_gpu_requirement` in ci_resource.py so scheduler correctly counts NVIDIA GPU demand. - Replace deprecated `gpu_count` fallback with `ngpus` in run.py `build_docker_args`. 
Co-Authored-By: Claude --- .ci/README.md | 350 +++++++++++++++++++++++---------------------- .ci/ci_resource.py | 4 + .ci/config.yaml | 26 ++-- .ci/run.py | 6 +- 4 files changed, 199 insertions(+), 187 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index 12e8094..f468d90 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,21 +1,22 @@ -# .ci — CI 镜像与流水线 +# .ci — CI Images and Pipeline ``` .ci/ -├── config.yaml # 统一配置(镜像、job、Agent 定义) -├── utils.py # 共享工具(load_config、normalize_config、get_git_commit) -├── agent.py # Runner Agent(调度、Webhook、远程触发) -├── build.py # 镜像构建 -├── run.py # CI 流水线执行(Docker 层) -├── ci_resource.py # GPU/内存资源检测与分配 -├── github_status.py # GitHub Commit Status 上报 +├── config.yaml # Unified config (images, jobs, agent definitions) +├── utils.py # Shared utilities (load_config, normalize_config, get_git_commit) +├── agent.py # Runner Agent (scheduler, webhooks, remote dispatch) +├── build.py # Image builder +├── run.py # CI pipeline runner (Docker layer) +├── ci_resource.py # GPU/memory detection and allocation +├── github_status.py # GitHub Commit Status reporting ├── images/ │ ├── nvidia/Dockerfile │ ├── iluvatar/Dockerfile │ ├── metax/Dockerfile │ ├── moore/Dockerfile +│ ├── cambricon/Dockerfile │ └── ascend/Dockerfile -└── tests/ # 单元测试 +└── tests/ # Unit tests ├── conftest.py ├── test_agent.py ├── test_build.py @@ -25,14 +26,14 @@ └── test_utils.py ``` -**前置依赖**:Docker、Python 3.10+、`pip install pyyaml` +**Prerequisites**: Docker, Python 3.10+, `pip install pyyaml` --- -## 配置文件 `config.yaml` +## Configuration `config.yaml` -配置以 **platform** 为顶级结构,每个平台包含镜像定义、平台级默认值和 job 列表。 -加载时自动展平为 `{platform}_{job}` 格式(如 `nvidia_gpu`)。 +Config uses a **platform-centric** top-level structure. Each platform defines its image, platform-level defaults, and job list. +At load time, jobs are flattened to `{platform}_{job}` format (e.g., `nvidia_gpu`). 
```yaml repo: @@ -42,7 +43,7 @@ repo: github: status_context_prefix: "ci/infiniops" -agents: # 远程 Agent 地址(CLI 跨机器触发用) +agents: # Remote agent URLs (used by CLI for cross-machine dispatch) nvidia: url: http://nvidia-host:8080 iluvatar: @@ -50,13 +51,13 @@ agents: # 远程 Agent 地址(CLI 跨机器 platforms: nvidia: - image: # 镜像定义 + image: # Image definition dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 setup: pip install .[dev] --no-build-isolation jobs: - gpu: # 展平后为 nvidia_gpu + gpu: # Flattened as nvidia_gpu resources: gpu_ids: "0" # "0" | "0,2" | "all" memory: 32GB @@ -73,7 +74,7 @@ platforms: BASE_IMAGE: corex:qs_pj20250825 APT_MIRROR: http://archive.ubuntu.com/ubuntu PIP_INDEX_URL: https://pypi.org/simple - docker_args: # 平台级 docker 参数,所有 job 继承 + docker_args: # Platform-level docker args, inherited by all jobs - "--privileged" - "--cap-add=ALL" - "--pid=host" @@ -85,10 +86,10 @@ platforms: - /lib/modules:/lib/modules setup: pip install .[dev] --no-build-isolation jobs: - gpu: # 展平后为 iluvatar_gpu + gpu: # Flattened as iluvatar_gpu resources: gpu_ids: "0" - gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载 + gpu_style: none # CoreX: passthrough via --privileged + /dev mount memory: 32GB shm_size: 16g timeout: 3600 @@ -97,170 +98,171 @@ platforms: run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml ``` -### 配置层级说明 +### Config hierarchy -| 层级 | 字段 | 说明 | +| Level | Field | Description | |---|---|---| -| **平台级** | `image` | 镜像定义(dockerfile、build_args) | -| | `image_tag` | 默认镜像 tag(默认 `latest`) | -| | `docker_args` | 额外 docker run 参数(如 `--privileged`) | -| | `volumes` | 额外挂载卷 | -| | `setup` | 容器内 setup 命令 | -| | `env` | 注入容器环境变量 | -| **Job 级** | `resources.gpu_ids` | GPU 设备 ID | -| | `resources.gpu_style` | GPU 透传方式:`nvidia`(默认)或 `none` | -| | `resources.memory` | 容器内存限制 | -| | `resources.shm_size` | 共享内存大小 | -| | `resources.timeout` | 容器内脚本最大运行秒数 | -| | `stages` | 执行阶段列表 | -| | 以上平台级字段 | Job 
可覆盖任意平台级默认值 | +| **Platform** | `image` | Image definition (dockerfile, build_args) | +| | `image_tag` | Default image tag (defaults to `latest`) | +| | `docker_args` | Extra `docker run` args (e.g., `--privileged`) | +| | `volumes` | Extra volume mounts | +| | `setup` | In-container setup command | +| | `env` | Injected container env vars | +| **Job** | `resources.gpu_ids` | GPU device IDs | +| | `resources.gpu_style` | GPU passthrough: `nvidia` (default) or `none` | +| | `resources.memory` | Container memory limit | +| | `resources.shm_size` | Shared memory size | +| | `resources.timeout` | Max run time in seconds | +| | `stages` | Execution stage list | +| | Any platform field | Jobs can override any platform-level default | --- -## 镜像构建 `build.py` +## Image builder `build.py` -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | 构建平台,默认 `all` | -| `--commit` | 指定 commit ref 作为镜像 tag(默认 HEAD) | -| `--force` | 跳过 Dockerfile 变更检测 | -| `--dry-run` | 打印命令不执行 | +| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | Target platform (default: `all`) | +| `--commit` | Use specific commit ref as image tag (default: HEAD) | +| `--force` | Skip Dockerfile change detection | +| `--dry-run` | Print commands without executing | ```bash -# 检测变更后构建(无变更自动跳过) +# Build with change detection (skips if no Dockerfile changes) python .ci/build.py --platform nvidia -# 构建 Iluvatar 镜像 +# Build Iluvatar image python .ci/build.py --platform iluvatar --force -# 强制构建全部 +# Force build all platforms python .ci/build.py --force ``` -构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 -代理、`no_proxy` 自动从宿主机环境变量透传到 `docker build`。 +Build artifacts are stored as local Docker image tags: `infiniops-ci/:` and `:latest`. +Proxy and `no_proxy` env vars are forwarded from the host to `docker build` automatically. 
-> `--push` 为预留功能,需在 `config.yaml` 中配置 `registry` 段后方可使用。 +> `--push` is reserved for future use; requires a `registry` section in `config.yaml`. --- -## 流水线执行 `run.py` +## Pipeline runner `run.py` -平台自动发现(通过检测 `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`),无需手动指定。 +Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi` on PATH), no manual specification needed. -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--config` | 配置文件路径(默认 `.ci/config.yaml`) | -| `--job` | job 名称:短名(`gpu`)或完整名(`nvidia_gpu`)。缺省运行当前平台所有 job | -| `--branch` | 覆盖克隆分支(默认读 config `repo.branch`) | -| `--stage` | 只运行指定 stage | -| `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID(nvidia 通过 `--gpus`,其他平台通过 `CUDA_VISIBLE_DEVICES`) | -| `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | -| `--dry-run` | 打印 docker 命令不执行 | +| `--config` | Config file path (default: `.ci/config.yaml`) | +| `--job` | Job name: short (`gpu`) or full (`nvidia_gpu`). Defaults to all jobs for the current platform | +| `--branch` | Override clone branch (default: config `repo.branch`) | +| `--stage` | Run only the specified stage | +| `--image-tag` | Override image tag | +| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) | +| `--results-dir` | Host directory mounted to `/workspace/results` inside the container | +| `--dry-run` | Print docker command without executing | ```bash -# 最简用法:自动检测平台,运行所有 job,使用 config 默认分支 +# Simplest usage: auto-detect platform, run all jobs, use config default branch python .ci/run.py -# 指定 job 短名 +# Specify short job name python .ci/run.py --job gpu -# 完整 job 名(向后兼容) +# Full job name (backward compatible) python .ci/run.py --job nvidia_gpu -# 只跑 test stage,预览命令 +# Run only the test stage, preview mode python .ci/run.py --job gpu --stage test --dry-run ``` -容器内执行流程:`git clone` → `checkout` → `setup` → stages。 -代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 +Container execution flow: `git clone` → `checkout` → 
`setup` → stages. +Proxy vars are forwarded from the host. Test results are written to `--results-dir`. Each run uses a clean environment (no host pip cache mounted). --- -## 平台差异 +## Platform differences -| 平台 | GPU 透传方式 | 基础镜像 | 备注 | +| Platform | GPU passthrough | Base image | Notes | |---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | -| Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | -| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA 运行时,通过 `mx-smi` 检测 | -| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA 运行时,通过 `mthreads-gmi` 检测 | -| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善,镜像和 job 尚未就绪 | +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | Standard CUDA | +| Iluvatar | `--privileged` + `/dev` mount | `corex:qs_pj20250825` | CoreX runtime, CUDA compatible | +| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA runtime, detected via `mx-smi` | +| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA runtime, detected via `mthreads-gmi` | +| Cambricon | `--privileged` | `cambricon/pytorch:v1.25.3` | Neuware runtime, detected via `cnmon` | +| Ascend | TODO | `ascend-pytorch:24.0.0` | Not ready, image and jobs pending | --- ## Runner Agent `agent.py` -Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感知的动态调度,以及跨机器远程触发。 +The Runner Agent supports CLI manual dispatch, GitHub webhook triggers, resource-aware dynamic scheduling, and cross-machine remote dispatch. 
-### CLI 手动执行 +### CLI manual execution ```bash -# 运行所有 job(分发到远程 Agent,使用 config 默认分支) +# Run all jobs (dispatched to remote agents, using config default branch) python .ci/agent.py run -# 指定分支 +# Specify branch python .ci/agent.py run --branch feat/xxx -# 运行指定 job +# Run a specific job python .ci/agent.py run --job nvidia_gpu -# 按平台运行 +# Filter by platform python .ci/agent.py run --platform nvidia -# 预览命令 +# Preview mode python .ci/agent.py run --dry-run ``` -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--branch` | 测试分支(默认读 config `repo.branch`) | -| `--job` | 指定 job 名称 | -| `--platform` | 按平台过滤 job | -| `--commit` | 覆盖 commit SHA | -| `--image-tag` | 覆盖镜像 tag | -| `--dry-run` | 预览模式 | +| `--branch` | Test branch (default: config `repo.branch`) | +| `--job` | Specific job name | +| `--platform` | Filter jobs by platform | +| `--commit` | Override commit SHA | +| `--image-tag` | Override image tag | +| `--dry-run` | Preview mode | -### Webhook 服务 +### Webhook server -每台平台机器部署一个 Agent 实例(平台自动发现): +Deploy one Agent instance per platform machine (platform is auto-detected): ```bash -# NVIDIA 机器 +# NVIDIA machine python .ci/agent.py serve --port 8080 -# Iluvatar 机器 +# Iluvatar machine python .ci/agent.py serve --port 8080 -# MetaX 机器 +# MetaX machine python .ci/agent.py serve --port 8080 ``` -`serve` 子命令额外参数: +Additional `serve` flags: -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--port` | 监听端口(默认 8080) | -| `--host` | 监听地址(默认 `0.0.0.0`) | -| `--webhook-secret` | GitHub Webhook 签名密钥(或 `WEBHOOK_SECRET` 环境变量) | -| `--api-token` | `/api/run` Bearer 认证令牌(或 `AGENT_API_TOKEN` 环境变量) | -| `--results-dir` | 结果目录(默认 `ci-results`) | -| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | - -| 端点 | 方法 | 说明 | +| `--port` | Listen port (default: 8080) | +| `--host` | Listen address (default: `0.0.0.0`) | +| `--webhook-secret` | GitHub webhook signing secret (or `WEBHOOK_SECRET` env var) | +| `--api-token` | `/api/run` Bearer auth token (or `AGENT_API_TOKEN` env var) | +| 
`--results-dir` | Results directory (default: `ci-results`) | +| `--utilization-threshold` | GPU idle threshold percentage (default: 10) | + +| Endpoint | Method | Description | |---|---|---| -| `/webhook` | POST | GitHub Webhook(push/pull_request) | -| `/api/run` | POST | 远程触发 job | -| `/api/job/{id}` | GET | 查询 job 状态 | -| `/health` | GET | 健康检查 | -| `/status` | GET | 队列 + 资源状态 | +| `/webhook` | POST | GitHub webhook (push/pull_request) | +| `/api/run` | POST | Remote job trigger | +| `/api/job/{id}` | GET | Query job status | +| `/health` | GET | Health check | +| `/status` | GET | Queue + resource status | -Webhook 支持 `X-Hub-Signature-256` 签名验证,通过 `--webhook-secret` 或 `WEBHOOK_SECRET` 环境变量配置。 +Webhook supports `X-Hub-Signature-256` signature verification via `--webhook-secret` or `WEBHOOK_SECRET` env var. -### 远程 Agent 配置 +### Remote agent configuration -在 `config.yaml` 中配置各平台 Agent 地址,CLI 执行时自动将远程 job 分发到对应 Agent: +Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote jobs to the corresponding agents: ```yaml agents: @@ -274,111 +276,111 @@ agents: url: http://moore-host:8080 ``` -### 资源调度 +### Resource scheduling -Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: -- GPU 利用率 < 阈值(默认 10%)且未被 Agent 分配 → 可用 -- 资源不足时 job 自动排队,已完成 job 释放资源后自动调度排队任务 +The Agent auto-detects GPU utilization and system memory to dynamically determine parallelism: +- GPU utilization < threshold (default 10%) and not allocated by Agent → available +- When resources are insufficient, jobs are queued automatically; completed jobs release resources and trigger scheduling of queued tasks ### GitHub Status -设置 `GITHUB_TOKEN` 环境变量后,Agent 会自动上报 commit status: -- `pending` — job 开始执行 -- `success` / `failure` — job 执行完成 +Set the `GITHUB_TOKEN` env var and the Agent will automatically report commit status: +- `pending` — job started +- `success` / `failure` — job completed -Status context 格式:`ci/infiniops/{job_name}` +Status context format: `ci/infiniops/{job_name}` --- -## 多机部署指南 +## 
Multi-machine deployment guide -以 NVIDIA + Iluvatar + MetaX + Moore 多平台为例,说明如何在多台机器上部署 Agent 并实现跨平台并行测试。 +Example with NVIDIA + Iluvatar + MetaX + Moore multi-platform setup, showing how to deploy agents across machines for cross-platform parallel testing. -### 前置条件(所有机器共同) +### Prerequisites (all machines) ```bash -# 1. Python 3.10+ 和依赖 +# 1. Python 3.10+ and dependencies pip install pyyaml -# 2. Docker 已安装 +# 2. Docker installed docker --version -# 3. 克隆仓库 +# 3. Clone the repository git clone https://github.com/InfiniTensor/InfiniOps.git cd InfiniOps ``` -### NVIDIA 机器配置 +### NVIDIA machine setup ```bash -# 1. 安装 NVIDIA Container Toolkit -# 参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html +# 1. Install NVIDIA Container Toolkit +# See: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html -# 2. 验证 GPU 可见 +# 2. Verify GPU visibility nvidia-smi -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform nvidia ``` -### Iluvatar 机器配置 +### Iluvatar machine setup ```bash -# 1. 确认 CoreX 运行时已安装 +# 1. Verify CoreX runtime is installed ixsmi -# 2. 确认基础镜像已导入(非公开镜像,需提前准备) -docker images | grep corex # 应有 corex:qs_pj20250825 +# 2. Verify base image is imported (non-public, must be prepared in advance) +docker images | grep corex # Should show corex:qs_pj20250825 -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform iluvatar ``` -### MetaX 机器配置 +### MetaX machine setup ```bash -# 1. 确认 MACA 运行时已安装 +# 1. Verify MACA runtime is installed mx-smi -# 2. 确认基础镜像已导入(非公开镜像,需提前准备) -docker images | grep maca-pytorch # 应有 maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 +# 2. Verify base image is imported (non-public, must be prepared in advance) +docker images | grep maca-pytorch # Should show maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform metax ``` -### Moore 机器配置 +### Moore machine setup ```bash -# 1. 
确认 MUSA 运行时已安装 +# 1. Verify MUSA runtime is installed mthreads-gmi -# 2. 确认基础镜像已导入(非公开镜像,需提前准备) -docker images | grep vllm_musa # 应有 vllm_musa:20251112_hygon +# 2. Verify base image is imported (non-public, must be prepared in advance) +docker images | grep vllm_musa # Should show vllm_musa:20251112_hygon -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform moore ``` -### 启动 Agent 服务 +### Start Agent services -在各自机器上启动 Agent: +Start the Agent on each machine: ```bash -# NVIDIA 机器(平台自动发现) +# NVIDIA machine (platform auto-detected) python .ci/agent.py serve --port 8080 -# Iluvatar 机器(平台自动发现) +# Iluvatar machine (platform auto-detected) python .ci/agent.py serve --port 8080 -# MetaX 机器(平台自动发现) +# MetaX machine (platform auto-detected) python .ci/agent.py serve --port 8080 -# Moore 机器(平台自动发现) +# Moore machine (platform auto-detected) python .ci/agent.py serve --port 8080 ``` -验证连通性: +Verify connectivity: ```bash curl http://:8080/health @@ -387,9 +389,9 @@ curl http://:8080/health curl http://:8080/health ``` -### 配置远程 Agent 地址 +### Configure remote agent URLs -在触发端的 `config.yaml` 中添加 `agents` 段: +Add the `agents` section to `config.yaml` on the trigger machine: ```yaml agents: @@ -403,82 +405,82 @@ agents: url: http://:8080 ``` -### 触发跨平台测试 +### Trigger cross-platform tests ```bash -# 一键运行所有平台的 job(使用 config 默认分支) +# Run all platform jobs at once (using config default branch) python .ci/agent.py run -# 预览模式(不实际执行) +# Preview mode (no actual execution) python .ci/agent.py run --dry-run -# 只运行指定平台 +# Run only a specific platform python .ci/agent.py run --platform nvidia ``` -### 可选配置 +### Optional configuration -#### GitHub Status 上报 +#### GitHub Status reporting -所有机器均设置环境变量,各自上报所属平台的测试状态: +Set the env var on all machines so each reports its own platform's test status: ```bash export GITHUB_TOKEN=ghp_xxxxxxxxxxxx ``` -#### API Token 认证 +#### API Token authentication -Agent 暴露在非可信网络时,建议启用 Token 认证: +When agents are exposed on untrusted networks, enable 
token auth: ```bash -# 启动 Agent 时指定 token +# Specify token at startup python .ci/agent.py serve --port 8080 --api-token -# 或通过环境变量 +# Or via env var export AGENT_API_TOKEN= ``` -#### GitHub Webhook 自动触发 +#### GitHub Webhook auto-trigger -在 GitHub repo → Settings → Webhooks 中为每台机器添加 Webhook: +In GitHub repo → Settings → Webhooks, add a webhook for each machine: -| 字段 | 值 | +| Field | Value | |---|---| -| Payload URL | `http://<机器IP>:8080/webhook` | +| Payload URL | `http://:8080/webhook` | | Content type | `application/json` | -| Secret | 与 `--webhook-secret` 一致 | -| Events | `push` 和 `pull_request` | +| Secret | Must match `--webhook-secret` | +| Events | `push` and `pull_request` | -启动时配置 secret: +Configure the secret at startup: ```bash python .ci/agent.py serve --port 8080 --webhook-secret -# 或通过环境变量 +# Or via env var export WEBHOOK_SECRET= ``` -### 验证清单 +### Verification checklist ```bash -# 1. 各机器单独 dry-run +# 1. Dry-run each machine individually python .ci/agent.py run --platform nvidia --dry-run python .ci/agent.py run --platform iluvatar --dry-run python .ci/agent.py run --platform metax --dry-run python .ci/agent.py run --platform moore --dry-run -# 2. 健康检查 +# 2. Health checks curl http://:8080/health curl http://:8080/health curl http://:8080/health curl http://:8080/health -# 3. 查看资源状态 +# 3. Resource status curl http://:8080/status curl http://:8080/status curl http://:8080/status curl http://:8080/status -# 4. 跨平台一键测试 +# 4. 
Cross-platform test python .ci/agent.py run --branch master ``` diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index bbf27ae..51b181f 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -429,6 +429,10 @@ def parse_gpu_requirement(job_config) -> int: if gpu_style == GPU_STYLE_NONE: return 0 + ngpus = resources.get("ngpus") + if ngpus is not None: + return int(ngpus) + gpu_ids = str(resources.get("gpu_ids", "")) if not gpu_ids: diff --git a/.ci/config.yaml b/.ci/config.yaml index 2509b40..3ac211d 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -3,13 +3,19 @@ repo: branch: master github: - status_context_prefix: "ci/infiniops" # GitHub Commit Status context 前缀 + status_context_prefix: "ci/infiniops" -# agents: # 远程 Agent 地址(CLI 跨机器触发用) +# agents: # nvidia: # url: http://nvidia-host:8080 # iluvatar: # url: http://iluvatar-host:8080 +# metax: +# url: http://metax-host:8080 +# moore: +# url: http://moore-host:8080 +# cambricon: +# url: http://cambricon-host:8080 platforms: nvidia: @@ -21,11 +27,11 @@ platforms: jobs: gpu: resources: - gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + ngpus: 1 # Scheduler auto-picks this many free GPUs memory: 32GB - shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 - # env: # 可选,注入容器环境变量 + # env: # Optional: inject container env vars # MY_VAR: value stages: - name: test @@ -52,8 +58,8 @@ platforms: jobs: gpu: resources: - gpu_ids: "0" # 通过 CUDA_VISIBLE_DEVICES 控制可见 GPU - gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载透传 + gpu_ids: "0" # GPU visibility via CUDA_VISIBLE_DEVICES + gpu_style: none # CoreX: passthrough via --privileged + /dev mount memory: 32GB shm_size: 16g timeout: 3600 @@ -77,7 +83,7 @@ platforms: gpu: resources: gpu_ids: "0" - gpu_style: none # MetaX 设备通过 --privileged 透传,无需 CUDA_VISIBLE_DEVICES + gpu_style: none # MetaX: passthrough via --privileged, no CUDA_VISIBLE_DEVICES memory: 32GB shm_size: 16g timeout: 3600 @@ -99,7 +105,7 
@@ platforms: gpu: resources: gpu_ids: "0" - gpu_style: none # Moore 设备通过 --privileged 透传,MTHREADS_VISIBLE_DEVICES 由基础镜像设置 + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES set by base image memory: 32GB shm_size: 16g timeout: 3600 @@ -120,7 +126,7 @@ platforms: gpu: resources: gpu_ids: "0" - gpu_style: mlu # Cambricon MLU 通过 --privileged 透传,通过 MLU_VISIBLE_DEVICES 控制可见设备 + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control memory: 32GB shm_size: 16g timeout: 3600 diff --git a/.ci/run.py b/.ci/run.py index 811ba2d..969336d 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -183,7 +183,7 @@ def build_docker_args( args.extend(["-v", vol]) gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) - gpu_count = resources.get("gpu_count", 0) + ngpus = resources.get("ngpus") gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) if gpu_style == GPU_STYLE_NVIDIA: @@ -192,8 +192,8 @@ def build_docker_args( args.extend(["--gpus", "all"]) else: args.extend(["--gpus", f'"device={gpu_id}"']) - elif gpu_count and gpu_count > 0: - args.extend(["--gpus", f"count={gpu_count}"]) + elif ngpus: + args.extend(["--gpus", f"count={ngpus}"]) elif gpu_style == GPU_STYLE_NONE and gpu_id and gpu_id != "all": # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, # control visible GPUs via CUDA_VISIBLE_DEVICES. 
From b4a43d55742be330b4dac0147d4a34b555fb92c4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 07:23:37 +0000 Subject: [PATCH 14/16] feat(ci): add --local flag to run.py for testing uncommitted changes - Mount current directory read-only into container via `-v cwd:/workspace/repo:ro` - Copy to writable `/tmp/src` inside container before setup runs, so host files are never modified by pip install or build artifacts - Simplify README: fix ngpus example, add gpu_style column, add --local docs Co-Authored-By: Claude --- .ci/README.md | 200 +++++++++++++------------------------------------- .ci/run.py | 23 +++++- 2 files changed, 70 insertions(+), 153 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index f468d90..190d012 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -59,7 +59,7 @@ platforms: jobs: gpu: # Flattened as nvidia_gpu resources: - gpu_ids: "0" # "0" | "0,2" | "all" + ngpus: 1 # Scheduler auto-picks this many free GPUs memory: 32GB shm_size: 16g timeout: 3600 @@ -108,8 +108,9 @@ platforms: | | `volumes` | Extra volume mounts | | | `setup` | In-container setup command | | | `env` | Injected container env vars | -| **Job** | `resources.gpu_ids` | GPU device IDs | -| | `resources.gpu_style` | GPU passthrough: `nvidia` (default) or `none` | +| **Job** | `resources.ngpus` | Number of GPUs — scheduler auto-picks free ones (NVIDIA only) | +| | `resources.gpu_ids` | Static GPU device IDs (e.g., `"0"`, `"0,2"`) | +| | `resources.gpu_style` | GPU passthrough: `nvidia` (default), `none`, or `mlu` | | | `resources.memory` | Container memory limit | | | `resources.shm_size` | Shared memory size | | | `resources.timeout` | Max run time in seconds | @@ -147,7 +148,7 @@ Proxy and `no_proxy` env vars are forwarded from the host to `docker build` auto ## Pipeline runner `run.py` -Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi` on PATH), no manual specification needed. 
+Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnmon` on PATH), no manual specification needed. | Flag | Description | |---|---| @@ -157,7 +158,9 @@ Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi` on P | `--stage` | Run only the specified stage | | `--image-tag` | Override image tag | | `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) | +| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) | | `--results-dir` | Host directory mounted to `/workspace/results` inside the container | +| `--local` | Mount current directory (read-only) instead of cloning from git | | `--dry-run` | Print docker command without executing | ```bash @@ -172,23 +175,29 @@ python .ci/run.py --job nvidia_gpu # Run only the test stage, preview mode python .ci/run.py --job gpu --stage test --dry-run + +# Test local uncommitted changes without pushing +python .ci/run.py --local ``` Container execution flow: `git clone` → `checkout` → `setup` → stages. +With `--local`, the current directory is mounted read-only at `/workspace/repo` and copied to a writable temp directory inside the container before setup runs — host files are never modified. Proxy vars are forwarded from the host. Test results are written to `--results-dir`. Each run uses a clean environment (no host pip cache mounted). 
--- ## Platform differences -| Platform | GPU passthrough | Base image | Notes | -|---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | Standard CUDA | -| Iluvatar | `--privileged` + `/dev` mount | `corex:qs_pj20250825` | CoreX runtime, CUDA compatible | -| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA runtime, detected via `mx-smi` | -| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA runtime, detected via `mthreads-gmi` | -| Cambricon | `--privileged` | `cambricon/pytorch:v1.25.3` | Neuware runtime, detected via `cnmon` | -| Ascend | TODO | `ascend-pytorch:24.0.0` | Not ready, image and jobs pending | +| Platform | GPU passthrough | `gpu_style` | Base image | Detection tool | +|---|---|---|---|---| +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvidia` (default) | `nvcr.io/nvidia/pytorch:24.10-py3` | `nvidia-smi` | +| Iluvatar | `--privileged` + `/dev` mount | `none` | `corex:qs_pj20250825` | `ixsmi` | +| MetaX | `--privileged` | `none` | `maca-pytorch:3.2.1.4-...` | `mx-smi` | +| Moore | `--privileged` | `none` | `vllm_musa:20251112_hygon` | `mthreads-gmi` | +| Cambricon | `--privileged` | `mlu` | `cambricon/pytorch:v1.25.3` | `cnmon` | +| Ascend | TODO | — | `ascend-pytorch:24.0.0` | — | + +`gpu_style` controls the Docker device injection mechanism: `nvidia` uses `--gpus`, `none` uses `CUDA_VISIBLE_DEVICES` (or skips injection for Moore), `mlu` uses `MLU_VISIBLE_DEVICES`. 
 ---
 
@@ -220,22 +229,15 @@ python .ci/agent.py run --dry-run
 | `--branch` | Test branch (default: config `repo.branch`) |
 | `--job` | Specific job name |
 | `--platform` | Filter jobs by platform |
-| `--commit` | Override commit SHA |
+| `--commit` | Override commit SHA used for GitHub status reporting |
 | `--image-tag` | Override image tag |
 | `--dry-run` | Preview mode |
 
 ### Webhook server
 
-Deploy one Agent instance per platform machine (platform is auto-detected):
+Deploy one Agent instance per platform machine (platform is auto-detected). On each machine:
 
 ```bash
-# NVIDIA machine
-python .ci/agent.py serve --port 8080
-
-# Iluvatar machine
-python .ci/agent.py serve --port 8080
-
-# MetaX machine
 python .ci/agent.py serve --port 8080
 ```
 
@@ -267,13 +269,13 @@ Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote j
 ```yaml
 agents:
   nvidia:
-    url: http://nvidia-host:8080
+    url: http://<nvidia-ip>:8080
   iluvatar:
-    url: http://iluvatar-host:8080
+    url: http://<iluvatar-ip>:8080
   metax:
-    url: http://metax-host:8080
+    url: http://<metax-ip>:8080
   moore:
-    url: http://moore-host:8080
+    url: http://<moore-ip>:8080
 ```
 
 ### Resource scheduling
@@ -294,116 +296,28 @@ Status context format: `ci/infiniops/{job_name}`
 
 ## Multi-machine deployment guide
 
-Example with NVIDIA + Iluvatar + MetaX + Moore multi-platform setup, showing how to deploy agents across machines for cross-platform parallel testing.
-
-### Prerequisites (all machines)
-
-```bash
-# 1. Python 3.10+ and dependencies
-pip install pyyaml
-
-# 2. Docker installed
-docker --version
-
-# 3. Clone the repository
-git clone https://github.com/InfiniTensor/InfiniOps.git
-cd InfiniOps
-```
-
-### NVIDIA machine setup
-
-```bash
-# 1. Install NVIDIA Container Toolkit
-# See: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
-
-# 2. Verify GPU visibility
-nvidia-smi
-
-# 3. Build CI image
-python .ci/build.py --platform nvidia
-```
-
-### Iluvatar machine setup
-
-```bash
-# 1. 
Verify CoreX runtime is installed -ixsmi - -# 2. Verify base image is imported (non-public, must be prepared in advance) -docker images | grep corex # Should show corex:qs_pj20250825 - -# 3. Build CI image -python .ci/build.py --platform iluvatar -``` - -### MetaX machine setup - -```bash -# 1. Verify MACA runtime is installed -mx-smi - -# 2. Verify base image is imported (non-public, must be prepared in advance) -docker images | grep maca-pytorch # Should show maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 - -# 3. Build CI image -python .ci/build.py --platform metax -``` +### Per-platform setup -### Moore machine setup - -```bash -# 1. Verify MUSA runtime is installed -mthreads-gmi +Each machine needs Docker installed, the platform runtime, and the base CI image built. -# 2. Verify base image is imported (non-public, must be prepared in advance) -docker images | grep vllm_musa # Should show vllm_musa:20251112_hygon - -# 3. Build CI image -python .ci/build.py --platform moore -``` +| Platform | Runtime check | Base image | Build command | +|---|---|---|---| +| NVIDIA | `nvidia-smi` (+ [Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) | `nvcr.io/nvidia/pytorch:24.10-py3` (public) | `python .ci/build.py --platform nvidia` | +| Iluvatar | `ixsmi` | `corex:qs_pj20250825` (import in advance) | `python .ci/build.py --platform iluvatar` | +| MetaX | `mx-smi` | `maca-pytorch:3.2.1.4-...` (import in advance) | `python .ci/build.py --platform metax` | +| Moore | `mthreads-gmi` | `vllm_musa:20251112_hygon` (import in advance) | `python .ci/build.py --platform moore` | ### Start Agent services -Start the Agent on each machine: +On each machine (platform is auto-detected): ```bash -# NVIDIA machine (platform auto-detected) -python .ci/agent.py serve --port 8080 - -# Iluvatar machine (platform auto-detected) -python .ci/agent.py serve --port 8080 - -# MetaX machine (platform auto-detected) -python .ci/agent.py serve 
--port 8080
-
-# Moore machine (platform auto-detected)
 python .ci/agent.py serve --port 8080
 ```
 
-Verify connectivity:
-
-```bash
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-```
-
 ### Configure remote agent URLs
 
-Add the `agents` section to `config.yaml` on the trigger machine:
-
-```yaml
-agents:
-  nvidia:
-    url: http://:8080
-  iluvatar:
-    url: http://:8080
-  metax:
-    url: http://:8080
-  moore:
-    url: http://:8080
-```
+On the trigger machine, add the `agents` section to `config.yaml` (see [Remote agent configuration](#remote-agent-configuration) above for the format).
 
 ### Trigger cross-platform tests
 
@@ -433,11 +347,8 @@ export GITHUB_TOKEN=ghp_xxxxxxxxxxxx
 When agents are exposed on untrusted networks, enable token auth:
 
 ```bash
-# Specify token at startup
-python .ci/agent.py serve --port 8080 --api-token 
-
-# Or via env var
-export AGENT_API_TOKEN=
+python .ci/agent.py serve --port 8080 --api-token <token>
+# Or: export AGENT_API_TOKEN=<token>
 ```
 
 #### GitHub Webhook auto-trigger
@@ -451,36 +362,25 @@ In GitHub repo → Settings → Webhooks, add a webhook for each machine:
 | Secret | Must match `--webhook-secret` |
 | Events | `push` and `pull_request` |
 
-Configure the secret at startup:
-
 ```bash
 python .ci/agent.py serve --port 8080 --webhook-secret 
-
-# Or via env var
-export WEBHOOK_SECRET=
+# Or: export WEBHOOK_SECRET=<secret>
 ```
 
 ### Verification checklist
 
 ```bash
 # 1. Dry-run each machine individually
-python .ci/agent.py run --platform nvidia --dry-run
-python .ci/agent.py run --platform iluvatar --dry-run
-python .ci/agent.py run --platform metax --dry-run
-python .ci/agent.py run --platform moore --dry-run
-
-# 2. Health checks
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-
-# 3. Resource status
-curl http://:8080/status
-curl http://:8080/status
-curl http://:8080/status
-curl http://:8080/status
-
-# 4. 
Cross-platform test
+for platform in nvidia iluvatar metax moore; do
+    python .ci/agent.py run --platform $platform --dry-run
+done
+
+# 2. Health and resource checks
+for ip in <nvidia-ip> <iluvatar-ip> <metax-ip> <moore-ip>; do
+    curl http://$ip:8080/health
+    curl http://$ip:8080/status
+done
+
+# 3. Cross-platform test
 python .ci/agent.py run --branch master
 ```
diff --git a/.ci/run.py b/.ci/run.py
index 969336d..24a8867 100644
--- a/.ci/run.py
+++ b/.ci/run.py
@@ -87,9 +87,14 @@ def build_runner_script():
 set -e
 cd /workspace
 mkdir -p /workspace/results
-git clone "$REPO_URL" repo
-cd repo
-git checkout "$BRANCH"
+if [ -n "$LOCAL_SRC" ]; then
+    cp -r "$LOCAL_SRC" /tmp/src
+    cd /tmp/src
+else
+    git clone "$REPO_URL" repo
+    cd repo
+    git checkout "$BRANCH"
+fi
 echo "========== Setup =========="
 eval "$SETUP_CMD"
 set +e
@@ -120,6 +125,7 @@ def build_docker_args(
     image_tag_override,
     gpu_id_override=None,
     results_dir=None,
+    local_path=None,
 ):
     job = config["jobs"][job_name]
     platform = job.get("platform", "nvidia")
@@ -169,6 +175,10 @@ def build_docker_args(
     if results_dir:
         args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"])
 
+    if local_path:
+        args.extend(["-v", f"{local_path}:/workspace/repo:ro"])
+        args.extend(["-e", "LOCAL_SRC=/workspace/repo"])
+
     for i, s in enumerate(stages):
         args.append("-e")
         args.append(f"STAGE_{i + 1}_NAME={s['name']}")
@@ -307,6 +317,11 @@ def main():
         type=str,
         help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', ) + parser.add_argument( + "--local", + action="store_true", + help="Mount current directory (read-only) into the container instead of cloning from git", + ) parser.add_argument( "--dry-run", action="store_true", @@ -363,6 +378,7 @@ def main(): commit = get_git_commit() results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) + local_path = Path.cwd().resolve() if args.local else None docker_args = build_docker_args( config, job_name, @@ -373,6 +389,7 @@ def main(): args.image_tag, gpu_id_override=args.gpu_id, results_dir=results_dir, + local_path=local_path, ) if args.dry_run: From fab00e233dbd7eb9b21c72ee1b67b0a601b5d02c Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 08:46:46 +0000 Subject: [PATCH 15/16] style(ci): normalize comments to complete English sentences with markdown - Backtick-quote tool/package names (`torch`, `pip`, `git`, `cmake`, `coreutils-single`, `conda`) and paths in Dockerfile comments. - Add explanatory comment to the commented-out `agents:` block in `config.yaml` describing when to uncomment it. - Convert all section-header banners in `.ci/tests/` to "Tests for `FunctionName`." sentence form; fix three docstrings in `test_agent.py`. - Backtick-quote identifiers in `tests/test_gemm.py` inline comments. 
Co-Authored-By: Claude --- .ci/config.yaml | 5 ++++- .ci/images/cambricon/Dockerfile | 6 +++--- .ci/images/iluvatar/Dockerfile | 6 +++--- .ci/images/metax/Dockerfile | 4 ++-- .ci/images/moore/Dockerfile | 4 ++-- .ci/images/nvidia/Dockerfile | 2 +- .ci/tests/test_agent.py | 22 +++++++++++----------- .ci/tests/test_build.py | 8 ++++---- .ci/tests/test_github_status.py | 6 +++--- .ci/tests/test_resource.py | 18 +++++++++--------- .ci/tests/test_run.py | 18 +++++++++--------- .ci/tests/test_utils.py | 2 +- tests/test_gemm.py | 4 ++-- 13 files changed, 54 insertions(+), 51 deletions(-) diff --git a/.ci/config.yaml b/.ci/config.yaml index 3ac211d..b70e7df 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -5,6 +5,9 @@ repo: github: status_context_prefix: "ci/infiniops" +# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote +# machines via `agent.py run`. Required on the trigger machine when each platform's +# agent runs on a separate host. See the README for multi-machine deployment details. # agents: # nvidia: # url: http://nvidia-host:8080 @@ -31,7 +34,7 @@ platforms: memory: 32GB shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 - # env: # Optional: inject container env vars + # env: # Uncomment to inject extra env vars into the container. # MY_VAR: value stages: - name: test diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile index f1282d9..138f3cb 100644 --- a/.ci/images/cambricon/Dockerfile +++ b/.ci/images/cambricon/Dockerfile @@ -1,7 +1,7 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} -# Python 3.10 executables (pip-installed tools) live under /usr/local/python3.10/bin. +# Python 3.10 executables (`pip`-installed tools) live under `/usr/local/python3.10/bin`. ENV PATH=/usr/local/python3.10/bin:${PATH} ARG HTTP_PROXY @@ -11,7 +11,7 @@ ARG http_proxy ARG https_proxy ARG no_proxy -# git and cmake are pre-installed; coreutils-single covers coreutils needs. 
+# `git` and `cmake` are pre-installed; `coreutils-single` covers coreutils needs. RUN dnf install -y ninja-build && dnf clean all ARG PIP_INDEX_URL @@ -24,7 +24,7 @@ RUN pip install --no-cache-dir \ pytest-xdist \ ruff==0.15.7 -# Pin pre-installed Cambricon torch to prevent pip from replacing it with upstream version. +# Pin pre-installed Cambricon `torch` to prevent `pip` from replacing it with upstream version. RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile index f098e5f..79afc85 100644 --- a/.ci/images/iluvatar/Dockerfile +++ b/.ci/images/iluvatar/Dockerfile @@ -3,8 +3,8 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive -# CoreX runtime environment (base image sets these in /etc/bash.bashrc, -# but docker build RUN uses /bin/sh which doesn't source it) +# CoreX runtime environment (base image sets these in `/etc/bash.bashrc`, +# but `docker build` `RUN` uses `/bin/sh` which doesn't source it). ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib @@ -44,7 +44,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ RUN pip config set global.index-url https://pypi.org/simple -# Pin pre-installed CoreX torch to prevent pip from replacing it with upstream version +# Pin pre-installed CoreX `torch` to prevent `pip` from replacing it with upstream version. 
RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile index fda527c..540bc9d 100644 --- a/.ci/images/metax/Dockerfile +++ b/.ci/images/metax/Dockerfile @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive -# conda Python is used in this image +# `conda` Python is used in this image. ENV PATH=/opt/conda/bin:${PATH} ARG HTTP_PROXY @@ -37,7 +37,7 @@ RUN pip install --no-cache-dir \ pyyaml \ ruff==0.15.7 -# Pin pre-installed MetaX torch to prevent pip from replacing it with upstream version +# Pin pre-installed MetaX `torch` to prevent `pip` from replacing it with upstream version. RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile index 9a073ba..a95d9bd 100644 --- a/.ci/images/moore/Dockerfile +++ b/.ci/images/moore/Dockerfile @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive -# MUSA_HOME, PATH, LD_LIBRARY_PATH already set by base image +# `MUSA_HOME`, `PATH`, `LD_LIBRARY_PATH` already set by base image. ARG HTTP_PROXY ARG HTTPS_PROXY @@ -31,7 +31,7 @@ RUN pip install --no-cache-dir \ pytest-xdist \ ruff==0.15.7 -# Pin pre-installed torch to prevent pip from replacing it with upstream version +# Pin pre-installed `torch` to prevent `pip` from replacing it with upstream version. 
RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt ENV PIP_CONSTRAINT=/etc/pip-constraints.txt diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index 05da963..b4984da 100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -37,7 +37,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ pyyaml \ ruff==0.15.7 -# Pin pre-installed torch to prevent pip from replacing it with a different version +# Pin pre-installed `torch` to prevent `pip` from replacing it with a different version. RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py index e51af2a..73708db 100644 --- a/.ci/tests/test_agent.py +++ b/.ci/tests/test_agent.py @@ -13,7 +13,7 @@ # --------------------------------------------------------------------------- -# Fixtures +# Test fixtures. # --------------------------------------------------------------------------- @@ -90,7 +90,7 @@ def mock_resource_pool(): # --------------------------------------------------------------------------- -# select_jobs +# Tests for `select_jobs`. # --------------------------------------------------------------------------- @@ -120,7 +120,7 @@ def test_select_jobs_invalid_name(agent_config): # --------------------------------------------------------------------------- -# verify_signature +# Tests for `verify_signature`. # --------------------------------------------------------------------------- @@ -140,7 +140,7 @@ def test_verify_signature_empty(): # --------------------------------------------------------------------------- -# JobRequest / JobResult +# Tests for `JobRequest` and `JobResult`. 
# --------------------------------------------------------------------------- @@ -165,7 +165,7 @@ def test_job_result_failure(): # --------------------------------------------------------------------------- -# Scheduler +# Tests for the `Scheduler` class. # --------------------------------------------------------------------------- @@ -236,7 +236,7 @@ def test_scheduler_get_status(agent_config, mock_resource_pool): # --------------------------------------------------------------------------- -# WebhookHandler — push event parsing +# Tests for `WebhookHandler` push event parsing. # --------------------------------------------------------------------------- @@ -264,12 +264,12 @@ def test_webhook_parse_pr(): # --------------------------------------------------------------------------- -# Integration-style: webhook HTTP test +# Integration-style webhook HTTP tests. # --------------------------------------------------------------------------- def _urlopen_no_proxy(url_or_req, **kwargs): - """urlopen that bypasses any HTTP_PROXY.""" + """`urlopen` mock that bypasses any `HTTP_PROXY`.""" import urllib.request opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) @@ -442,12 +442,12 @@ def test_webhook_invalid_signature(agent_config, mock_resource_pool): # --------------------------------------------------------------------------- -# API token authentication +# Tests for API token authentication. 
# --------------------------------------------------------------------------- def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): - """When api_token is set, /api/run rejects requests without valid token.""" + """When `api_token` is set, `/api/run` rejects requests without a valid token.""" monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( @@ -491,7 +491,7 @@ def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): - """When api_token is set, /api/run accepts requests with correct Bearer token.""" + """When `api_token` is set, `/api/run` accepts requests with a correct Bearer token.""" monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py index fa2f292..4d28885 100644 --- a/.ci/tests/test_build.py +++ b/.ci/tests/test_build.py @@ -2,7 +2,7 @@ # --------------------------------------------------------------------------- -# build_image_tag +# Tests for `build_image_tag`. # --------------------------------------------------------------------------- @@ -24,7 +24,7 @@ def test_build_image_tag_commit_hash(): # --------------------------------------------------------------------------- -# has_dockerfile_changed +# Tests for `has_dockerfile_changed`. # --------------------------------------------------------------------------- @@ -54,7 +54,7 @@ def test_has_dockerfile_changed_true_on_git_error(mocker): # --------------------------------------------------------------------------- -# docker_login +# Tests for `docker_login`. 
# --------------------------------------------------------------------------- @@ -99,7 +99,7 @@ def test_docker_login_success(mocker, monkeypatch): # --------------------------------------------------------------------------- -# build_image — dry_run and proxy +# Tests for `build_image` dry-run mode and proxy forwarding. # --------------------------------------------------------------------------- diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py index edb2915..9e29c79 100644 --- a/.ci/tests/test_github_status.py +++ b/.ci/tests/test_github_status.py @@ -6,7 +6,7 @@ # --------------------------------------------------------------------------- -# parse_repo_url +# Tests for `parse_repo_url`. # --------------------------------------------------------------------------- @@ -35,7 +35,7 @@ def test_parse_repo_url_invalid(): # --------------------------------------------------------------------------- -# build_status_context +# Tests for `build_status_context`. # --------------------------------------------------------------------------- @@ -45,7 +45,7 @@ def test_build_status_context(): # --------------------------------------------------------------------------- -# post_commit_status +# Tests for `post_commit_status`. # --------------------------------------------------------------------------- diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py index ac15b7e..0db3fbb 100644 --- a/.ci/tests/test_resource.py +++ b/.ci/tests/test_resource.py @@ -5,7 +5,7 @@ # --------------------------------------------------------------------------- -# GpuInfo / SystemResources +# Tests for `GpuInfo` and `SystemResources`. # --------------------------------------------------------------------------- @@ -25,7 +25,7 @@ def test_system_resources_fields(): # --------------------------------------------------------------------------- -# detect_gpus +# Tests for `detect_gpus`. 
# --------------------------------------------------------------------------- @@ -81,7 +81,7 @@ def mock_run(cmd, **kwargs): # --------------------------------------------------------------------------- -# detect_system_resources +# Tests for `detect_system_resources`. # --------------------------------------------------------------------------- @@ -111,7 +111,7 @@ def fake_open(path, **kw): # --------------------------------------------------------------------------- -# get_free_gpus +# Tests for `get_free_gpus`. # --------------------------------------------------------------------------- @@ -135,7 +135,7 @@ class R: # --------------------------------------------------------------------------- -# allocate / release +# Tests for `allocate` and `release`. # --------------------------------------------------------------------------- @@ -200,11 +200,11 @@ class R: assert ok is True assert len(gpu_ids) == 2 - # All GPUs allocated, next allocation should fail + # All GPUs allocated; next allocation should fail. _, ok2 = pool.allocate(1) assert ok2 is False - # Release one + # Release one GPU. pool.release([gpu_ids[0]]) gpu_ids2, ok3 = pool.allocate(1) assert ok3 is True @@ -267,7 +267,7 @@ def allocate_one(): # --------------------------------------------------------------------------- -# get_status +# Tests for `get_status`. # --------------------------------------------------------------------------- @@ -291,7 +291,7 @@ class R: # --------------------------------------------------------------------------- -# parse_gpu_requirement / parse_memory_requirement +# Tests for `parse_gpu_requirement` and `parse_memory_requirement`. 
# --------------------------------------------------------------------------- diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py index 075546e..93987e5 100644 --- a/.ci/tests/test_run.py +++ b/.ci/tests/test_run.py @@ -6,7 +6,7 @@ # --------------------------------------------------------------------------- -# resolve_image +# Tests for `resolve_image`. # --------------------------------------------------------------------------- @@ -22,7 +22,7 @@ def test_resolve_image_without_registry(minimal_config): # --------------------------------------------------------------------------- -# build_runner_script +# Tests for `build_runner_script`. # --------------------------------------------------------------------------- @@ -47,7 +47,7 @@ def test_runner_script_creates_results_dir(): # --------------------------------------------------------------------------- -# build_docker_args — basic structure +# Tests for `build_docker_args` basic structure. # --------------------------------------------------------------------------- @@ -93,7 +93,7 @@ def test_docker_args_image_tag_override(minimal_config): # --------------------------------------------------------------------------- -# build_docker_args — proxy passthrough +# Tests for `build_docker_args` proxy passthrough. # --------------------------------------------------------------------------- @@ -156,7 +156,7 @@ def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): # --------------------------------------------------------------------------- -# build_docker_args — GPU flags +# Tests for `build_docker_args` GPU flags. # --------------------------------------------------------------------------- @@ -200,7 +200,7 @@ def test_docker_args_gpu_override(minimal_config): # --------------------------------------------------------------------------- -# build_docker_args — memory format +# Tests for `build_docker_args` memory format. 
# --------------------------------------------------------------------------- @@ -222,7 +222,7 @@ def test_docker_args_memory_format(minimal_config, raw, expected): # --------------------------------------------------------------------------- -# build_docker_args — stages encoding +# Tests for `build_docker_args` stages encoding. # --------------------------------------------------------------------------- @@ -249,7 +249,7 @@ def test_docker_args_multiple_stages(minimal_config): # --------------------------------------------------------------------------- -# build_docker_args — results_dir mount +# Tests for `build_docker_args` `results_dir` mount. # --------------------------------------------------------------------------- @@ -270,7 +270,7 @@ def test_docker_args_results_dir(minimal_config, tmp_path): # --------------------------------------------------------------------------- -# build_results_dir +# Tests for `build_results_dir`. # --------------------------------------------------------------------------- diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py index 2a930d3..b07011c 100644 --- a/.ci/tests/test_utils.py +++ b/.ci/tests/test_utils.py @@ -82,7 +82,7 @@ def test_normalize_preserves_top_level_keys(): def test_normalize_passthrough_flat_config(): - """Old flat format without 'platforms' key is returned as-is.""" + """Old flat format without `platforms` key is returned as-is.""" flat = { "images": {"nvidia": {}}, "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, diff --git a/tests/test_gemm.py b/tests/test_gemm.py index 491fb47..d75ac45 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -48,7 +48,7 @@ def test_gemm( if device == "mlu" and (trans_a or trans_b): pytest.skip("transposing is not currently supported on MLU") - # cnnlBatchMatMulEx does not accept bfloat16 inputs on MLU. + # `cnnlBatchMatMulEx` does not accept `bfloat16` inputs on MLU. 
if device == "mlu" and dtype == torch.bfloat16: pytest.skip("bfloat16 is not supported by cnnlBatchMatMulEx") @@ -101,7 +101,7 @@ def _torch_gemm(a, b, alpha=1.0, beta=1.0, trans_a=False, trans_b=False, c=None) return torch.baddbmm(c, a, b, beta=beta, alpha=alpha, out=c) except RuntimeError: - # Fallback for backends that don't support addmm/baddbmm (e.g. CPU float16/bfloat16): + # Fallback for backends that don't support `addmm`/`baddbmm` (e.g. CPU `float16`/`bfloat16`): # compute in float32 and cast back. c_original = c.float() result = torch.matmul(a.float(), b.float()) From bedede278c615461141cf50d9814141812246ac5 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 08:50:51 +0000 Subject: [PATCH 16/16] style(tests): backtick-quote identifiers in test_gemm.py skip message Co-Authored-By: Claude --- tests/test_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gemm.py b/tests/test_gemm.py index d75ac45..136e991 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -50,7 +50,7 @@ def test_gemm( # `cnnlBatchMatMulEx` does not accept `bfloat16` inputs on MLU. if device == "mlu" and dtype == torch.bfloat16: - pytest.skip("bfloat16 is not supported by cnnlBatchMatMulEx") + pytest.skip("`bfloat16` is not supported by `cnnlBatchMatMulEx`") a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device)