From ecafdc02fc7ecf9535eace735f19a670949f23c5 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 19 Mar 2026 06:16:40 +0000 Subject: [PATCH 01/16] feat/nv ci test --- .ci/README.md | 171 ++++++++++++++++++++++++++++ .ci/build.py | 210 +++++++++++++++++++++++++++++++++++ .ci/config.yaml | 36 ++++++ .ci/images/ascend/Dockerfile | 31 ++++++ .ci/images/nvidia/Dockerfile | 26 +++++ .ci/run.py | 195 ++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 7 files changed, 670 insertions(+), 1 deletion(-) create mode 100644 .ci/README.md create mode 100644 .ci/build.py create mode 100644 .ci/config.yaml create mode 100644 .ci/images/ascend/Dockerfile create mode 100644 .ci/images/nvidia/Dockerfile create mode 100644 .ci/run.py diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..59ee101 --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,171 @@ +# .ci — CI 镜像与流水线 + +本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 + +## 目录结构 + +``` +.ci/ +├── config.yaml # 统一配置(registry、镜像、job 定义) +├── build.py # 镜像构建脚本 +├── run.py # CI 流水线执行脚本 +├── README.md +└── images/ + ├── nvidia/Dockerfile # NVIDIA 平台镜像 + └── ascend/Dockerfile # 昇腾平台镜像 +``` + +## 前置依赖 + +- Docker +- Python 3.10+ +- pyyaml (`pip install pyyaml`) + +## 配置文件 `config.yaml` + +```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # Harbor 地址,本地开发时留空 + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable # stable | latest | 具体 commit hash + platform: nvidia + resources: + gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + setup: pip install .[dev] + stages: + - name: test + run: pytest tests/ -v 
--tb=short --junitxml=/workspace/test-results.xml +``` + +- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 +- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 +- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 +- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 + +## 镜像构建 `build.py` + +```bash +python .ci/build.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | +| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | +| `--push` | — | 构建后推送到 registry | +| `--force` | — | 跳过变更检测,强制构建 | +| `--dry-run` | — | 仅打印命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +python .ci/build.py --platform nvidia + +# 强制构建 +python .ci/build.py --platform nvidia --force + +# 构建全部平台并推送到 registry +python .ci/build.py --push --force + +# 预览实际执行的 docker 命令 +python .ci/build.py --platform nvidia --force --dry-run +``` + +### 构建流程 + +1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) +2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag +3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 +4. 
若指定 `--push`,将两个 tag 推送到 registry + +### 产物 + +| Tag | 说明 | +|---|---| +| `infiniops-ci/:` | 精确追溯到某次构建 | +| `infiniops-ci/:latest` | 最近一次构建 | + +## 流水线执行 `run.py` + +```bash +python .ci/run.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--job` | 配置中第一个 job | 要执行的 job 名称 | +| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | +| `--stage` | 全部 | 仅运行指定 stage | +| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | +| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | +| `--dry-run` | — | 仅打印 docker 命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 运行默认 job +python .ci/run.py + +# 指定分支和镜像版本 +python .ci/run.py --branch feature-xxx --image-tag latest + +# 只用 GPU 0 运行 +python .ci/run.py --gpu-id 0 + +# 用 GPU 0 和 2 运行 +python .ci/run.py --gpu-id 0,2 + +# 使用全部 GPU +python .ci/run.py --gpu-id all + +# 只跑 test stage +python .ci/run.py --stage test + +# 预览 docker 命令 +python .ci/run.py --dry-run +``` + +### 执行流程 + +1. 解析 job 配置,拉取对应镜像 +2. `docker run` 启动容器(自动挂载 GPU、限制内存) +3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 +4. 依次执行各 stage,汇总结果 + +## 代理配置 + +如果网络环境需要代理,在宿主机设置环境变量后即可: + +```bash +export http_proxy=http://localhost:9991 +export https_proxy=http://localhost:9991 +``` + +- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 +- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 diff --git a/.ci/build.py b/.ci/build.py new file mode 100644 index 0000000..489ebf0 --- /dev/null +++ b/.ci/build.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) + sys.exit(1) + + return result.stdout.strip() + + +def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): + """Check if any file under `dockerfile_dir` changed since `base_ref`.""" + result = subprocess.run( + ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], + capture_output=True, + text=True, + ) + + return bool(result.stdout.strip()) + + +def build_image_tag(registry_url, project, platform, tag): + if registry_url: + return f"{registry_url}/{project}/{platform}:{tag}" + + return f"{project}-ci/{platform}:{tag}" + + +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): + """Build a single platform image. 
Returns True on success.""" + registry_url = registry_cfg.get("url", "") + project = registry_cfg.get("project", "infiniops") + dockerfile_dir = platform_cfg["dockerfile"] + + commit_tag = build_image_tag(registry_url, project, platform, commit) + latest_tag = build_image_tag(registry_url, project, platform, "latest") + + build_args_cfg = platform_cfg.get("build_args", {}) + build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + + for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + if proxy_val: + build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + + private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: + sdk_url = private_sdk.get("source", "") + if sdk_url.startswith("${") and sdk_url.endswith("}"): + env_var = sdk_url[2:-1] + sdk_url = os.environ.get(env_var, "") + if sdk_url: + build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) + + build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) + + if dry_run: + print(f"[dry-run] {' '.join(build_cmd)}") + if push: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") + + return True + + print(f"==> building {platform}: {commit_tag}", file=sys.stderr) + result = subprocess.run(build_cmd) + if result.returncode != 0: + error = { + "stage": "build", + "platform": platform, + "tag": commit_tag, + "exit_code": result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + if push: + for tag in (commit_tag, latest_tag): + print(f"==> pushing {tag}", file=sys.stderr) + push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: + error = { + "stage": "push", + "platform": platform, + "tag": tag, + "exit_code": push_result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + 
return True + + +def main(): + parser = argparse.ArgumentParser(description="Build CI Docker images") + parser.add_argument( + "--platform", + type=str, + default="all", + help="Platform to build: nvidia, ascend, or all (default: all)", + ) + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--commit", + type=str, + default="HEAD", + help="Git ref for tagging the image (default: HEAD)", + ) + parser.add_argument( + "--push", + action="store_true", + help="Push images to registry after building", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip change detection and force build", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing", + ) + args = parser.parse_args() + + config = load_config(args.config) + registry_cfg = config.get("registry", {}) + images_cfg = config.get("images", {}) + + if not images_cfg: + print("error: no `images` section in config", file=sys.stderr) + sys.exit(1) + + if args.platform == "all": + platforms = list(images_cfg.keys()) + else: + if args.platform not in images_cfg: + print( + f"error: platform `{args.platform}` not found in config", + file=sys.stderr, + ) + sys.exit(1) + platforms = [args.platform] + + commit = get_git_commit(args.commit) + failed = False + + for platform in platforms: + platform_cfg = images_cfg[platform] + dockerfile_dir = platform_cfg["dockerfile"] + + if not Path(dockerfile_dir).is_dir(): + print( + f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + file=sys.stderr, + ) + continue + + if not args.force and not has_dockerfile_changed(dockerfile_dir): + print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) + continue + + ok = build_image( + platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + ) + if not ok: + failed = True + + if failed: + 
sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/config.yaml b/.ci/config.yaml new file mode 100644 index 0000000..fea3f7c --- /dev/null +++ b/.ci/config.yaml @@ -0,0 +1,36 @@ +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # TODO: Harbor not ready yet + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: # TODO: Ascend image is not ready yet + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable + platform: nvidia + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + + setup: pip install .[dev] + + stages: + - name: test + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile new file mode 100644 index 0000000..87f7c91 --- /dev/null +++ b/.ci/images/ascend/Dockerfile @@ -0,0 +1,31 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + curl \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PRIVATE_SDK_URL +RUN if [ -n "$PRIVATE_SDK_URL" ]; then \ + curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \ + chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \ + rm /tmp/sdk.run; \ + fi + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile new file mode 100644 index 0000000..d89ea91 --- /dev/null +++ b/.ci/images/nvidia/Dockerfile @@ -0,0 +1,26 @@ +ARG BASE_IMAGE +FROM 
${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG http_proxy +ARG https_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py new file mode 100644 index 0000000..0421a56 --- /dev/null +++ b/.ci/run.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" + +import argparse +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def resolve_image(config, platform, image_tag): + """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + registry = config.get("registry", {}) + registry_url = registry.get("url", "") + project = registry.get("project", "infiniops") + + if not registry_url: + return f"{project}-ci/{platform}:{image_tag}" + + return f"{registry_url}/{project}/{platform}:{image_tag}" + + +def build_runner_script(): + return r""" +export https_proxy=http://localhost:9991 +set -e +cd /workspace +git clone "$REPO_URL" repo +cd repo +git checkout "$BRANCH" +echo "========== Setup ==========" +eval "$SETUP_CMD" +set +e +failed=0 +for i in $(seq 1 "$NUM_STAGES"); do + name_var="STAGE_${i}_NAME" + cmd_var="STAGE_${i}_CMD" + name="${!name_var}" + cmd="${!cmd_var}" + echo "========== Stage: $name ==========" + eval "$cmd" || failed=1 +done +echo "========== Summary ==========" +exit $failed +""" + + +def build_docker_args( + config, job_name, repo_url, branch, stages, workdir, image_tag_override, + 
gpu_id_override=None, +): + job = config["jobs"][job_name] + platform = job.get("platform", "nvidia") + image_tag = image_tag_override or job.get("image", "stable") + image = resolve_image(config, platform, image_tag) + resources = job.get("resources", {}) + setup_cmd = job.get("setup", "pip install .[dev]") + + args = [ + "docker", + "run", + "--rm", + "--network", + "host", + "-i", + "-w", + workdir, + "-e", + f"REPO_URL={repo_url}", + "-e", + f"BRANCH={branch}", + "-e", + f"SETUP_CMD={setup_cmd}", + "-e", + f"NUM_STAGES={len(stages)}", + ] + for i, s in enumerate(stages): + args.append("-e") + args.append(f"STAGE_{i + 1}_NAME={s['name']}") + args.append("-e") + args.append(f"STAGE_{i + 1}_CMD={s['run']}") + + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) + gpu_count = resources.get("gpu_count", 0) + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + + memory = resources.get("memory") + if memory: + mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + if not mem.endswith("g") and not mem.endswith("m"): + mem = f"{mem}g" + args.extend(["--memory", mem]) + + timeout_sec = resources.get("timeout") + if timeout_sec: + args.extend(["--stop-timeout", str(timeout_sec)]) + + args.append(image) + args.append("bash") + args.append("-c") + args.append(build_runner_script().strip()) + + return args + + +def main(): + parser = argparse.ArgumentParser(description="Run Docker CI pipeline") + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument("--branch", type=str, help="Override repo branch") + parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument( + "--stage", + type=str, + help="Run only this stage name (still runs setup first)", + ) 
+ parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "dev-infra") + + jobs = config.get("jobs", {}) + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: + print(f"error: job {job_name!r} not in config", file=sys.stderr) + sys.exit(1) + + job = jobs[job_name] + all_stages = job.get("stages", []) + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: + print(f"error: stage {args.stage!r} not found", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + workdir = "/workspace" + docker_args = build_docker_args( + config, job_name, repo_url, branch, stages, workdir, args.image_tag, + gpu_id_override=args.gpu_id, + ) + + if args.dry_run: + print(" ".join(docker_args)) + + return + + sys.exit(subprocess.run(docker_args).returncode) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 765b90a..3dbc186 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] install-dir = "infini" From f8a60644f8a46a41718e7d1fd42bab1988ed9ecc Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 20 Mar 2026 07:24:55 +0000 Subject: [PATCH 02/16] feat: ci sys for nv platform --- 
.ci/README.md | 155 +++++------------- .ci/build.py | 103 ++++++++++-- .ci/config.yaml | 17 +- .ci/images/ascend/Dockerfile | 8 + .ci/images/nvidia/Dockerfile | 5 + .ci/run.py | 117 ++++++++++++-- .ci/tests/__init__.py | 0 .ci/tests/conftest.py | 42 +++++ .ci/tests/test_build.py | 186 ++++++++++++++++++++++ .ci/tests/test_run.py | 298 +++++++++++++++++++++++++++++++++++ 10 files changed, 775 insertions(+), 156 deletions(-) create mode 100644 .ci/tests/__init__.py create mode 100644 .ci/tests/conftest.py create mode 100644 .ci/tests/test_build.py create mode 100644 .ci/tests/test_run.py diff --git a/.ci/README.md b/.ci/README.md index 59ee101..0bd59bd 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,25 +1,18 @@ # .ci — CI 镜像与流水线 -本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 - -## 目录结构 - ``` .ci/ -├── config.yaml # 统一配置(registry、镜像、job 定义) -├── build.py # 镜像构建脚本 -├── run.py # CI 流水线执行脚本 -├── README.md +├── config.yaml # 统一配置(镜像、job 定义) +├── build.py # 镜像构建 +├── run.py # CI 流水线执行 └── images/ - ├── nvidia/Dockerfile # NVIDIA 平台镜像 - └── ascend/Dockerfile # 昇腾平台镜像 + ├── nvidia/Dockerfile + └── ascend/Dockerfile ``` -## 前置依赖 +**前置依赖**:Docker、Python 3.10+、`pip install pyyaml` -- Docker -- Python 3.10+ -- pyyaml (`pip install pyyaml`) +--- ## 配置文件 `config.yaml` @@ -28,144 +21,72 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -registry: - url: "" # Harbor 地址,本地开发时留空 - project: infiniops - credentials_env: REGISTRY_TOKEN - images: nvidia: dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - ascend: - dockerfile: .ci/images/ascend/ - build_args: - BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 - private_sdk: - source: "${PRIVATE_SDK_URL}" jobs: nvidia_gpu: - image: stable # stable | latest | 具体 commit hash + image: latest # latest | platform: nvidia resources: - gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" - gpu_type: A100 + gpu_ids: "0" # "0" | "0,2" | "all" memory: 32GB - timeout: 3600 + 
shm_size: 16g # 避免 PyTorch SHMEM 不足 + timeout: 3600 # 容器内脚本最大运行秒数 setup: pip install .[dev] + env: # 可选,注入容器环境变量 + MY_VAR: value stages: - name: test - run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml ``` -- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 -- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 -- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 -- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 +--- ## 镜像构建 `build.py` -```bash -python .ci/build.py [options] -``` - -| 参数 | 默认值 | 说明 | -|---|---|---| -| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | -| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | -| `--push` | — | 构建后推送到 registry | -| `--force` | — | 跳过变更检测,强制构建 | -| `--dry-run` | — | 仅打印命令,不执行 | -| `--config` | `.ci/config.yaml` | 配置文件路径 | - -### 示例 +| 参数 | 说明 | +|---|---| +| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--force` | 跳过 Dockerfile 变更检测 | +| `--dry-run` | 打印命令不执行 | ```bash -# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +# 检测变更后构建(无变更自动跳过) python .ci/build.py --platform nvidia # 强制构建 python .ci/build.py --platform nvidia --force - -# 构建全部平台并推送到 registry -python .ci/build.py --push --force - -# 预览实际执行的 docker 命令 -python .ci/build.py --platform nvidia --force --dry-run ``` -### 构建流程 +构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 +代理、`no_proxy` 自动从宿主机环境变量透传到 `docker build`。 -1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) -2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag -3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 -4. 
若指定 `--push`,将两个 tag 推送到 registry +> `--push` 为预留功能,需在 `config.yaml` 中配置 `registry` 段后方可使用。 -### 产物 - -| Tag | 说明 | -|---|---| -| `infiniops-ci/:` | 精确追溯到某次构建 | -| `infiniops-ci/:latest` | 最近一次构建 | +--- ## 流水线执行 `run.py` -```bash -python .ci/run.py [options] -``` - -| 参数 | 默认值 | 说明 | -|---|---|---| -| `--job` | 配置中第一个 job | 要执行的 job 名称 | -| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | -| `--stage` | 全部 | 仅运行指定 stage | -| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | -| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | -| `--dry-run` | — | 仅打印 docker 命令,不执行 | -| `--config` | `.ci/config.yaml` | 配置文件路径 | - -### 示例 +| 参数 | 说明 | +|---|---| +| `--branch` | 覆盖克隆分支 | +| `--stage` | 只运行指定 stage | +| `--image-tag` | 覆盖镜像 tag | +| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | +| `--dry-run` | 打印 docker 命令不执行 | ```bash # 运行默认 job -python .ci/run.py - -# 指定分支和镜像版本 -python .ci/run.py --branch feature-xxx --image-tag latest - -# 只用 GPU 0 运行 -python .ci/run.py --gpu-id 0 - -# 用 GPU 0 和 2 运行 -python .ci/run.py --gpu-id 0,2 - -# 使用全部 GPU -python .ci/run.py --gpu-id all - -# 只跑 test stage -python .ci/run.py --stage test +python .ci/run.py --branch feat/my-feature --results-dir ./ci-results -# 预览 docker 命令 -python .ci/run.py --dry-run -``` - -### 执行流程 - -1. 解析 job 配置,拉取对应镜像 -2. `docker run` 启动容器(自动挂载 GPU、限制内存) -3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 -4. 
依次执行各 stage,汇总结果 - -## 代理配置 - -如果网络环境需要代理,在宿主机设置环境变量后即可: - -```bash -export http_proxy=http://localhost:9991 -export https_proxy=http://localhost:9991 +# 只跑 test stage,预览命令 +python .ci/run.py --stage test --dry-run ``` -- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 -- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 +容器内执行流程:`git clone` → `checkout` → `setup` → stages。 +代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 diff --git a/.ci/build.py b/.ci/build.py index 489ebf0..2339319 100644 --- a/.ci/build.py +++ b/.ci/build.py @@ -4,6 +4,7 @@ import argparse import json import os +import shlex import subprocess import sys from pathlib import Path @@ -28,6 +29,7 @@ def get_git_commit(ref="HEAD"): capture_output=True, text=True, ) + if result.returncode != 0: print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) sys.exit(1) @@ -43,9 +45,61 @@ def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): text=True, ) + if result.returncode != 0: + print( + "warning: git diff failed (shallow clone or initial commit?);" + " assuming Dockerfile changed", + file=sys.stderr, + ) + return True + return bool(result.stdout.strip()) +def docker_login(registry_cfg, dry_run): + """Log in to the registry using `credentials_env` token. + + Returns True on success. + + NOTE: Registry support is currently unused (`config.yaml` has no registry + section). Retained for future integration with an external image management + system. 
+ """ + credentials_env = registry_cfg.get("credentials_env") + registry_url = registry_cfg.get("url", "") + + if not credentials_env or not registry_url: + return True + + token = os.environ.get(credentials_env) + + if not token: + print( + f"error: {credentials_env} not set, cannot login", + file=sys.stderr, + ) + return False + + if dry_run: + print( + f"[dry-run] echo | docker login {registry_url}" + " --username token --password-stdin" + ) + return True + + result = subprocess.run( + ["docker", "login", registry_url, "--username", "token", "--password-stdin"], + input=token, + text=True, + ) + + if result.returncode != 0: + print("error: docker login failed", file=sys.stderr) + return False + + return True + + def build_image_tag(registry_url, project, platform, tag): if registry_url: return f"{registry_url}/{project}/{platform}:{tag}" @@ -53,46 +107,53 @@ def build_image_tag(registry_url, project, platform, tag): return f"{project}-ci/{platform}:{tag}" -def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): """Build a single platform image. 
Returns True on success.""" registry_url = registry_cfg.get("url", "") project = registry_cfg.get("project", "infiniops") dockerfile_dir = platform_cfg["dockerfile"] - commit_tag = build_image_tag(registry_url, project, platform, commit) latest_tag = build_image_tag(registry_url, project, platform, "latest") build_args_cfg = platform_cfg.get("build_args", {}) build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): build_cmd.extend(["--build-arg", f"{key}={value}"]) - for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + if proxy_val: build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: - sdk_url = private_sdk.get("source", "") - if sdk_url.startswith("${") and sdk_url.endswith("}"): - env_var = sdk_url[2:-1] - sdk_url = os.environ.get(env_var, "") + source_env = private_sdk.get("source_env", "") + sdk_url = os.environ.get(source_env, "") if source_env else "" + if sdk_url: build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) if dry_run: - print(f"[dry-run] {' '.join(build_cmd)}") + print(f"[dry-run] {shlex.join(build_cmd)}") + if push: - print(f"[dry-run] docker push {commit_tag}") - print(f"[dry-run] docker push {latest_tag}") + if not logged_in: + print("[dry-run] (skipping push: docker login failed)") + else: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") return True print(f"==> building {platform}: {commit_tag}", file=sys.stderr) result = subprocess.run(build_cmd) + if result.returncode != 0: error = { "stage": "build", @@ 
-105,9 +166,14 @@ def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): return False if push: + if not logged_in: + print("error: docker login failed, cannot push", file=sys.stderr) + return False + for tag in (commit_tag, latest_tag): print(f"==> pushing {tag}", file=sys.stderr) push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: error = { "stage": "push", @@ -145,7 +211,7 @@ def main(): parser.add_argument( "--push", action="store_true", - help="Push images to registry after building", + help="Push images to registry after building (requires registry in config)", ) parser.add_argument( "--force", @@ -179,6 +245,7 @@ def main(): platforms = [args.platform] commit = get_git_commit(args.commit) + logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True failed = False for platform in platforms: @@ -187,7 +254,8 @@ def main(): if not Path(dockerfile_dir).is_dir(): print( - f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + f"warning: dockerfile directory `{dockerfile_dir}` does not exist," + f" skipping {platform}", file=sys.stderr, ) continue @@ -197,8 +265,15 @@ def main(): continue ok = build_image( - platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + platform, + platform_cfg, + registry_cfg, + commit, + args.push, + args.dry_run, + logged_in=logged_in, ) + if not ok: failed = True diff --git a/.ci/config.yaml b/.ci/config.yaml index fea3f7c..c80c47d 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -2,12 +2,7 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -registry: - url: "" # TODO: Harbor not ready yet - project: infiniops - credentials_env: REGISTRY_TOKEN - -images: +images: nvidia: dockerfile: .ci/images/nvidia/ build_args: @@ -17,20 +12,22 @@ images: build_args: BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 private_sdk: - source: "${PRIVATE_SDK_URL}" + source_env: 
PRIVATE_SDK_URL jobs: nvidia_gpu: - image: stable + image: latest platform: nvidia resources: gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" - gpu_type: A100 memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 timeout: 3600 setup: pip install .[dev] + # env: # 可选,注入容器环境变量 + # MY_VAR: value stages: - name: test - run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml + run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile index 87f7c91..66392eb 100644 --- a/.ci/images/ascend/Dockerfile +++ b/.ci/images/ascend/Dockerfile @@ -3,11 +3,19 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ ninja-build \ + coreutils \ curl \ libclang-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index d89ea91..74ccfd1 100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -3,14 +3,19 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY ARG http_proxy ARG https_proxy +ARG no_proxy RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ ninja-build \ + coreutils \ libclang-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/.ci/run.py b/.ci/run.py index 0421a56..3f25afa 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -2,8 +2,11 @@ """Standalone Docker CI runner: clone repo, setup, run stages. 
Output to stdout.""" import argparse +import os +import shlex import subprocess import sys +from datetime import datetime from pathlib import Path try: @@ -20,8 +23,35 @@ def load_config(path): return yaml.safe_load(f) +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() + + +def build_results_dir(base, platform, stages, commit): + """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" + stage_names = "+".join(s["name"] for s in stages) + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + dirname = f"{platform}_{stage_names}_{commit}_{timestamp}" + + return Path(base) / dirname + + def resolve_image(config, platform, image_tag): - """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + """Resolve an image reference to a full image name. + + Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config + contains a registry section, returns a registry-prefixed URL. Otherwise + returns a local tag (current default). 
+ """ registry = config.get("registry", {}) registry_url = registry.get("url", "") project = registry.get("project", "infiniops") @@ -34,9 +64,9 @@ def resolve_image(config, platform, image_tag): def build_runner_script(): return r""" -export https_proxy=http://localhost:9991 set -e cd /workspace +mkdir -p /workspace/results git clone "$REPO_URL" repo cd repo git checkout "$BRANCH" @@ -58,15 +88,27 @@ def build_runner_script(): def build_docker_args( - config, job_name, repo_url, branch, stages, workdir, image_tag_override, + config, + job_name, + repo_url, + branch, + stages, + workdir, + image_tag_override, gpu_id_override=None, + results_dir=None, ): job = config["jobs"][job_name] platform = job.get("platform", "nvidia") - image_tag = image_tag_override or job.get("image", "stable") + image_tag = image_tag_override or job.get("image", "latest") image = resolve_image(config, platform, image_tag) resources = job.get("resources", {}) - setup_cmd = job.get("setup", "pip install .[dev]") + setup_raw = job.get("setup", "pip install .[dev]") + + if isinstance(setup_raw, list): + setup_cmd = "\n".join(setup_raw) + else: + setup_cmd = setup_raw args = [ "docker", @@ -86,6 +128,20 @@ def build_docker_args( "-e", f"NUM_STAGES={len(stages)}", ] + + for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) + + if proxy_val: + args.extend(["-e", f"{proxy_var}={proxy_val}"]) + args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) + + for key, value in job.get("env", {}).items(): + args.extend(["-e", f"{key}={value}"]) + + if results_dir: + args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) + for i, s in enumerate(stages): args.append("-e") args.append(f"STAGE_{i + 1}_NAME={s['name']}") @@ -94,6 +150,7 @@ def build_docker_args( gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) + if gpu_id: if gpu_id == "all": 
args.extend(["--gpus", "all"]) @@ -103,20 +160,28 @@ def build_docker_args( args.extend(["--gpus", f"count={gpu_count}"]) memory = resources.get("memory") + if memory: - mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + mem = str(memory).lower().replace("gb", "g").replace("mb", "m") + if not mem.endswith("g") and not mem.endswith("m"): mem = f"{mem}g" + args.extend(["--memory", mem]) + shm_size = resources.get("shm_size") + + if shm_size: + args.extend(["--shm-size", str(shm_size)]) + timeout_sec = resources.get("timeout") + args.append(image) + if timeout_sec: - args.extend(["--stop-timeout", str(timeout_sec)]) + # Requires coreutils `timeout` inside the container image. + args.extend(["timeout", str(timeout_sec)]) - args.append(image) - args.append("bash") - args.append("-c") - args.append(build_runner_script().strip()) + args.extend(["bash", "-c", build_runner_script().strip()]) return args @@ -146,6 +211,12 @@ def main(): type=str, help='GPU device IDs to use, e.g. "0", "0,2", "all"', ) + parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + help="Base directory for test results (default: ./ci-results)", + ) parser.add_argument( "--dry-run", action="store_true", @@ -156,38 +227,54 @@ def main(): config = load_config(args.config) repo = config.get("repo", {}) repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") - branch = args.branch or repo.get("branch", "dev-infra") + branch = args.branch or repo.get("branch", "master") jobs = config.get("jobs", {}) + if not jobs: print("error: no jobs in config", file=sys.stderr) sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: print(f"error: job {job_name!r} not in config", file=sys.stderr) sys.exit(1) job = jobs[job_name] all_stages = job.get("stages", []) + if args.stage: stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: print(f"error: stage {args.stage!r} not found", file=sys.stderr) sys.exit(1) 
else: stages = all_stages + platform = job.get("platform", "nvidia") + commit = get_git_commit() + results_dir = build_results_dir(args.results_dir, platform, stages, commit) + workdir = "/workspace" docker_args = build_docker_args( - config, job_name, repo_url, branch, stages, workdir, args.image_tag, + config, + job_name, + repo_url, + branch, + stages, + workdir, + args.image_tag, gpu_id_override=args.gpu_id, + results_dir=results_dir, ) if args.dry_run: - print(" ".join(docker_args)) - + print(shlex.join(docker_args)) return + results_dir.mkdir(parents=True, exist_ok=True) sys.exit(subprocess.run(docker_args).returncode) diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py new file mode 100644 index 0000000..98079cd --- /dev/null +++ b/.ci/tests/conftest.py @@ -0,0 +1,42 @@ +import sys +from pathlib import Path + +# Allow `import run` and `import build` directly. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import pytest + + +@pytest.fixture +def minimal_config(): + return { + "repo": { + "url": "https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "images": { + "nvidia": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + }, + "jobs": { + "nvidia_gpu": { + "image": "latest", + "platform": "nvidia", + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "setup": "pip install .[dev]", + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], + } + }, + } diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py new file mode 100644 index 0000000..fa2f292 --- /dev/null +++ b/.ci/tests/test_build.py @@ -0,0 +1,186 @@ +import build + + +# --------------------------------------------------------------------------- +# build_image_tag +# 
--------------------------------------------------------------------------- + + +def test_build_image_tag_with_registry(): + tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") + assert tag == "localhost:5000/infiniops/nvidia:latest" + + +def test_build_image_tag_without_registry(): + tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") + assert tag == "infiniops-ci/nvidia:abc1234" + + +def test_build_image_tag_commit_hash(): + tag = build.build_image_tag( + "registry.example.com:5000", "proj", "ascend", "deadbeef" + ) + assert tag == "registry.example.com:5000/proj/ascend:deadbeef" + + +# --------------------------------------------------------------------------- +# has_dockerfile_changed +# --------------------------------------------------------------------------- + + +def test_has_dockerfile_changed_true_when_stdout_nonempty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout="Dockerfile\n"), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +def test_has_dockerfile_changed_false_when_stdout_empty(mocker): + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is False + + +def test_has_dockerfile_changed_true_on_git_error(mocker): + # Shallow clone or initial commit: `git diff` returns non-zero. 
+ mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=128, stdout=""), + ) + assert build.has_dockerfile_changed(".ci/images/nvidia/") is True + + +# --------------------------------------------------------------------------- +# docker_login +# --------------------------------------------------------------------------- + + +def test_docker_login_no_credentials_env(mocker): + run_mock = mocker.patch("subprocess.run") + result = build.docker_login({"url": "localhost:5000"}, dry_run=False) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_token_not_set(mocker, monkeypatch, capsys): + monkeypatch.delenv("REGISTRY_TOKEN", raising=False) + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is False + run_mock.assert_not_called() + + +def test_docker_login_dry_run_does_not_call_subprocess(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch("subprocess.run") + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=True) + assert result is True + run_mock.assert_not_called() + + +def test_docker_login_success(mocker, monkeypatch): + monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} + result = build.docker_login(cfg, dry_run=False) + assert result is True + run_mock.assert_called_once() + cmd = run_mock.call_args[0][0] + assert "docker" in cmd + assert "login" in cmd + + +# --------------------------------------------------------------------------- +# build_image — dry_run and proxy +# --------------------------------------------------------------------------- + + +def _platform_cfg(): + return { + "dockerfile": ".ci/images/nvidia/", + 
"build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + } + + +def _registry_cfg(): + return {"url": "localhost:5000", "project": "infiniops"} + + +def test_build_image_dry_run_no_subprocess(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + run_mock = mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + run_mock.assert_not_called() + captured = capsys.readouterr() + assert "[dry-run]" in captured.out + + +def test_build_image_dry_run_output_contains_image_tag(mocker, monkeypatch, capsys): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch("subprocess.run") + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=True, + logged_in=True, + ) + captured = capsys.readouterr() + assert "abc1234" in captured.out + + +def test_build_image_proxy_in_build_args(mocker, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") + run_mock = mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=0), + ) + build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + called_cmd = run_mock.call_args[0][0] + joined = " ".join(called_cmd) + assert "HTTP_PROXY=http://proxy.test:3128" in joined + assert "http_proxy=http://proxy.test:3128" in joined + + +def test_build_image_returns_false_on_docker_error(mocker, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + mocker.patch( + "subprocess.run", + return_value=mocker.Mock(returncode=1), + ) + result = build.build_image( + "nvidia", + _platform_cfg(), + _registry_cfg(), + "abc1234", + push=False, + dry_run=False, + logged_in=True, + ) + assert result is False diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py new file mode 100644 index 0000000..075546e --- /dev/null +++ 
b/.ci/tests/test_run.py @@ -0,0 +1,298 @@ +from pathlib import Path + +import pytest + +import run + + +# --------------------------------------------------------------------------- +# resolve_image +# --------------------------------------------------------------------------- + + +def test_resolve_image_with_registry(): + cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} + img = run.resolve_image(cfg, "nvidia", "latest") + assert img == "localhost:5000/infiniops/nvidia:latest" + + +def test_resolve_image_without_registry(minimal_config): + img = run.resolve_image(minimal_config, "nvidia", "abc1234") + assert img == "infiniops-ci/nvidia:abc1234" + + +# --------------------------------------------------------------------------- +# build_runner_script +# --------------------------------------------------------------------------- + + +def test_runner_script_contains_git_clone(): + script = run.build_runner_script() + assert "git clone" in script + + +def test_runner_script_contains_setup_cmd(): + script = run.build_runner_script() + assert "SETUP_CMD" in script + + +def test_runner_script_exits_on_failure(): + script = run.build_runner_script() + assert "exit $failed" in script + + +def test_runner_script_creates_results_dir(): + script = run.build_runner_script() + assert "mkdir -p /workspace/results" in script + + +# --------------------------------------------------------------------------- +# build_docker_args — basic structure +# --------------------------------------------------------------------------- + + +def test_docker_args_basic_structure(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert args[0] == "docker" + assert "run" in args + assert "--rm" in args + + +def test_docker_args_correct_image(minimal_config): + args = run.build_docker_args( + minimal_config, + 
"nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "infiniops-ci/nvidia:latest" in args + + +def test_docker_args_image_tag_override(minimal_config): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + "abc1234", + ) + assert "infiniops-ci/nvidia:abc1234" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — proxy passthrough +# --------------------------------------------------------------------------- + + +def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): + monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "-e" in args + assert "HTTP_PROXY=http://proxy.example.com:8080" in args + assert "http_proxy=http://proxy.example.com:8080" in args + + +def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.delenv("http_proxy", raising=False) + monkeypatch.delenv("HTTPS_PROXY", raising=False) + monkeypatch.delenv("https_proxy", raising=False) + monkeypatch.delenv("NO_PROXY", raising=False) + monkeypatch.delenv("no_proxy", raising=False) + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + + for arg in args: + assert not arg.startswith("HTTP_PROXY=") + assert not arg.startswith("http_proxy=") + assert not arg.startswith("HTTPS_PROXY=") + assert not arg.startswith("https_proxy=") + assert not 
arg.startswith("NO_PROXY=") + assert not arg.startswith("no_proxy=") + + +def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): + monkeypatch.delenv("HTTP_PROXY", raising=False) + monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + ) + assert "HTTP_PROXY=http://lowercase.proxy:3128" in args + assert "http_proxy=http://lowercase.proxy:3128" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — GPU flags +# --------------------------------------------------------------------------- + + +def _make_args(config, gpu_id_override=None): + return run.build_docker_args( + config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + gpu_id_override=gpu_id_override, + ) + + +def test_docker_args_gpu_device(minimal_config): + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert "device=0" in args[idx + 1] + + +def test_docker_args_gpu_all(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" + args = _make_args(minimal_config) + idx = args.index("--gpus") + assert args[idx + 1] == "all" + + +def test_docker_args_no_gpu(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "" + minimal_config["jobs"]["nvidia_gpu"]["resources"].pop("gpu_count", None) + args = _make_args(minimal_config) + assert "--gpus" not in args + + +def test_docker_args_gpu_override(minimal_config): + args = _make_args(minimal_config, gpu_id_override="2,3") + idx = args.index("--gpus") + assert "2,3" in args[idx + 1] + + +# --------------------------------------------------------------------------- +# build_docker_args — memory format +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("32GB", "32g"), + ("512MB", "512m"), + ("8", "8g"), + ("16gb", "16g"), + ("256mb", "256m"), + ], +) +def test_docker_args_memory_format(minimal_config, raw, expected): + minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw + args = _make_args(minimal_config) + idx = args.index("--memory") + assert args[idx + 1] == expected + + +# --------------------------------------------------------------------------- +# build_docker_args — stages encoding +# --------------------------------------------------------------------------- + + +def test_docker_args_num_stages(minimal_config): + args = _make_args(minimal_config) + assert "NUM_STAGES=1" in args + + +def test_docker_args_stage_name_cmd(minimal_config): + args = _make_args(minimal_config) + assert "STAGE_1_NAME=test" in args + assert any(a.startswith("STAGE_1_CMD=") for a in args) + + +def test_docker_args_multiple_stages(minimal_config): + minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ + {"name": "lint", "run": "ruff check ."}, + {"name": "test", "run": "pytest tests/"}, + ] + args = _make_args(minimal_config) + assert "NUM_STAGES=2" in args + assert "STAGE_1_NAME=lint" in args + assert "STAGE_2_NAME=test" in args + + +# --------------------------------------------------------------------------- +# build_docker_args — results_dir mount +# --------------------------------------------------------------------------- + + +def test_docker_args_results_dir(minimal_config, tmp_path): + args = run.build_docker_args( + minimal_config, + "nvidia_gpu", + "https://github.com/example/repo.git", + "master", + minimal_config["jobs"]["nvidia_gpu"]["stages"], + "/workspace", + None, + results_dir=tmp_path, + ) + joined = " ".join(str(a) for a in args) + assert "-v" in args + assert "/workspace/results" in joined + + +# 
--------------------------------------------------------------------------- +# build_results_dir +# --------------------------------------------------------------------------- + + +def test_build_results_dir_contains_platform(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "nvidia" in d.name + + +def test_build_results_dir_contains_commit(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "abc1234" in d.name + + +def test_build_results_dir_contains_stage_names(): + stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] + d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") + assert "lint+test" in d.name + + +def test_build_results_dir_under_base(): + stages = [{"name": "test", "run": "pytest"}] + d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") + assert d.parent == Path("/tmp/my-results") From e2d2c21cc560692c99800a33064825f2621066b2 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 20 Mar 2026 08:00:22 +0000 Subject: [PATCH 03/16] fix(ci): fix results dir permissions and reduce parallel workers - Pass host UID/GID into container and `chown` results after tests, so mounted `ci-results/` is accessible by the host user. - Limit `pytest-xdist` workers from `-n auto` to `-n 8` to prevent OOM worker crashes on high-core-count machines. 
Co-Authored-By: Claude Opus 4.6 --- .ci/config.yaml | 2 +- .ci/run.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.ci/config.yaml b/.ci/config.yaml index c80c47d..a86174a 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -30,4 +30,4 @@ jobs: stages: - name: test - run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml diff --git a/.ci/run.py b/.ci/run.py index 3f25afa..0c8d648 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -83,6 +83,9 @@ def build_runner_script(): eval "$cmd" || failed=1 done echo "========== Summary ==========" +if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then + chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true +fi exit $failed """ @@ -127,6 +130,10 @@ def build_docker_args( f"SETUP_CMD={setup_cmd}", "-e", f"NUM_STAGES={len(stages)}", + "-e", + f"HOST_UID={os.getuid()}", + "-e", + f"HOST_GID={os.getgid()}", ] for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): From 41c76c9f5f76812958d7cc68bc1ebfa161acb06e Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 03:27:06 +0000 Subject: [PATCH 04/16] refactor(ci): Refactor code structure for improved readability and maintainability --- .ci/README.md | 207 ++++++- .ci/agent.py | 971 ++++++++++++++++++++++++++++++++ .ci/build.py | 27 +- .ci/ci_resource.py | 241 ++++++++ .ci/config.yaml | 89 ++- .ci/github_status.py | 98 ++++ .ci/images/iluvatar/Dockerfile | 53 ++ .ci/images/nvidia/Dockerfile | 21 +- .ci/run.py | 56 +- .ci/tests/conftest.py | 44 +- .ci/tests/test_agent.py | 503 +++++++++++++++++ .ci/tests/test_github_status.py | 144 +++++ .ci/tests/test_resource.py | 324 +++++++++++ .ci/tests/test_utils.py | 90 +++ .ci/utils.py | 101 ++++ 15 files changed, 2833 insertions(+), 136 deletions(-) create mode 100644 .ci/agent.py create mode 100644 .ci/ci_resource.py create mode 100644 .ci/github_status.py create mode 100644 
.ci/images/iluvatar/Dockerfile create mode 100644 .ci/tests/test_agent.py create mode 100644 .ci/tests/test_github_status.py create mode 100644 .ci/tests/test_resource.py create mode 100644 .ci/tests/test_utils.py create mode 100644 .ci/utils.py diff --git a/.ci/README.md b/.ci/README.md index 0bd59bd..33841ca 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -2,11 +2,16 @@ ``` .ci/ -├── config.yaml # 统一配置(镜像、job 定义) +├── config.yaml # 统一配置(镜像、job、Agent 定义) +├── utils.py # 共享工具(load_config、get_git_commit) +├── agent.py # Runner Agent(调度、Webhook、远程触发) ├── build.py # 镜像构建 -├── run.py # CI 流水线执行 +├── run.py # CI 流水线执行(Docker 层) +├── ci_resource.py # GPU/内存资源检测与分配 +├── github_status.py # GitHub Commit Status 上报 └── images/ ├── nvidia/Dockerfile + ├── iluvatar/Dockerfile └── ascend/Dockerfile ``` @@ -16,41 +21,88 @@ ## 配置文件 `config.yaml` +配置以 **platform** 为顶级结构,每个平台包含镜像定义、平台级默认值和 job 列表。 +加载时自动展平为 `{platform}_{job}` 格式(如 `nvidia_gpu`)。 + ```yaml repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -images: +platforms: nvidia: - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - -jobs: - nvidia_gpu: - image: latest # latest | - platform: nvidia - resources: - gpu_ids: "0" # "0" | "0,2" | "all" - memory: 32GB - shm_size: 16g # 避免 PyTorch SHMEM 不足 - timeout: 3600 # 容器内脚本最大运行秒数 + image: # 镜像定义 + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] # 平台级默认值,job 可覆盖 + jobs: + gpu: # 展平后为 nvidia_gpu + resources: + gpu_ids: "0" # "0" | "0,2" | "all" + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: # 平台级 docker 参数,所有 job 继承 + - 
"--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules setup: pip install .[dev] - env: # 可选,注入容器环境变量 - MY_VAR: value - stages: - - name: test - run: pytest tests/ -n auto -v --tb=short --junitxml=/workspace/results/test-results.xml + jobs: + gpu: # 展平后为 iluvatar_gpu + resources: + gpu_ids: "0" + gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml ``` +### 配置层级说明 + +| 层级 | 字段 | 说明 | +|---|---|---| +| **平台级** | `image` | 镜像定义(dockerfile、build_args) | +| | `image_tag` | 默认镜像 tag(默认 `latest`) | +| | `docker_args` | 额外 docker run 参数(如 `--privileged`) | +| | `volumes` | 额外挂载卷 | +| | `setup` | 容器内 setup 命令 | +| | `env` | 注入容器环境变量 | +| **Job 级** | `resources.gpu_ids` | GPU 设备 ID | +| | `resources.gpu_style` | GPU 透传方式:`nvidia`(默认)或 `none` | +| | `resources.memory` | 容器内存限制 | +| | `resources.shm_size` | 共享内存大小 | +| | `resources.timeout` | 容器内脚本最大运行秒数 | +| | `stages` | 执行阶段列表 | +| | 以上平台级字段 | Job 可覆盖任意平台级默认值 | + --- ## 镜像构建 `build.py` | 参数 | 说明 | |---|---| -| `--platform nvidia\|ascend\|all` | 构建平台,默认 `all` | +| `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -58,8 +110,11 @@ jobs: # 检测变更后构建(无变更自动跳过) python .ci/build.py --platform nvidia -# 强制构建 -python .ci/build.py --platform nvidia --force +# 构建 Iluvatar 镜像 +python .ci/build.py --platform iluvatar --force + +# 强制构建全部 +python .ci/build.py --force ``` 构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 @@ -73,20 +128,116 @@ python .ci/build.py --platform nvidia --force | 参数 | 说明 | |---|---| +| `--job` | 指定 job 名称(默认第一个) | | `--branch` | 覆盖克隆分支 | | `--stage` | 只运行指定 stage | | `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID | +| `--gpu-id` | 覆盖 GPU 设备 ID(仅 nvidia 
gpu_style) | | `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | | `--dry-run` | 打印 docker 命令不执行 | ```bash -# 运行默认 job -python .ci/run.py --branch feat/my-feature --results-dir ./ci-results +# 运行 NVIDIA job +python .ci/run.py --job nvidia_gpu --branch master + +# 运行 Iluvatar job +python .ci/run.py --job iluvatar_gpu --branch feat/ci-nvidia # 只跑 test stage,预览命令 -python .ci/run.py --stage test --dry-run +python .ci/run.py --job iluvatar_gpu --stage test --dry-run ``` 容器内执行流程:`git clone` → `checkout` → `setup` → stages。 代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 + +--- + +## 平台差异 + +| 平台 | GPU 透传方式 | 基础镜像 | 备注 | +|---|---|---|---| +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | +| Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | +| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善 | + +--- + +## Runner Agent `agent.py` + +Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感知的动态调度,以及跨机器远程触发。 + +### CLI 手动执行 + +```bash +# 运行所有 job(本地 + 远程 Agent) +python .ci/agent.py run --branch master + +# 运行指定 job +python .ci/agent.py run --branch master --job nvidia_gpu + +# 按平台运行 +python .ci/agent.py run --branch master --platform nvidia + +# 预览命令 +python .ci/agent.py run --branch master --dry-run --no-status +``` + +| 参数 | 说明 | +|---|---| +| `--branch` | 测试分支(必填) | +| `--job` | 指定 job 名称 | +| `--platform` | 按平台过滤 job | +| `--commit` | 覆盖 commit SHA | +| `--image-tag` | 覆盖镜像 tag | +| `--results-dir` | 结果目录(默认 `ci-results`) | +| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | +| `--no-status` | 跳过 GitHub Status 上报 | +| `--dry-run` | 预览模式 | + +### Webhook 服务 + +每台平台机器部署一个 Agent 实例: + +```bash +# NVIDIA 机器 +python .ci/agent.py serve --platform nvidia --port 8080 + +# Iluvatar 机器 +python .ci/agent.py serve --platform iluvatar --port 8080 +``` + +| 端点 | 方法 | 说明 | +|---|---|---| +| `/webhook` | POST | GitHub Webhook(push/pull_request) | +| `/api/run` | POST | 远程触发 job | +| `/api/job/{id}` | GET 
| 查询 job 状态 | +| `/health` | GET | 健康检查 | +| `/status` | GET | 队列 + 资源状态 | + +Webhook 支持 `X-Hub-Signature-256` 签名验证,通过 `--webhook-secret` 或 `WEBHOOK_SECRET` 环境变量配置。 + +### 远程 Agent 配置 + +在 `config.yaml` 中配置各平台 Agent 地址,CLI 执行时自动将远程 job 分发到对应 Agent: + +```yaml +agents: + nvidia: + url: http://nvidia-host:8080 + iluvatar: + url: http://iluvatar-host:8080 +``` + +### 资源调度 + +Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: +- GPU 利用率 < 阈值(默认 10%)且未被 Agent 分配 → 可用 +- 资源不足时 job 自动排队,已完成 job 释放资源后自动调度排队任务 + +### GitHub Status + +设置 `GITHUB_TOKEN` 环境变量后,Agent 会自动上报 commit status: +- `pending` — job 开始执行 +- `success` / `failure` — job 执行完成 + +Status context 格式:`ci/infiniops/{job_name}` diff --git a/.ci/agent.py b/.ci/agent.py new file mode 100644 index 0000000..3696ce2 --- /dev/null +++ b/.ci/agent.py @@ -0,0 +1,971 @@ +#!/usr/bin/env python3 +"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. + +Usage: + # Run jobs locally (or dispatch to remote agents) + python .ci/agent.py run --branch master + python .ci/agent.py run --branch master --job nvidia_gpu --dry-run + + # Start webhook server + python .ci/agent.py serve --platform nvidia --port 8080 +""" + +import argparse +import collections +import hashlib +import hmac +import json +import os +import shlex +import subprocess +import sys +import threading +import time +import urllib.error +import urllib.request +import uuid +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + +import ci_resource as res +import github_status as gh +import run + +# Maximum POST body size (1 MB) to prevent memory exhaustion +MAX_CONTENT_LENGTH = 1 * 1024 * 1024 + +# Job states +STATE_QUEUED = "queued" +STATE_RUNNING = "running" +STATE_PENDING = "pending" +STATE_SUCCESS = "success" +STATE_FAILURE = "failure" +STATE_ERROR = "error" + +# urllib helpers (module-level for easier mocking in tests) +urllib_request = urllib.request.Request +urllib_urlopen = urllib.request.urlopen + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +class JobRequest: + """Describes a CI job to be executed.""" + + def __init__(self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None): + self.job_id = str(uuid.uuid4())[:8] + self.job_name = job_name + self.branch = branch + self.commit_sha = commit_sha + self.config = config + self.image_tag = image_tag + self.results_dir = results_dir or Path("ci-results") + self.created_at = datetime.now().isoformat() + + job = config["jobs"][job_name] + self.platform = job.get("platform", "nvidia") + + def to_dict(self): + return { + "job_id": self.job_id, + "job_name": self.job_name, + "branch": self.branch, + "commit_sha": self.commit_sha, + "platform": self.platform, + "created_at": self.created_at, + } + + +class JobResult: + """Outcome of a completed job.""" + + def __init__(self, job_id, job_name, commit_sha, returncode, results_dir, duration): + self.job_id = job_id + self.job_name = job_name + self.commit_sha = commit_sha + self.returncode = returncode + self.results_dir = results_dir + self.duration = duration + + self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE + + def to_dict(self): + return { + "job_id": self.job_id, + "job_name": self.job_name, + "commit_sha": self.commit_sha, + "state": self.state, + 
"returncode": self.returncode, + "results_dir": str(self.results_dir), + "duration_seconds": round(self.duration, 1), + } + + +# --------------------------------------------------------------------------- +# Job selection and routing +# --------------------------------------------------------------------------- + + +def select_jobs(config, platform=None, job_name=None): + """Return list of job names to run.""" + jobs = config.get("jobs", {}) + + if job_name: + if job_name not in jobs: + raise ValueError(f"job {job_name!r} not in config") + + return [job_name] + + if platform: + return [ + name for name, job in jobs.items() if job.get("platform") == platform + ] + + return list(jobs.keys()) + + +def route_jobs(config, job_names, local_platform=None): + """Split jobs into local and remote. + + Returns (local_jobs, remote_jobs) where remote_jobs is a list of + (job_name, agent_url) tuples. + """ + agents = config.get("agents", {}) + jobs = config.get("jobs", {}) + local = [] + remote = [] + + for name in job_names: + job = jobs.get(name, {}) + platform = job.get("platform", "") + + if not local_platform: + local.append(name) + elif platform == local_platform: + local.append(name) + elif platform in agents: + remote.append((name, agents[platform].get("url", ""))) + else: + local.append(name) + + return local, remote + + +# --------------------------------------------------------------------------- +# Scheduler +# --------------------------------------------------------------------------- + + +class Scheduler: + """Resource-aware job scheduler with dynamic parallelism.""" + + def __init__( + self, + config, + platform, + resource_pool, + results_dir=None, + max_workers=4, + no_status=False, + dry_run=False, + ): + self._config = config + self._platform = platform + self._resource_pool = resource_pool + self._results_dir = results_dir or Path("ci-results") + self._no_status = no_status + self._dry_run = dry_run + self._queue = collections.deque() + self._jobs: dict[str, 
dict] = {} # job_id -> {request, result, state, gpu_ids} + self._executor = ThreadPoolExecutor(max_workers=max_workers) + self._lock = threading.Lock() + self._done_event = threading.Event() + + # GitHub config + github_cfg = config.get("github", {}) + self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops") + repo = config.get("repo", {}) + repo_url = repo.get("url", "") + self._owner, self._repo = gh.parse_repo_url(repo_url) + + def submit(self, job_request): + """Add a job to the queue and attempt to schedule it. + + Returns the job_id. + """ + with self._lock: + self._jobs[job_request.job_id] = { + "request": job_request, + "result": None, + "state": STATE_QUEUED, + "gpu_ids": [], + } + self._queue.append(job_request) + + self._try_schedule() + return job_request.job_id + + def get_job(self, job_id): + """Get job info by ID.""" + with self._lock: + entry = self._jobs.get(job_id) + + if not entry: + return None + + info = entry["request"].to_dict() + info["state"] = entry["state"] + + if entry["result"]: + info.update(entry["result"].to_dict()) + + return info + + def get_status(self): + """Return scheduler status for the /status endpoint.""" + with self._lock: + queued = [ + self._jobs[r.job_id]["request"].to_dict() + for r in self._queue + ] + running = [] + completed = [] + + for entry in self._jobs.values(): + state = entry["state"] + + if state == STATE_RUNNING: + running.append({**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]}) + elif state in (STATE_SUCCESS, STATE_FAILURE): + completed.append(entry["result"].to_dict()) + + return { + "queued": queued, + "running": running, + "completed": completed[-20:], # Last 20 + "resources": self._resource_pool.get_status(), + } + + def wait_all(self): + """Block until all submitted jobs are done. 
Returns list of JobResult.""" + while True: + with self._lock: + pending = any( + e["state"] in (STATE_QUEUED, STATE_RUNNING) for e in self._jobs.values() + ) + + if not pending: + break + + self._done_event.wait(timeout=2.0) + self._done_event.clear() + + with self._lock: + return [ + e["result"] + for e in self._jobs.values() + if e["result"] is not None + ] + + def _try_schedule(self): + """Try to run queued jobs that have enough resources. + + Resource allocation and job submission are split: allocation decisions + are made under the lock, but executor.submit() happens outside to + prevent deadlock when the thread pool is saturated. + """ + to_launch = [] # [(req, gpu_ids), ...] + + with self._lock: + remaining = collections.deque() + + while self._queue: + req = self._queue.popleft() + job_cfg = self._config["jobs"].get(req.job_name, {}) + gpu_count = res.parse_gpu_requirement(job_cfg) + memory_mb = res.parse_memory_requirement(job_cfg) + + if self._dry_run: + # In dry-run mode, skip resource checks + gpu_ids, ok = [], True + else: + gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb) + + if ok: + self._jobs[req.job_id]["state"] = STATE_RUNNING + self._jobs[req.job_id]["gpu_ids"] = gpu_ids + to_launch.append((req, gpu_ids)) + else: + remaining.append(req) + + self._queue = remaining + + # Submit outside the lock to avoid deadlock with ThreadPoolExecutor + for req, gpu_ids in to_launch: + self._executor.submit(self._run_job, req, gpu_ids) + + def _run_job(self, req, gpu_ids): + """Execute a single job in a worker thread. + + Wrapped in try/finally to guarantee GPU resources are always released + and job state is updated even on unexpected exceptions. 
+ """ + context = gh.build_status_context(self._status_prefix, req.job_name) + result = None + + try: + # Post pending status + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + STATE_PENDING, + context, + f"Running {req.job_name}...", + ) + + job_cfg = self._config["jobs"][req.job_name] + all_stages = job_cfg.get("stages", []) + repo_url = self._config.get("repo", {}).get("url", "") + commit_short = req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + results_dir = run.build_results_dir( + req.results_dir, req.platform, all_stages, commit_short + ) + + gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None + docker_args = run.build_docker_args( + self._config, + req.job_name, + repo_url, + req.branch, + all_stages, + "/workspace", + req.image_tag, + gpu_id_override=gpu_id_str, + results_dir=results_dir, + ) + + start = time.monotonic() + + if self._dry_run: + print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") + returncode = 0 + else: + results_dir.mkdir(parents=True, exist_ok=True) + proc = subprocess.run(docker_args) + returncode = proc.returncode + + duration = time.monotonic() - start + + result = JobResult( + job_id=req.job_id, + job_name=req.job_name, + commit_sha=req.commit_sha, + returncode=returncode, + results_dir=results_dir, + duration=duration, + ) + + # Post final status + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + result.state, + context, + f"{req.job_name}: {result.state} in {duration:.0f}s", + ) + except Exception as e: + print(f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr) + + if result is None: + result = JobResult( + job_id=req.job_id, + job_name=req.job_name, + commit_sha=req.commit_sha, + returncode=-1, + results_dir=req.results_dir, + duration=0, + ) + + if not self._no_status: + gh.post_commit_status( + self._owner, + self._repo, + req.commit_sha, + STATE_ERROR, + context, + 
f"{req.job_name}: internal error", + ) + finally: + # Always release resources and update state + self._resource_pool.release(gpu_ids) + + with self._lock: + self._jobs[req.job_id]["result"] = result + self._jobs[req.job_id]["state"] = result.state if result else STATE_FAILURE + + self._done_event.set() + self._try_schedule() + + return result + + +# --------------------------------------------------------------------------- +# Webhook server +# --------------------------------------------------------------------------- + + +def verify_signature(secret, body, signature_header): + """Verify GitHub webhook HMAC-SHA256 signature.""" + if not signature_header: + return False + + expected = "sha256=" + hmac.new( + secret.encode("utf-8"), body, hashlib.sha256 + ).hexdigest() + return hmac.compare_digest(expected, signature_header) + + +def _verify_api_token(handler): + """Check Bearer token for /api/run authentication. + + Returns True if authenticated, False (and sends 401) if not. + When no api_token is configured on the server, all requests are allowed. 
+ """ + api_token = getattr(handler.server, "api_token", None) + + if not api_token: + return True + + auth_header = handler.headers.get("Authorization", "") + + if auth_header == f"Bearer {api_token}": + return True + + handler._respond_json(401, {"error": "unauthorized"}) + return False + + +class WebhookHandler(BaseHTTPRequestHandler): + """HTTP handler for GitHub webhooks and API endpoints.""" + + def log_message(self, format, *args): + print(f"[agent] {args[0]}", file=sys.stderr) + + def do_GET(self): + if self.path == "/health": + self._respond_json(200, {"status": "ok", "platform": self.server.platform}) + elif self.path == "/status": + status = self.server.scheduler.get_status() + self._respond_json(200, status) + elif self.path.startswith("/api/job/"): + self._handle_api_job() + else: + self._respond_json(404, {"error": "not found"}) + + def do_POST(self): + content_length = int(self.headers.get("Content-Length", 0)) + + if content_length > MAX_CONTENT_LENGTH: + self._respond_json(413, {"error": "payload too large"}) + return + + body = self.rfile.read(content_length) + + if self.path == "/webhook": + self._handle_webhook(body) + elif self.path == "/api/run": + self._handle_api_run(body) + else: + self._respond_json(404, {"error": "not found"}) + + def _handle_webhook(self, body): + # Verify signature if secret is configured + if self.server.webhook_secret: + sig = self.headers.get("X-Hub-Signature-256", "") + + if not verify_signature(self.server.webhook_secret, body, sig): + self._respond_json(401, {"error": "invalid signature"}) + return + + event_type = self.headers.get("X-GitHub-Event", "") + + if event_type == "ping": + self._respond_json(200, {"msg": "pong"}) + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + if event_type == "push": + branch, sha = self._parse_push(payload) + elif event_type == "pull_request": + action = payload.get("action", "") + + if 
action not in ("opened", "synchronize"): + self._respond_json(200, {"msg": f"ignored PR action: {action}"}) + return + + branch, sha = self._parse_pull_request(payload) + else: + self._respond_json(200, {"msg": f"ignored event: {event_type}"}) + return + + if not branch or not sha: + self._respond_json(400, {"error": "could not extract branch/sha"}) + return + + job_ids = self._submit_jobs(branch, sha) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_run(self, body): + """Handle /api/run: remote job trigger (requires Bearer token auth).""" + if not _verify_api_token(self): + return + + try: + payload = json.loads(body) + except json.JSONDecodeError: + self._respond_json(400, {"error": "invalid JSON"}) + return + + branch = payload.get("branch", "") + sha = payload.get("commit_sha", "") + job_name = payload.get("job") + image_tag = payload.get("image_tag") + + if not branch: + self._respond_json(400, {"error": "branch is required"}) + return + + if not sha: + sha = run.get_git_commit() + + job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) + self._respond_json(200, {"accepted": True, "job_ids": job_ids}) + + def _handle_api_job(self): + """Handle GET /api/job/{id}.""" + parts = self.path.split("/") + + if len(parts) < 4: + self._respond_json(400, {"error": "missing job_id"}) + return + + job_id = parts[3] + info = self.server.scheduler.get_job(job_id) + + if info is None: + self._respond_json(404, {"error": f"job {job_id} not found"}) + else: + self._respond_json(200, info) + + def _parse_push(self, payload): + branch = payload.get("ref", "").removeprefix("refs/heads/") + sha = payload.get("after", "") + return branch, sha + + def _parse_pull_request(self, payload): + pr = payload.get("pull_request", {}) + head = pr.get("head", {}) + branch = head.get("ref", "") + sha = head.get("sha", "") + return branch, sha + + def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): + config = 
self.server.config + job_names = select_jobs(config, platform=self.server.platform, job_name=job_name) + job_ids = [] + + for name in job_names: + req = JobRequest( + job_name=name, + branch=branch, + commit_sha=sha, + config=config, + image_tag=image_tag, + results_dir=self.server.results_dir, + ) + jid = self.server.scheduler.submit(req) + job_ids.append(jid) + + return job_ids + + def _respond_json(self, status_code, data): + body = json.dumps(data, indent=2).encode("utf-8") + self.send_response(status_code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + +class AgentServer(HTTPServer): + """HTTP server with scheduler and config context.""" + + def __init__( + self, + host, + port, + config, + scheduler, + platform, + webhook_secret=None, + api_token=None, + results_dir=None, + ): + super().__init__((host, port), WebhookHandler) + self.config = config + self.scheduler = scheduler + self.platform = platform + self.webhook_secret = webhook_secret + self.api_token = api_token + self.results_dir = results_dir or Path("ci-results") + + +# --------------------------------------------------------------------------- +# Remote job dispatch (for CLI triggering remote agents) +# --------------------------------------------------------------------------- + + +def dispatch_remote_job(agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None): + """Send a job to a remote agent via HTTP API. 
Returns job_id or None.""" + url = f"{agent_url.rstrip('/')}/api/run" + body = { + "branch": branch, + "commit_sha": commit_sha, + "job": job_name, + } + + if image_tag: + body["image_tag"] = image_tag + + data = json.dumps(body).encode("utf-8") + headers = {"Content-Type": "application/json"} + + if api_token: + headers["Authorization"] = f"Bearer {api_token}" + + req = urllib_request(url, data=data, headers=headers, method="POST") + + try: + with urllib_urlopen(req, timeout=30) as resp: + result = json.loads(resp.read()) + job_ids = result.get("job_ids", []) + return job_ids[0] if job_ids else None + except Exception as e: + print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr) + return None + + +def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200): + """Poll a remote agent for job completion. Returns final state dict or None.""" + url = f"{agent_url.rstrip('/')}/api/job/{job_id}" + deadline = time.monotonic() + timeout + + while time.monotonic() < deadline: + try: + req = urllib_request(url) + + with urllib_urlopen(req, timeout=10) as resp: + info = json.loads(resp.read()) + + state = info.get("state", "") + + if state in (STATE_SUCCESS, STATE_FAILURE): + return info + except Exception: + pass + + time.sleep(interval) + + return None + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def cmd_run(args): + """Handle 'run' subcommand: execute jobs locally and/or remotely.""" + config = run.load_config(args.config) + commit_sha = args.commit or run.get_git_commit(short=False) + + # Determine which jobs to run + try: + job_names = select_jobs(config, platform=args.platform, job_name=args.job) + except ValueError as e: + print(f"error: {e}", file=sys.stderr) + sys.exit(1) + + if not job_names: + print("error: no matching jobs found", file=sys.stderr) + sys.exit(1) + + # Detect local platform (if running serve on 
this machine, use that; otherwise guess) + local_platform = args.platform + local_jobs, remote_jobs = route_jobs(config, job_names, local_platform) + + # Run local jobs + local_results = [] + + if local_jobs: + pool = res.ResourcePool( + local_platform or "unknown", + utilization_threshold=args.utilization_threshold, + ) + scheduler = Scheduler( + config, + local_platform or "unknown", + pool, + results_dir=args.results_dir, + no_status=args.no_status, + dry_run=args.dry_run, + ) + + for name in local_jobs: + req = JobRequest( + job_name=name, + branch=args.branch, + commit_sha=commit_sha, + config=config, + image_tag=args.image_tag, + results_dir=args.results_dir, + ) + scheduler.submit(req) + + local_results = scheduler.wait_all() + + # Dispatch remote jobs + remote_results = [] + api_token = os.environ.get("AGENT_API_TOKEN", "") + + if remote_jobs and not args.dry_run: + # Dispatch all remote jobs first, then poll concurrently + dispatched = [] # [(name, agent_url, job_id)] + + for name, agent_url in remote_jobs: + if not agent_url: + print(f"warning: no agent URL for {name}, skipping", file=sys.stderr) + remote_results.append({"job_name": name, "state": "error"}) + continue + + print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) + job_id = dispatch_remote_job( + agent_url, name, args.branch, commit_sha, args.image_tag, + api_token=api_token or None, + ) + + if job_id: + print(f" job_id: {job_id}", file=sys.stderr) + dispatched.append((name, agent_url, job_id)) + else: + print(f" failed to dispatch {name}", file=sys.stderr) + remote_results.append({"job_name": name, "state": "error"}) + + # Poll all dispatched jobs concurrently + if dispatched: + with ThreadPoolExecutor(max_workers=len(dispatched)) as executor: + futures = { + executor.submit(poll_remote_job, url, jid): (name, url, jid) + for name, url, jid in dispatched + } + + for future in futures: + name, _, _ = futures[future] + result = future.result() + + if result: + 
remote_results.append(result) + else: + print(f" timeout waiting for {name}", file=sys.stderr) + remote_results.append({"job_name": name, "state": "timeout"}) + + elif remote_jobs and args.dry_run: + for name, agent_url in remote_jobs: + print(f"[dry-run] dispatch {name} to {agent_url}") + + # Summary + print("\n========== Results ==========") + all_ok = True + + for r in local_results: + status = "PASS" if r.returncode == 0 else "FAIL" + + if r.returncode != 0: + all_ok = False + + print(f" {status} {r.job_name} ({r.duration:.0f}s) {r.results_dir}") + + for r in remote_results: + state = r.get("state", "unknown") + name = r.get("job_name", "?") + status = "PASS" if state == STATE_SUCCESS else "FAIL" + + if state != STATE_SUCCESS: + all_ok = False + + duration = r.get("duration_seconds", 0) + print(f" {status} {name} ({duration:.0f}s) [remote]") + + if not all_ok: + sys.exit(1) + + +def cmd_serve(args): + """Handle 'serve' subcommand: start webhook server.""" + config = run.load_config(args.config) + + pool = res.ResourcePool( + args.platform, + utilization_threshold=args.utilization_threshold, + ) + scheduler = Scheduler( + config, + args.platform, + pool, + results_dir=args.results_dir, + ) + + webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "") + api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "") + + if not webhook_secret: + print( + "WARNING: No webhook secret configured. Webhook endpoint accepts " + "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.", + file=sys.stderr, + ) + + if not api_token: + print( + "WARNING: No API token configured. /api/run endpoint is unauthenticated. 
" + "Set --api-token or AGENT_API_TOKEN for production.", + file=sys.stderr, + ) + + server = AgentServer( + args.host, + args.port, + config, + scheduler, + args.platform, + webhook_secret=webhook_secret or None, + api_token=api_token or None, + results_dir=args.results_dir, + ) + + print( + f"Agent serving on {args.host}:{args.port} (platform={args.platform})", + file=sys.stderr, + ) + print(f" POST /webhook — GitHub webhook", file=sys.stderr) + print(f" POST /api/run — remote job trigger", file=sys.stderr) + print(f" GET /health — health check", file=sys.stderr) + print(f" GET /status — queue & resource status", file=sys.stderr) + print(f" GET /api/job/{{id}} — job status", file=sys.stderr) + + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down...", file=sys.stderr) + server.shutdown() + + +def main(): + parser = argparse.ArgumentParser( + description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", + ) + subparsers = parser.add_subparsers(dest="command") + + # --- run subcommand --- + run_parser = subparsers.add_parser("run", help="Run CI jobs") + run_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + run_parser.add_argument("--branch", type=str, required=True, help="Branch to test") + run_parser.add_argument("--job", type=str, help="Specific job name") + run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") + run_parser.add_argument("--image-tag", type=str, help="Override image tag") + run_parser.add_argument("--commit", type=str, help="Override commit SHA") + run_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + run_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + help="GPU utilization threshold (%%) to consider free (default: 10)", + ) + run_parser.add_argument("--no-status", action="store_true", help="Skip GitHub status") + 
run_parser.add_argument("--dry-run", action="store_true") + + # --- serve subcommand --- + serve_parser = subparsers.add_parser("serve", help="Start webhook server") + serve_parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + ) + serve_parser.add_argument( + "--platform", + type=str, + required=True, + help="Platform this agent handles (nvidia, iluvatar, etc.)", + ) + serve_parser.add_argument("--port", type=int, default=8080) + serve_parser.add_argument("--host", type=str, default="0.0.0.0") + serve_parser.add_argument("--webhook-secret", type=str) + serve_parser.add_argument( + "--api-token", + type=str, + help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", + ) + serve_parser.add_argument( + "--results-dir", + type=Path, + default=Path("ci-results"), + ) + serve_parser.add_argument( + "--utilization-threshold", + type=int, + default=10, + ) + + args = parser.parse_args() + + if args.command == "run": + cmd_run(args) + elif args.command == "serve": + cmd_serve(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/build.py b/.ci/build.py index 2339319..7953209 100644 --- a/.ci/build.py +++ b/.ci/build.py @@ -9,32 +9,7 @@ import sys from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def load_config(path): - with open(path, encoding="utf-8") as f: - return yaml.safe_load(f) - - -def get_git_commit(ref="HEAD"): - result = subprocess.run( - ["git", "rev-parse", "--short", ref], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) - sys.exit(1) - - return result.stdout.strip() +from utils import get_git_commit, load_config def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py new file mode 100644 index 0000000..f3dbfb1 --- /dev/null +++ b/.ci/ci_resource.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +"""Resource detection and allocation for CI Runner Agent.""" + +import os +import subprocess +import threading +from dataclasses import dataclass, field + +# GPU passthrough styles +GPU_STYLE_NVIDIA = "nvidia" +GPU_STYLE_NONE = "none" + + +@dataclass +class GpuInfo: + index: int + memory_used_mb: float + memory_total_mb: float + utilization_pct: float + + +@dataclass +class SystemResources: + total_memory_mb: float + available_memory_mb: float + cpu_count: int + + +class ResourcePool: + """Thread-safe GPU and system resource manager. + + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi) + and tracks allocations to enable dynamic parallel scheduling. 
+ """ + + GPU_QUERY_TOOLS = { + "nvidia": "nvidia-smi", + "iluvatar": "ixsmi", + } + + def __init__(self, platform, utilization_threshold=10): + self._platform = platform + self._utilization_threshold = utilization_threshold + self._allocated: set[int] = set() + self._lock = threading.Lock() + + @property + def platform(self): + return self._platform + + @property + def allocated(self): + with self._lock: + return set(self._allocated) + + def detect_gpus(self) -> list[GpuInfo]: + """Query GPU status via platform-specific CLI tool.""" + tool = self.GPU_QUERY_TOOLS.get(self._platform) + + if not tool: + return [] + + try: + result = subprocess.run( + [ + tool, + "--query-gpu=index,memory.used,memory.total,utilization.gpu", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + + if len(parts) < 4: + continue + + try: + gpus.append( + GpuInfo( + index=int(parts[0]), + memory_used_mb=float(parts[1]), + memory_total_mb=float(parts[2]), + utilization_pct=float(parts[3]), + ) + ) + except (ValueError, IndexError): + continue + + return gpus + + def detect_system_resources(self) -> SystemResources: + """Read system memory from /proc/meminfo and CPU count.""" + total_mb = 0.0 + available_mb = 0.0 + + try: + with open("/proc/meminfo", encoding="utf-8") as f: + for line in f: + if line.startswith("MemTotal:"): + total_mb = float(line.split()[1]) / 1024 + elif line.startswith("MemAvailable:"): + available_mb = float(line.split()[1]) / 1024 + except OSError: + pass + + return SystemResources( + total_memory_mb=total_mb, + available_memory_mb=available_mb, + cpu_count=os.cpu_count() or 1, + ) + + def get_free_gpus(self) -> list[int]: + """Return GPU indices with utilization below threshold.""" + gpus = 
self.detect_gpus() + return [ + g.index + for g in gpus + if g.utilization_pct < self._utilization_threshold + ] + + def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: + """Try to allocate GPUs and check memory. + + Returns (allocated_gpu_ids, success). On failure returns ([], False). + GPU detection and memory checks run outside the lock to avoid blocking + other threads while subprocess.run (nvidia-smi) executes. + """ + if gpu_count <= 0: + if memory_mb > 0: + sys_res = self.detect_system_resources() + + if sys_res.available_memory_mb < memory_mb: + return ([], False) + + return ([], True) + + # Detect GPUs and memory outside the lock (subprocess.run can block) + free_gpus = set(self.get_free_gpus()) + sys_res = self.detect_system_resources() if memory_mb > 0 else None + + with self._lock: + available = free_gpus - self._allocated + + if len(available) < gpu_count: + return ([], False) + + if sys_res is not None and sys_res.available_memory_mb < memory_mb: + return ([], False) + + selected = sorted(available)[:gpu_count] + self._allocated.update(selected) + return (selected, True) + + def release(self, gpu_ids): + """Return GPUs to the free pool.""" + with self._lock: + self._allocated -= set(gpu_ids) + + def get_status(self) -> dict: + """Return current resource status for API endpoints.""" + gpus = self.detect_gpus() + sys_res = self.detect_system_resources() + + with self._lock: + allocated = sorted(self._allocated) + + return { + "platform": self._platform, + "gpus": [ + { + "index": g.index, + "memory_used_mb": g.memory_used_mb, + "memory_total_mb": g.memory_total_mb, + "utilization_pct": g.utilization_pct, + "allocated_by_agent": g.index in allocated, + } + for g in gpus + ], + "allocated_gpu_ids": allocated, + "system": { + "total_memory_mb": round(sys_res.total_memory_mb, 1), + "available_memory_mb": round(sys_res.available_memory_mb, 1), + "cpu_count": sys_res.cpu_count, + }, + "utilization_threshold": self._utilization_threshold, + } 
+ + +def parse_gpu_requirement(job_config) -> int: + """Extract GPU count requirement from a job config.""" + resources = job_config.get("resources", {}) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NONE: + return 0 + + gpu_ids = str(resources.get("gpu_ids", "")) + + if not gpu_ids: + return resources.get("gpu_count", 0) + + if gpu_ids == "all": + return 0 # "all" means use all available, don't reserve specific count + + return len(gpu_ids.split(",")) + + +def parse_memory_requirement(job_config) -> float: + """Extract memory requirement in MB from a job config.""" + resources = job_config.get("resources", {}) + memory = str(resources.get("memory", "")) + + if not memory: + return 0 + + memory = memory.lower().strip() + + if memory.endswith("gb"): + return float(memory[:-2]) * 1024 + elif memory.endswith("g"): + return float(memory[:-1]) * 1024 + elif memory.endswith("mb"): + return float(memory[:-2]) + elif memory.endswith("m"): + return float(memory[:-1]) + + try: + return float(memory) * 1024 # Default: GB + except ValueError: + return 0 diff --git a/.ci/config.yaml b/.ci/config.yaml index a86174a..e62bc07 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -2,32 +2,69 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master -images: - nvidia: - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - ascend: # TODO: Ascend image is not ready yet - dockerfile: .ci/images/ascend/ - build_args: - BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 - private_sdk: - source_env: PRIVATE_SDK_URL +github: + status_context_prefix: "ci/infiniops" # GitHub Commit Status context 前缀 + +# agents: # 远程 Agent 地址(CLI 跨机器触发用) +# nvidia: +# url: http://nvidia-host:8080 +# iluvatar: +# url: http://iluvatar-host:8080 -jobs: - nvidia_gpu: - image: latest - platform: nvidia - resources: - gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" - memory: 32GB - shm_size: 16g # 
避免 PyTorch 默认 64MB SHMEM 不足 - timeout: 3600 +platforms: + nvidia: + image: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + memory: 32GB + shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 + timeout: 3600 + # env: # 可选,注入容器环境变量 + # MY_VAR: value + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - setup: pip install .[dev] - # env: # 可选,注入容器环境变量 - # MY_VAR: value + iluvatar: + image: + dockerfile: .ci/images/iluvatar/ + build_args: + BASE_IMAGE: corex:qs_pj20250825 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--cap-add=ALL" + - "--pid=host" + - "--ipc=host" + volumes: + - /dev:/dev + - /lib/firmware:/lib/firmware + - /usr/src:/usr/src + - /lib/modules:/lib/modules + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" # 通过 CUDA_VISIBLE_DEVICES 控制可见 GPU + gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载透传 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet + image: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source_env: PRIVATE_SDK_URL diff --git a/.ci/github_status.py b/.ci/github_status.py new file mode 100644 index 0000000..a7abb8f --- /dev/null +++ b/.ci/github_status.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +"""GitHub Commit Status API wrapper using urllib (zero external dependencies).""" + +import json +import os +import re +import sys +import urllib.error +import 
urllib.request + + +def parse_repo_url(url): + """Extract (owner, repo) from a GitHub URL. + + Handles: + - https://github.com/Owner/Repo.git + - git@github.com:Owner/Repo.git + """ + # HTTPS format + m = re.match(r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$", url) + + if m: + return m.group(1), m.group(2) + + # SSH format + m = re.match(r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$", url) + + if m: + return m.group(1), m.group(2) + + return "", "" + + +def build_status_context(prefix, job_name): + """Build status context string, e.g. 'ci/infiniops/nvidia_gpu'.""" + return f"{prefix}/{job_name}" + + +def post_commit_status( + owner, + repo, + sha, + state, + context, + description, + target_url=None, + token=None, +): + """Post a commit status to GitHub. + + Args: + state: One of 'pending', 'success', 'failure', 'error'. + Returns True on success, False on failure. + """ + token = token or os.environ.get("GITHUB_TOKEN", "") + + if not token: + print("warning: GITHUB_TOKEN not set, skipping status update", file=sys.stderr) + return False + + if not owner or not repo or not sha: + print("warning: missing owner/repo/sha, skipping status update", file=sys.stderr) + return False + + url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" + body = { + "state": state, + "context": context, + "description": description[:140], + } + + if target_url: + body["target_url"] = target_url + + data = json.dumps(body).encode("utf-8") + req = urllib.request.Request( + url, + data=data, + headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + "Content-Type": "application/json", + }, + method="POST", + ) + + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return 200 <= resp.status < 300 + except urllib.error.HTTPError as e: + print( + f"warning: GitHub status API returned {e.code}: {e.reason}", + file=sys.stderr, + ) + return False + except urllib.error.URLError as e: + print(f"warning: GitHub status API error: {e.reason}", 
file=sys.stderr) + return False diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile new file mode 100644 index 0000000..f098e5f --- /dev/null +++ b/.ci/images/iluvatar/Dockerfile @@ -0,0 +1,53 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# CoreX runtime environment (base image sets these in /etc/bash.bashrc, +# but docker build RUN uses /bin/sh which doesn't source it) +ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages +ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + ninja-build \ + coreutils \ + && rm -rf /var/lib/apt/lists/* + +RUN ln -sf $(which python3) /usr/local/bin/python 2>/dev/null || true + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +RUN pip config set global.index-url https://pypi.org/simple + +# Pin pre-installed CoreX torch to prevent pip from replacing it with upstream version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index 74ccfd1..05da963 
100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -10,7 +10,11 @@ ARG http_proxy ARG https_proxy ARG no_proxy -RUN apt-get update && \ +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ apt-get install -y --no-install-recommends \ git \ cmake \ @@ -19,13 +23,24 @@ RUN apt-get update && \ libclang-dev \ && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir \ + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ scikit-build-core \ pybind11 \ libclang \ pytest \ pytest-cov \ pytest-xdist \ - pyyaml + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed torch to prevent pip from replacing it with a different version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py index 0c8d648..2575781 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,31 +9,8 @@ from datetime import datetime from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def load_config(path): - with open(path, encoding="utf-8") as f: - return yaml.safe_load(f) - - -def get_git_commit(ref="HEAD"): - result = subprocess.run( - ["git", "rev-parse", "--short", ref], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - return "unknown" - - return result.stdout.strip() +from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE +from utils import get_git_commit, load_config def build_results_dir(base, platform, stages, commit): @@ -155,16 +132,29 @@ def build_docker_args( args.append("-e") args.append(f"STAGE_{i + 1}_CMD={s['run']}") + # Platform-specific device access + for flag in job.get("docker_args", []): + args.append(flag) + + for vol in job.get("volumes", []): + args.extend(["-v", vol]) + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) - - if gpu_id: - if gpu_id == "all": - args.extend(["--gpus", "all"]) - else: - args.extend(["--gpus", f'"device={gpu_id}"']) - elif gpu_count and gpu_count > 0: - args.extend(["--gpus", f"count={gpu_count}"]) + gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) + + if gpu_style == GPU_STYLE_NVIDIA: + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + elif gpu_style == GPU_STYLE_NONE and gpu_id and gpu_id != "all": + # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, + # control visible GPUs via CUDA_VISIBLE_DEVICES. 
+ args.extend(["-e", f"CUDA_VISIBLE_DEVICES={gpu_id}"]) memory = resources.get("memory") diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py index 98079cd..38ed716 100644 --- a/.ci/tests/conftest.py +++ b/.ci/tests/conftest.py @@ -6,37 +6,41 @@ import pytest +from utils import normalize_config + @pytest.fixture def minimal_config(): - return { + """Minimal platform-centric config, normalized to flat format.""" + raw = { "repo": { "url": "https://github.com/InfiniTensor/InfiniOps.git", "branch": "master", }, - "images": { + "platforms": { "nvidia": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - } - }, - "jobs": { - "nvidia_gpu": { - "image": "latest", - "platform": "nvidia", - "resources": { - "gpu_ids": "0", - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, }, "setup": "pip install .[dev]", - "stages": [ - { - "name": "test", - "run": "pytest tests/ -v", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [ + { + "name": "test", + "run": "pytest tests/ -v", + } + ], } - ], + }, } }, } + return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py new file mode 100644 index 0000000..5741385 --- /dev/null +++ b/.ci/tests/test_agent.py @@ -0,0 +1,503 @@ +import hashlib +import hmac +import json +import threading +import time +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +import agent +import ci_resource as res +from utils import normalize_config + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def agent_config(): + raw = { + "repo": { + "url": 
"https://github.com/InfiniTensor/InfiniOps.git", + "branch": "master", + }, + "github": { + "status_context_prefix": "ci/infiniops", + }, + "agents": { + "nvidia": {"url": "http://nvidia-host:8080"}, + "iluvatar": {"url": "http://iluvatar-host:8080"}, + }, + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + "iluvatar": { + "image": { + "dockerfile": ".ci/images/iluvatar/", + "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, + }, + "setup": "pip install .[dev]", + "jobs": { + "gpu": { + "resources": { + "gpu_ids": "0", + "gpu_style": "none", + "memory": "32GB", + "shm_size": "16g", + "timeout": 3600, + }, + "stages": [{"name": "test", "run": "pytest tests/ -v"}], + }, + }, + }, + }, + } + return normalize_config(raw) + + +@pytest.fixture +def mock_resource_pool(): + pool = MagicMock(spec=res.ResourcePool) + pool.platform = "nvidia" + pool.allocate.return_value = ([0], True) + pool.release.return_value = None + pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + return pool + + +# --------------------------------------------------------------------------- +# select_jobs +# --------------------------------------------------------------------------- + + +def test_select_jobs_by_name(agent_config): + jobs = agent.select_jobs(agent_config, job_name="nvidia_gpu") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform(agent_config): + jobs = agent.select_jobs(agent_config, platform="nvidia") + assert jobs == ["nvidia_gpu"] + + +def test_select_jobs_by_platform_iluvatar(agent_config): + jobs = agent.select_jobs(agent_config, platform="iluvatar") + assert jobs == 
["iluvatar_gpu"] + + +def test_select_jobs_all(agent_config): + jobs = agent.select_jobs(agent_config) + assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} + + +def test_select_jobs_invalid_name(agent_config): + with pytest.raises(ValueError, match="not_exist"): + agent.select_jobs(agent_config, job_name="not_exist") + + +# --------------------------------------------------------------------------- +# route_jobs +# --------------------------------------------------------------------------- + + +def test_route_jobs_local(agent_config): + local, remote = agent.route_jobs(agent_config, ["nvidia_gpu"], local_platform="nvidia") + assert local == ["nvidia_gpu"] + assert remote == [] + + +def test_route_jobs_remote(agent_config): + local, remote = agent.route_jobs(agent_config, ["iluvatar_gpu"], local_platform="nvidia") + assert local == [] + assert len(remote) == 1 + assert remote[0][0] == "iluvatar_gpu" + assert remote[0][1] == "http://iluvatar-host:8080" + + +def test_route_jobs_mixed(agent_config): + local, remote = agent.route_jobs( + agent_config, ["nvidia_gpu", "iluvatar_gpu"], local_platform="nvidia" + ) + assert local == ["nvidia_gpu"] + assert len(remote) == 1 + + +def test_route_jobs_no_platform(agent_config): + local, remote = agent.route_jobs(agent_config, ["nvidia_gpu", "iluvatar_gpu"]) + assert len(local) == 2 + assert remote == [] + + +# --------------------------------------------------------------------------- +# verify_signature +# --------------------------------------------------------------------------- + + +def test_verify_signature_valid(): + secret = "my-secret" + body = b'{"action": "push"}' + sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + assert agent.verify_signature(secret, body, sig) is True + + +def test_verify_signature_invalid(): + assert agent.verify_signature("secret", b"body", "sha256=wrong") is False + + +def test_verify_signature_empty(): + assert agent.verify_signature("secret", b"body", "") is False 
+ + +# --------------------------------------------------------------------------- +# JobRequest / JobResult +# --------------------------------------------------------------------------- + + +def test_job_request_fields(agent_config): + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + assert req.job_name == "nvidia_gpu" + assert req.platform == "nvidia" + assert req.commit_sha == "abc123" + assert len(req.job_id) == 8 + d = req.to_dict() + assert d["job_name"] == "nvidia_gpu" + + +def test_job_result_success(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) + assert r.state == "success" + + +def test_job_result_failure(): + r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) + assert r.state == "failure" + + +# --------------------------------------------------------------------------- +# Scheduler +# --------------------------------------------------------------------------- + + +def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + results_dir=Path("/tmp/test-results"), + no_status=True, dry_run=True, + ) + req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config, + results_dir=Path("/tmp/test-results")) + jid = scheduler.submit(req) + results = scheduler.wait_all() + assert len(results) == 1 + assert results[0].state == "success" + + +def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): + pool = MagicMock(spec=res.ResourcePool) + pool.allocate.return_value = ([], False) + pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + + scheduler = agent.Scheduler( + agent_config, "nvidia", pool, + no_status=True, dry_run=False, + ) + + req = 
agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) + scheduler.submit(req) + + info = scheduler.get_job(req.job_id) + assert info["state"] == "queued" + + +def test_scheduler_get_status(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + + status = scheduler.get_status() + assert "queued" in status + assert "running" in status + assert "completed" in status + assert "resources" in status + + +# --------------------------------------------------------------------------- +# WebhookHandler — push event parsing +# --------------------------------------------------------------------------- + + +def test_webhook_parse_push(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} + branch, sha = handler._parse_push(payload) + assert branch == "feat/test" + assert sha == "abc123def456" + + +def test_webhook_parse_pr(): + handler = agent.WebhookHandler.__new__(agent.WebhookHandler) + payload = { + "pull_request": { + "head": { + "ref": "feat/pr-branch", + "sha": "def789", + } + } + } + branch, sha = handler._parse_pull_request(payload) + assert branch == "feat/pr-branch" + assert sha == "def789" + + +# --------------------------------------------------------------------------- +# Integration-style: webhook HTTP test +# --------------------------------------------------------------------------- + + +def _urlopen_no_proxy(url_or_req, **kwargs): + """urlopen that bypasses any HTTP_PROXY.""" + import urllib.request + + opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) + return opener.open(url_or_req, **kwargs) + + +def test_health_endpoint(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + ) + port = 
server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + try: + resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) + data = json.loads(resp.read()) + assert data["status"] == "ok" + assert data["platform"] == "nvidia" + finally: + server.server_close() + + +def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + assert len(data["job_ids"]) >= 1 + finally: + server.server_close() + + +def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + secret = "test-secret" + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + webhook_secret=secret, + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + payload = json.dumps({ + "ref": "refs/heads/master", + "after": "abc123def456", + }).encode() + sig = "sha256=" + 
hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() + + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": sig, + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() + + +def test_webhook_invalid_signature(agent_config, mock_resource_pool): + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + webhook_secret="real-secret", + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + payload = b'{"ref": "refs/heads/master", "after": "abc"}' + req = urllib.request.Request( + f"http://127.0.0.1:{port}/webhook", + data=payload, + headers={ + "Content-Type": "application/json", + "X-GitHub-Event": "push", + "X-Hub-Signature-256": "sha256=invalid", + }, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +# --------------------------------------------------------------------------- +# API token authentication +# --------------------------------------------------------------------------- + + +def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): + """When api_token is set, /api/run rejects requests without valid token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + api_token="my-secret-token", + 
results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.error + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={"Content-Type": "application/json"}, + ) + + try: + with pytest.raises(urllib.error.HTTPError) as exc_info: + _urlopen_no_proxy(req, timeout=5) + + assert exc_info.value.code == 401 + finally: + server.server_close() + + +def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): + """When api_token is set, /api/run accepts requests with correct Bearer token.""" + monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) + + scheduler = agent.Scheduler( + agent_config, "nvidia", mock_resource_pool, + no_status=True, dry_run=True, + ) + server = agent.AgentServer( + "127.0.0.1", 0, agent_config, scheduler, "nvidia", + api_token="my-secret-token", + results_dir=Path("/tmp/test-results"), + ) + port = server.server_address[1] + + t = threading.Thread(target=server.handle_request, daemon=True) + t.start() + + import urllib.request + + body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/api/run", + data=body, + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer my-secret-token", + }, + ) + + try: + resp = _urlopen_no_proxy(req, timeout=5) + data = json.loads(resp.read()) + assert data["accepted"] is True + finally: + server.server_close() diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py new file mode 100644 index 0000000..0efa36e --- /dev/null +++ b/.ci/tests/test_github_status.py @@ -0,0 +1,144 @@ +import json +from unittest.mock import MagicMock, patch + +import pytest + +import github_status as gh + + +# 
--------------------------------------------------------------------------- +# parse_repo_url +# --------------------------------------------------------------------------- + + +def test_parse_repo_url_https(): + owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") + assert owner == "InfiniTensor" + assert repo == "InfiniOps" + + +def test_parse_repo_url_https_no_git(): + owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_ssh(): + owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") + assert owner == "Owner" + assert repo == "Repo" + + +def test_parse_repo_url_invalid(): + owner, repo = gh.parse_repo_url("not-a-url") + assert owner == "" + assert repo == "" + + +# --------------------------------------------------------------------------- +# build_status_context +# --------------------------------------------------------------------------- + + +def test_build_status_context(): + ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") + assert ctx == "ci/infiniops/nvidia_gpu" + + +# --------------------------------------------------------------------------- +# post_commit_status +# --------------------------------------------------------------------------- + + +def test_post_status_no_token(monkeypatch): + monkeypatch.delenv("GITHUB_TOKEN", raising=False) + result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") + assert result is False + + +def test_post_status_missing_owner(): + result = gh.post_commit_status("", "repo", "abc123", "success", "ctx", "desc", token="tok") + assert result is False + + +def test_post_status_success(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured_req = {} + + def mock_urlopen(req, **kwargs): + captured_req["url"] = 
req.full_url + captured_req["data"] = json.loads(req.data) + captured_req["headers"] = dict(req.headers) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "InfiniTensor", + "InfiniOps", + "abc123def", + "success", + "ci/infiniops/nvidia_gpu", + "Tests passed", + token="ghp_test_token", + ) + + assert result is True + assert "abc123def" in captured_req["url"] + assert captured_req["data"]["state"] == "success" + assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" + assert "ghp_test_token" in captured_req["headers"]["Authorization"] + + +def test_post_status_http_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.HTTPError( + url="", code=422, msg="Unprocessable", hdrs=None, fp=None + ) + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_url_error(monkeypatch): + import urllib.error + + def mock_urlopen(req, **kwargs): + raise urllib.error.URLError("connection refused") + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + result = gh.post_commit_status( + "owner", "repo", "sha", "success", "ctx", "desc", token="tok" + ) + assert result is False + + +def test_post_status_truncates_description(monkeypatch): + mock_response = MagicMock() + mock_response.status = 201 + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + captured = {} + + def mock_urlopen(req, **kwargs): + captured["data"] = json.loads(req.data) + return mock_response + + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + + long_desc = "x" * 200 + gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") + + assert len(captured["data"]["description"]) == 140 diff --git 
a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py new file mode 100644 index 0000000..b75043c --- /dev/null +++ b/.ci/tests/test_resource.py @@ -0,0 +1,324 @@ +import threading + +import pytest + +import ci_resource as res + + +# --------------------------------------------------------------------------- +# GpuInfo / SystemResources +# --------------------------------------------------------------------------- + + +def test_gpu_info_fields(): + g = res.GpuInfo(index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50) + assert g.index == 0 + assert g.memory_total_mb == 8000 + + +def test_system_resources_fields(): + s = res.SystemResources(total_memory_mb=32000, available_memory_mb=16000, cpu_count=8) + assert s.cpu_count == 8 + + +# --------------------------------------------------------------------------- +# detect_gpus +# --------------------------------------------------------------------------- + + +def test_detect_gpus_nvidia_parses_csv(monkeypatch): + csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + gpus = pool.detect_gpus() + assert len(gpus) == 2 + assert gpus[0].index == 0 + assert gpus[0].memory_used_mb == 512 + assert gpus[0].utilization_pct == 5 + assert gpus[1].index == 1 + assert gpus[1].utilization_pct == 80 + + +def test_detect_gpus_empty_on_failure(monkeypatch): + def mock_run(cmd, **kwargs): + class R: + returncode = 1 + stdout = "" + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_unknown_platform(): + pool = res.ResourcePool("unknown_platform") + assert pool.detect_gpus() == [] + + +def test_detect_gpus_file_not_found(monkeypatch): + def mock_run(cmd, **kwargs): + raise FileNotFoundError("nvidia-smi not found") + + 
monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + assert pool.detect_gpus() == [] + + +# --------------------------------------------------------------------------- +# detect_system_resources +# --------------------------------------------------------------------------- + + +def test_detect_system_resources(monkeypatch, tmp_path): + meminfo = tmp_path / "meminfo" + meminfo.write_text( + "MemTotal: 32000000 kB\n" + "MemFree: 10000000 kB\n" + "MemAvailable: 20000000 kB\n" + ) + + import io + _real_open = open + + def fake_open(path, **kw): + if str(path) == "/proc/meminfo": + return _real_open(str(meminfo), **kw) + return _real_open(path, **kw) + + monkeypatch.setattr("builtins.open", fake_open) + + pool = res.ResourcePool("nvidia") + sys_res = pool.detect_system_resources() + assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 + assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 + assert sys_res.cpu_count > 0 + + +# --------------------------------------------------------------------------- +# get_free_gpus +# --------------------------------------------------------------------------- + + +def test_get_free_gpus_filters_by_utilization(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + free = pool.get_free_gpus() + assert 0 in free + assert 2 in free + assert 1 not in free + + +# --------------------------------------------------------------------------- +# allocate / release +# --------------------------------------------------------------------------- + + +def test_allocate_success(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + 
monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(1) + assert ok is True + assert len(gpu_ids) == 1 + assert gpu_ids[0] in (0, 1) + + +def test_allocate_insufficient_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(3) + assert ok is False + assert gpu_ids == [] + + +def test_allocate_zero_gpus(): + pool = res.ResourcePool("unknown") + gpu_ids, ok = pool.allocate(0) + assert ok is True + assert gpu_ids == [] + + +def test_release_frees_gpus(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids, ok = pool.allocate(2) + assert ok is True + assert len(gpu_ids) == 2 + + # All GPUs allocated, next allocation should fail + _, ok2 = pool.allocate(1) + assert ok2 is False + + # Release one + pool.release([gpu_ids[0]]) + gpu_ids2, ok3 = pool.allocate(1) + assert ok3 is True + assert gpu_ids2 == [gpu_ids[0]] + + +def test_allocate_excludes_allocated(monkeypatch): + csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=10) + gpu_ids1, _ = pool.allocate(1) + gpu_ids2, _ = pool.allocate(1) + + assert gpu_ids1 != gpu_ids2 + assert set(gpu_ids1 + gpu_ids2) == {0, 1} + + +def test_thread_safety(monkeypatch): + csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" + + def mock_run(cmd, 
**kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia", utilization_threshold=50) + allocated_all = [] + lock = threading.Lock() + + def allocate_one(): + ids, ok = pool.allocate(1) + + if ok: + with lock: + allocated_all.extend(ids) + + threads = [threading.Thread(target=allocate_one) for _ in range(4)] + + for t in threads: + t.start() + + for t in threads: + t.join() + + assert len(allocated_all) == 4 + assert len(set(allocated_all)) == 4 + + +# --------------------------------------------------------------------------- +# get_status +# --------------------------------------------------------------------------- + + +def test_get_status(monkeypatch): + csv_output = "0, 512, 8192, 5\n" + + def mock_run(cmd, **kwargs): + class R: + returncode = 0 + stdout = csv_output + + return R() + + monkeypatch.setattr("subprocess.run", mock_run) + + pool = res.ResourcePool("nvidia") + status = pool.get_status() + assert status["platform"] == "nvidia" + assert len(status["gpus"]) == 1 + assert "system" in status + + +# --------------------------------------------------------------------------- +# parse_gpu_requirement / parse_memory_requirement +# --------------------------------------------------------------------------- + + +def test_parse_gpu_requirement_nvidia(): + job = {"resources": {"gpu_ids": "0,1", "gpu_style": "nvidia"}} + assert res.parse_gpu_requirement(job) == 2 + + +def test_parse_gpu_requirement_none(): + job = {"resources": {"gpu_style": "none"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_all(): + job = {"resources": {"gpu_ids": "all"}} + assert res.parse_gpu_requirement(job) == 0 + + +def test_parse_gpu_requirement_default(): + job = {"resources": {"gpu_ids": "0"}} + assert res.parse_gpu_requirement(job) == 1 + + +def test_parse_memory_requirement_gb(): + assert res.parse_memory_requirement({"resources": {"memory": 
"32GB"}}) == 32 * 1024 + + +def test_parse_memory_requirement_mb(): + assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 + + +def test_parse_memory_requirement_empty(): + assert res.parse_memory_requirement({"resources": {}}) == 0 diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py new file mode 100644 index 0000000..2a930d3 --- /dev/null +++ b/.ci/tests/test_utils.py @@ -0,0 +1,90 @@ +from utils import normalize_config + + +def test_normalize_creates_flat_jobs(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "platforms": { + "nvidia": { + "image": {"dockerfile": ".ci/images/nvidia/"}, + "setup": "pip install .", + "docker_args": ["--gpus", "all"], + "jobs": { + "gpu": { + "resources": {"gpu_ids": "0"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + "multi_gpu": { + "resources": {"gpu_ids": "0,1"}, + "stages": [{"name": "test", "run": "pytest"}], + }, + }, + }, + }, + } + config = normalize_config(raw) + + assert "nvidia_gpu" in config["jobs"] + assert "nvidia_multi_gpu" in config["jobs"] + assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" + assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." 
+ assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] + assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" + assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" + + +def test_normalize_extracts_images(): + raw = { + "platforms": { + "nvidia": { + "image": { + "dockerfile": ".ci/images/nvidia/", + "build_args": {"BASE_IMAGE": "pytorch:latest"}, + }, + "jobs": {}, + }, + }, + } + config = normalize_config(raw) + assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" + assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" + + +def test_normalize_job_overrides_platform_defaults(): + raw = { + "platforms": { + "nvidia": { + "setup": "default setup", + "jobs": { + "special": { + "setup": "custom setup", + "stages": [], + }, + }, + }, + }, + } + config = normalize_config(raw) + assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" + + +def test_normalize_preserves_top_level_keys(): + raw = { + "repo": {"url": "https://github.com/org/repo.git"}, + "github": {"status_context_prefix": "ci/test"}, + "agents": {"nvidia": {"url": "http://host:8080"}}, + "platforms": {}, + } + config = normalize_config(raw) + assert config["repo"]["url"] == "https://github.com/org/repo.git" + assert config["github"]["status_context_prefix"] == "ci/test" + assert config["agents"]["nvidia"]["url"] == "http://host:8080" + + +def test_normalize_passthrough_flat_config(): + """Old flat format without 'platforms' key is returned as-is.""" + flat = { + "images": {"nvidia": {}}, + "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, + } + assert normalize_config(flat) is flat diff --git a/.ci/utils.py b/.ci/utils.py new file mode 100644 index 0000000..7932ba6 --- /dev/null +++ b/.ci/utils.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Shared utilities for the CI toolchain.""" + +import subprocess +import sys + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. 
Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def normalize_config(raw): + """Convert platform-centric config to flat images/jobs format. + + Input (new format): + platforms: + nvidia: + image: {dockerfile: ..., build_args: ...} + setup: pip install .[dev] + jobs: + gpu: {resources: ..., stages: ...} + + Output (flat format consumed by run.py / build.py / agent.py): + images: + nvidia: {dockerfile: ..., build_args: ...} + jobs: + nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} + + If the config already uses the flat format (no 'platforms' key), returns as-is. + """ + if "platforms" not in raw: + return raw + + config = {} + + for key in ("repo", "github", "agents"): + if key in raw: + config[key] = raw[key] + + config["images"] = {} + config["jobs"] = {} + + for platform, pcfg in raw.get("platforms", {}).items(): + # Image config + if "image" in pcfg: + config["images"][platform] = pcfg["image"] + + # Platform-level defaults inherited by jobs + defaults = {} + + for key in ("image_tag", "docker_args", "volumes", "setup", "env"): + if key in pcfg: + defaults[key] = pcfg[key] + + # Flatten jobs: {platform}_{job_name} + for job_name, job_cfg in pcfg.get("jobs", {}).items(): + full_name = f"{platform}_{job_name}" + flat = { + "platform": platform, + "image": defaults.get("image_tag", "latest"), + } + + # Apply platform defaults + for key in ("docker_args", "volumes", "setup", "env"): + if key in defaults: + flat[key] = defaults[key] + + # Job-level overrides + flat.update(job_cfg) + + config["jobs"][full_name] = flat + + return config + + +def load_config(path): + """Load a YAML config file and normalize to flat format.""" + with open(path, encoding="utf-8") as f: + raw = yaml.safe_load(f) + + return normalize_config(raw) + + +def get_git_commit(ref="HEAD", short=True): + """Get git commit SHA. 
Returns 'unknown' on failure.""" + cmd = ["git", "rev-parse"] + + if short: + cmd.append("--short") + + cmd.append(ref) + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + return "unknown" + + return result.stdout.strip() From 5292415c3c5d939fac96788ae55c9d21a6963021 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 06:03:23 +0000 Subject: [PATCH 05/16] docs: add multi-machine deployment guide for NVIDIA and Iluvatar platform --- .ci/README.md | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/.ci/README.md b/.ci/README.md index 33841ca..4e826e8 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -241,3 +241,154 @@ Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: - `success` / `failure` — job 执行完成 Status context 格式:`ci/infiniops/{job_name}` + +--- + +## 多机部署指南 + +以 NVIDIA + Iluvatar 双平台为例,说明如何在两台机器上部署 Agent 并实现跨平台并行测试。 + +### 前置条件(两台机器共同) + +```bash +# 1. Python 3.10+ 和依赖 +pip install pyyaml + +# 2. Docker 已安装 +docker --version + +# 3. 克隆仓库 +git clone https://github.com/InfiniTensor/InfiniOps.git +cd InfiniOps +``` + +### NVIDIA 机器配置 + +```bash +# 1. 安装 NVIDIA Container Toolkit +# 参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html + +# 2. 验证 GPU 可见 +nvidia-smi + +# 3. 构建 CI 镜像 +python .ci/build.py --platform nvidia +``` + +### Iluvatar 机器配置 + +```bash +# 1. 确认 CoreX 运行时已安装 +ixsmi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep corex # 应有 corex:qs_pj20250825 + +# 3. 
构建 CI 镜像 +python .ci/build.py --platform iluvatar +``` + +### 启动 Agent 服务 + +在各自机器上启动 Agent: + +```bash +# NVIDIA 机器 +python .ci/agent.py serve --platform nvidia --port 8080 + +# Iluvatar 机器 +python .ci/agent.py serve --platform iluvatar --port 8080 +``` + +验证连通性: + +```bash +curl http://:8080/health +curl http://:8080/health +``` + +### 配置远程 Agent 地址 + +在触发端的 `config.yaml` 中添加 `agents` 段: + +```yaml +agents: + nvidia: + url: http://:8080 + iluvatar: + url: http://:8080 +``` + +### 触发跨平台测试 + +```bash +# 一键运行所有平台的 job +python .ci/agent.py run --branch master + +# 预览模式(不实际执行) +python .ci/agent.py run --branch master --dry-run --no-status + +# 只运行指定平台 +python .ci/agent.py run --branch master --platform nvidia +``` + +### 可选配置 + +#### GitHub Status 上报 + +两台机器均设置环境变量,各自上报所属平台的测试状态: + +```bash +export GITHUB_TOKEN=ghp_xxxxxxxxxxxx +``` + +#### API Token 认证 + +Agent 暴露在非可信网络时,建议启用 Token 认证: + +```bash +# 启动 Agent 时指定 token +python .ci/agent.py serve --platform nvidia --port 8080 --api-token + +# 或通过环境变量 +export API_TOKEN= +``` + +#### GitHub Webhook 自动触发 + +在 GitHub repo → Settings → Webhooks 中为每台机器添加 Webhook: + +| 字段 | 值 | +|---|---| +| Payload URL | `http://<机器IP>:8080/webhook` | +| Content type | `application/json` | +| Secret | 与 `--webhook-secret` 一致 | +| Events | `push` 和 `pull_request` | + +启动时配置 secret: + +```bash +python .ci/agent.py serve --platform nvidia --port 8080 --webhook-secret + +# 或通过环境变量 +export WEBHOOK_SECRET= +``` + +### 验证清单 + +```bash +# 1. 各机器单独 dry-run +python .ci/agent.py run --branch master --platform nvidia --dry-run --no-status +python .ci/agent.py run --branch master --platform iluvatar --dry-run --no-status + +# 2. 健康检查 +curl http://:8080/health +curl http://:8080/health + +# 3. 查看资源状态 +curl http://:8080/status +curl http://:8080/status + +# 4. 
跨平台一键测试 +python .ci/agent.py run --branch master +``` From 5eb8fdcaf3742989847501b90f8f17c53e849a14 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 09:30:36 +0000 Subject: [PATCH 06/16] feat(ci): enhance CI configuration and agent functionality with platform detection and job resolution --- .ci/README.md | 117 ++++++++++++++++++---------- .ci/agent.py | 164 +++++++++++++--------------------------- .ci/ci_resource.py | 10 +++ .ci/run.py | 143 ++++++++++++++++++++++++----------- .ci/tests/test_agent.py | 32 -------- .ci/utils.py | 11 +++ 6 files changed, 248 insertions(+), 229 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index 4e826e8..1926c66 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -3,16 +3,24 @@ ``` .ci/ ├── config.yaml # 统一配置(镜像、job、Agent 定义) -├── utils.py # 共享工具(load_config、get_git_commit) +├── utils.py # 共享工具(load_config、normalize_config、get_git_commit) ├── agent.py # Runner Agent(调度、Webhook、远程触发) ├── build.py # 镜像构建 ├── run.py # CI 流水线执行(Docker 层) ├── ci_resource.py # GPU/内存资源检测与分配 ├── github_status.py # GitHub Commit Status 上报 -└── images/ - ├── nvidia/Dockerfile - ├── iluvatar/Dockerfile - └── ascend/Dockerfile +├── images/ +│ ├── nvidia/Dockerfile +│ ├── iluvatar/Dockerfile +│ └── ascend/Dockerfile +└── tests/ # 单元测试 + ├── conftest.py + ├── test_agent.py + ├── test_build.py + ├── test_run.py + ├── test_resource.py + ├── test_github_status.py + └── test_utils.py ``` **前置依赖**:Docker、Python 3.10+、`pip install pyyaml` @@ -29,13 +37,22 @@ repo: url: https://github.com/InfiniTensor/InfiniOps.git branch: master +github: + status_context_prefix: "ci/infiniops" + +agents: # 远程 Agent 地址(CLI 跨机器触发用) + nvidia: + url: http://nvidia-host:8080 + iluvatar: + url: http://iluvatar-host:8080 + platforms: nvidia: image: # 镜像定义 dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 - setup: pip install .[dev] # 平台级默认值,job 可覆盖 + setup: pip install .[dev] --no-build-isolation jobs: gpu: # 展平后为 nvidia_gpu 
resources: @@ -64,7 +81,7 @@ platforms: - /lib/firmware:/lib/firmware - /usr/src:/usr/src - /lib/modules:/lib/modules - setup: pip install .[dev] + setup: pip install .[dev] --no-build-isolation jobs: gpu: # 展平后为 iluvatar_gpu resources: @@ -103,6 +120,7 @@ platforms: | 参数 | 说明 | |---|---| | `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | +| `--commit` | 指定 commit ref 作为镜像 tag(默认 HEAD) | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -126,25 +144,31 @@ python .ci/build.py --force ## 流水线执行 `run.py` +平台自动发现(通过检测 `nvidia-smi`/`ixsmi`),无需手动指定。 + | 参数 | 说明 | |---|---| -| `--job` | 指定 job 名称(默认第一个) | -| `--branch` | 覆盖克隆分支 | +| `--config` | 配置文件路径(默认 `.ci/config.yaml`) | +| `--job` | job 名称:短名(`gpu`)或完整名(`nvidia_gpu`)。缺省运行当前平台所有 job | +| `--branch` | 覆盖克隆分支(默认读 config `repo.branch`) | | `--stage` | 只运行指定 stage | | `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID(仅 nvidia gpu_style) | +| `--gpu-id` | 覆盖 GPU 设备 ID(nvidia 通过 `--gpus`,其他平台通过 `CUDA_VISIBLE_DEVICES`) | | `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | | `--dry-run` | 打印 docker 命令不执行 | ```bash -# 运行 NVIDIA job -python .ci/run.py --job nvidia_gpu --branch master +# 最简用法:自动检测平台,运行所有 job,使用 config 默认分支 +python .ci/run.py + +# 指定 job 短名 +python .ci/run.py --job gpu -# 运行 Iluvatar job -python .ci/run.py --job iluvatar_gpu --branch feat/ci-nvidia +# 完整 job 名(向后兼容) +python .ci/run.py --job nvidia_gpu # 只跑 test stage,预览命令 -python .ci/run.py --job iluvatar_gpu --stage test --dry-run +python .ci/run.py --job gpu --stage test --dry-run ``` 容器内执行流程:`git clone` → `checkout` → `setup` → stages。 @@ -158,7 +182,7 @@ python .ci/run.py --job iluvatar_gpu --stage test --dry-run |---|---|---|---| | NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | | Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | -| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善 | +| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善,镜像和 job 尚未就绪 | 
--- @@ -169,43 +193,54 @@ Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感 ### CLI 手动执行 ```bash -# 运行所有 job(本地 + 远程 Agent) -python .ci/agent.py run --branch master +# 运行所有 job(分发到远程 Agent,使用 config 默认分支) +python .ci/agent.py run + +# 指定分支 +python .ci/agent.py run --branch feat/xxx # 运行指定 job -python .ci/agent.py run --branch master --job nvidia_gpu +python .ci/agent.py run --job nvidia_gpu # 按平台运行 -python .ci/agent.py run --branch master --platform nvidia +python .ci/agent.py run --platform nvidia # 预览命令 -python .ci/agent.py run --branch master --dry-run --no-status +python .ci/agent.py run --dry-run ``` | 参数 | 说明 | |---|---| -| `--branch` | 测试分支(必填) | +| `--branch` | 测试分支(默认读 config `repo.branch`) | | `--job` | 指定 job 名称 | | `--platform` | 按平台过滤 job | | `--commit` | 覆盖 commit SHA | | `--image-tag` | 覆盖镜像 tag | -| `--results-dir` | 结果目录(默认 `ci-results`) | -| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | -| `--no-status` | 跳过 GitHub Status 上报 | | `--dry-run` | 预览模式 | ### Webhook 服务 -每台平台机器部署一个 Agent 实例: +每台平台机器部署一个 Agent 实例(平台自动发现): ```bash # NVIDIA 机器 -python .ci/agent.py serve --platform nvidia --port 8080 +python .ci/agent.py serve --port 8080 # Iluvatar 机器 -python .ci/agent.py serve --platform iluvatar --port 8080 +python .ci/agent.py serve --port 8080 ``` +`serve` 子命令额外参数: + +| 参数 | 说明 | +|---|---| +| `--port` | 监听端口(默认 8080) | +| `--host` | 监听地址(默认 `0.0.0.0`) | +| `--webhook-secret` | GitHub Webhook 签名密钥(或 `WEBHOOK_SECRET` 环境变量) | +| `--api-token` | `/api/run` Bearer 认证令牌(或 `AGENT_API_TOKEN` 环境变量) | +| `--results-dir` | 结果目录(默认 `ci-results`) | +| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | + | 端点 | 方法 | 说明 | |---|---|---| | `/webhook` | POST | GitHub Webhook(push/pull_request) | @@ -293,11 +328,11 @@ python .ci/build.py --platform iluvatar 在各自机器上启动 Agent: ```bash -# NVIDIA 机器 -python .ci/agent.py serve --platform nvidia --port 8080 +# NVIDIA 机器(平台自动发现) +python .ci/agent.py serve --port 8080 -# Iluvatar 机器 -python .ci/agent.py serve --platform iluvatar 
--port 8080 +# Iluvatar 机器(平台自动发现) +python .ci/agent.py serve --port 8080 ``` 验证连通性: @@ -322,14 +357,14 @@ agents: ### 触发跨平台测试 ```bash -# 一键运行所有平台的 job -python .ci/agent.py run --branch master +# 一键运行所有平台的 job(使用 config 默认分支) +python .ci/agent.py run # 预览模式(不实际执行) -python .ci/agent.py run --branch master --dry-run --no-status +python .ci/agent.py run --dry-run # 只运行指定平台 -python .ci/agent.py run --branch master --platform nvidia +python .ci/agent.py run --platform nvidia ``` ### 可选配置 @@ -348,10 +383,10 @@ Agent 暴露在非可信网络时,建议启用 Token 认证: ```bash # 启动 Agent 时指定 token -python .ci/agent.py serve --platform nvidia --port 8080 --api-token +python .ci/agent.py serve --port 8080 --api-token # 或通过环境变量 -export API_TOKEN= +export AGENT_API_TOKEN= ``` #### GitHub Webhook 自动触发 @@ -368,7 +403,7 @@ export API_TOKEN= 启动时配置 secret: ```bash -python .ci/agent.py serve --platform nvidia --port 8080 --webhook-secret +python .ci/agent.py serve --port 8080 --webhook-secret # 或通过环境变量 export WEBHOOK_SECRET= @@ -378,8 +413,8 @@ export WEBHOOK_SECRET= ```bash # 1. 各机器单独 dry-run -python .ci/agent.py run --branch master --platform nvidia --dry-run --no-status -python .ci/agent.py run --branch master --platform iluvatar --dry-run --no-status +python .ci/agent.py run --platform nvidia --dry-run +python .ci/agent.py run --platform iluvatar --dry-run # 2. 
健康检查 curl http://:8080/health diff --git a/.ci/agent.py b/.ci/agent.py index 3696ce2..8c53814 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -3,11 +3,11 @@ Usage: # Run jobs locally (or dispatch to remote agents) - python .ci/agent.py run --branch master + python .ci/agent.py run python .ci/agent.py run --branch master --job nvidia_gpu --dry-run - # Start webhook server - python .ci/agent.py serve --platform nvidia --port 8080 + # Start webhook server (auto-detects platform) + python .ci/agent.py serve --port 8080 """ import argparse @@ -137,32 +137,6 @@ def select_jobs(config, platform=None, job_name=None): return list(jobs.keys()) -def route_jobs(config, job_names, local_platform=None): - """Split jobs into local and remote. - - Returns (local_jobs, remote_jobs) where remote_jobs is a list of - (job_name, agent_url) tuples. - """ - agents = config.get("agents", {}) - jobs = config.get("jobs", {}) - local = [] - remote = [] - - for name in job_names: - job = jobs.get(name, {}) - platform = job.get("platform", "") - - if not local_platform: - local.append(name) - elif platform == local_platform: - local.append(name) - elif platform in agents: - remote.append((name, agents[platform].get("url", ""))) - else: - local.append(name) - - return local, remote - # --------------------------------------------------------------------------- # Scheduler @@ -707,8 +681,10 @@ def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200): def cmd_run(args): - """Handle 'run' subcommand: execute jobs locally and/or remotely.""" + """Handle 'run' subcommand: dispatch jobs to platform agents via HTTP.""" config = run.load_config(args.config) + agents = config.get("agents", {}) + branch = args.branch or config.get("repo", {}).get("branch", "master") commit_sha = args.commit or run.get_git_commit(short=False) # Determine which jobs to run @@ -722,57 +698,34 @@ def cmd_run(args): print("error: no matching jobs found", file=sys.stderr) sys.exit(1) - # Detect local platform (if 
running serve on this machine, use that; otherwise guess) - local_platform = args.platform - local_jobs, remote_jobs = route_jobs(config, job_names, local_platform) + # Resolve agent URL for each job + jobs_to_dispatch = [] # [(name, agent_url)] - # Run local jobs - local_results = [] - - if local_jobs: - pool = res.ResourcePool( - local_platform or "unknown", - utilization_threshold=args.utilization_threshold, - ) - scheduler = Scheduler( - config, - local_platform or "unknown", - pool, - results_dir=args.results_dir, - no_status=args.no_status, - dry_run=args.dry_run, - ) + for name in job_names: + job = config.get("jobs", {}).get(name, {}) + platform = job.get("platform", "") + agent_url = agents.get(platform, {}).get("url", "") - for name in local_jobs: - req = JobRequest( - job_name=name, - branch=args.branch, - commit_sha=commit_sha, - config=config, - image_tag=args.image_tag, - results_dir=args.results_dir, - ) - scheduler.submit(req) + if not agent_url: + print(f"error: no agent URL configured for platform {platform!r} (job {name})", file=sys.stderr) + sys.exit(1) - local_results = scheduler.wait_all() + jobs_to_dispatch.append((name, agent_url)) - # Dispatch remote jobs - remote_results = [] api_token = os.environ.get("AGENT_API_TOKEN", "") + results = [] - if remote_jobs and not args.dry_run: - # Dispatch all remote jobs first, then poll concurrently + if args.dry_run: + for name, agent_url in jobs_to_dispatch: + print(f"[dry-run] dispatch {name} to {agent_url}") + else: + # Dispatch all jobs, then poll concurrently dispatched = [] # [(name, agent_url, job_id)] - for name, agent_url in remote_jobs: - if not agent_url: - print(f"warning: no agent URL for {name}, skipping", file=sys.stderr) - remote_results.append({"job_name": name, "state": "error"}) - continue - + for name, agent_url in jobs_to_dispatch: print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) job_id = dispatch_remote_job( - agent_url, name, args.branch, commit_sha, 
args.image_tag, + agent_url, name, branch, commit_sha, args.image_tag, api_token=api_token or None, ) @@ -781,9 +734,8 @@ def cmd_run(args): dispatched.append((name, agent_url, job_id)) else: print(f" failed to dispatch {name}", file=sys.stderr) - remote_results.append({"job_name": name, "state": "error"}) + results.append({"job_name": name, "state": "error"}) - # Poll all dispatched jobs concurrently if dispatched: with ThreadPoolExecutor(max_workers=len(dispatched)) as executor: futures = { @@ -796,28 +748,16 @@ def cmd_run(args): result = future.result() if result: - remote_results.append(result) + results.append(result) else: print(f" timeout waiting for {name}", file=sys.stderr) - remote_results.append({"job_name": name, "state": "timeout"}) - - elif remote_jobs and args.dry_run: - for name, agent_url in remote_jobs: - print(f"[dry-run] dispatch {name} to {agent_url}") + results.append({"job_name": name, "state": "timeout"}) # Summary print("\n========== Results ==========") all_ok = True - for r in local_results: - status = "PASS" if r.returncode == 0 else "FAIL" - - if r.returncode != 0: - all_ok = False - - print(f" {status} {r.job_name} ({r.duration:.0f}s) {r.results_dir}") - - for r in remote_results: + for r in results: state = r.get("state", "unknown") name = r.get("job_name", "?") status = "PASS" if state == STATE_SUCCESS else "FAIL" @@ -826,7 +766,7 @@ def cmd_run(args): all_ok = False duration = r.get("duration_seconds", 0) - print(f" {status} {name} ({duration:.0f}s) [remote]") + print(f" {status} {name} ({duration:.0f}s)") if not all_ok: sys.exit(1) @@ -836,13 +776,31 @@ def cmd_serve(args): """Handle 'serve' subcommand: start webhook server.""" config = run.load_config(args.config) + platform = res.detect_platform() + + if not platform: + print( + "error: could not detect platform (no nvidia-smi or ixsmi found)", + file=sys.stderr, + ) + sys.exit(1) + + platform_jobs = select_jobs(config, platform=platform) + + if not platform_jobs: + print( + 
f"error: platform {platform!r} detected but no jobs defined in config", + file=sys.stderr, + ) + sys.exit(1) + pool = res.ResourcePool( - args.platform, + platform, utilization_threshold=args.utilization_threshold, ) scheduler = Scheduler( config, - args.platform, + platform, pool, results_dir=args.results_dir, ) @@ -869,14 +827,14 @@ def cmd_serve(args): args.port, config, scheduler, - args.platform, + platform, webhook_secret=webhook_secret or None, api_token=api_token or None, results_dir=args.results_dir, ) print( - f"Agent serving on {args.host}:{args.port} (platform={args.platform})", + f"Agent serving on {args.host}:{args.port} (platform={platform})", file=sys.stderr, ) print(f" POST /webhook — GitHub webhook", file=sys.stderr) @@ -905,23 +863,11 @@ def main(): type=Path, default=Path(__file__).resolve().parent / "config.yaml", ) - run_parser.add_argument("--branch", type=str, required=True, help="Branch to test") + run_parser.add_argument("--branch", type=str, help="Branch to test (default: config repo.branch)") run_parser.add_argument("--job", type=str, help="Specific job name") run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") run_parser.add_argument("--image-tag", type=str, help="Override image tag") run_parser.add_argument("--commit", type=str, help="Override commit SHA") - run_parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - ) - run_parser.add_argument( - "--utilization-threshold", - type=int, - default=10, - help="GPU utilization threshold (%%) to consider free (default: 10)", - ) - run_parser.add_argument("--no-status", action="store_true", help="Skip GitHub status") run_parser.add_argument("--dry-run", action="store_true") # --- serve subcommand --- @@ -931,12 +877,6 @@ def main(): type=Path, default=Path(__file__).resolve().parent / "config.yaml", ) - serve_parser.add_argument( - "--platform", - type=str, - required=True, - help="Platform this agent handles (nvidia, iluvatar, etc.)", 
- ) serve_parser.add_argument("--port", type=int, default=8080) serve_parser.add_argument("--host", type=str, default="0.0.0.0") serve_parser.add_argument("--webhook-secret", type=str) diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index f3dbfb1..47b9737 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -2,6 +2,7 @@ """Resource detection and allocation for CI Runner Agent.""" import os +import shutil import subprocess import threading from dataclasses import dataclass, field @@ -239,3 +240,12 @@ def parse_memory_requirement(job_config) -> float: return float(memory) * 1024 # Default: GB except ValueError: return 0 + + +def detect_platform(): + """Auto-detect the current platform by probing GPU query tools on PATH.""" + for platform, tool in ResourcePool.GPU_QUERY_TOOLS.items(): + if shutil.which(tool): + return platform + + return None diff --git a/.ci/run.py b/.ci/run.py index 2575781..6c108e4 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,7 +9,7 @@ from datetime import datetime from pathlib import Path -from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE +from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE, detect_platform from utils import get_git_commit, load_config @@ -183,6 +183,42 @@ def build_docker_args( return args +def resolve_job_names(jobs, platform, job=None): + """Resolve job names for a platform. + + - ``job=None`` — all jobs for the platform. + - ``job="gpu"`` (short name) — matched via ``short_name`` field. + - ``job="nvidia_gpu"`` (full name) — direct lookup. 
+ """ + if job and job in jobs: + return [job] + + if job: + matches = [ + name for name, cfg in jobs.items() + if cfg.get("platform") == platform and cfg.get("short_name") == job + ] + + if not matches: + print( + f"error: job {job!r} not found for platform {platform!r}", + file=sys.stderr, + ) + sys.exit(1) + + return matches + + matches = [ + name for name, cfg in jobs.items() if cfg.get("platform") == platform + ] + + if not matches: + print(f"error: no jobs for platform {platform!r}", file=sys.stderr) + sys.exit(1) + + return matches + + def main(): parser = argparse.ArgumentParser(description="Run Docker CI pipeline") parser.add_argument( @@ -191,8 +227,12 @@ def main(): default=Path(__file__).resolve().parent / "config.yaml", help="Path to config.yaml", ) - parser.add_argument("--branch", type=str, help="Override repo branch") - parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument("--branch", type=str, help="Override repo branch (default: config repo.branch)") + parser.add_argument( + "--job", + type=str, + help="Job name: short name (gpu) or full name (nvidia_gpu). 
Default: all jobs", + ) parser.add_argument( "--stage", type=str, @@ -226,53 +266,68 @@ def main(): repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") branch = args.branch or repo.get("branch", "master") - jobs = config.get("jobs", {}) + platform = detect_platform() - if not jobs: - print("error: no jobs in config", file=sys.stderr) - sys.exit(1) - - job_name = args.job or next(iter(jobs)) - - if job_name not in jobs: - print(f"error: job {job_name!r} not in config", file=sys.stderr) + if not platform: + print( + "error: could not detect platform (no nvidia-smi or ixsmi found)", + file=sys.stderr, + ) sys.exit(1) - job = jobs[job_name] - all_stages = job.get("stages", []) - - if args.stage: - stages = [s for s in all_stages if s["name"] == args.stage] - - if not stages: - print(f"error: stage {args.stage!r} not found", file=sys.stderr) - sys.exit(1) - else: - stages = all_stages + print(f"platform: {platform}", file=sys.stderr) - platform = job.get("platform", "nvidia") - commit = get_git_commit() - results_dir = build_results_dir(args.results_dir, platform, stages, commit) - - workdir = "/workspace" - docker_args = build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - workdir, - args.image_tag, - gpu_id_override=args.gpu_id, - results_dir=results_dir, - ) + jobs = config.get("jobs", {}) - if args.dry_run: - print(shlex.join(docker_args)) - return + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) - results_dir.mkdir(parents=True, exist_ok=True) - sys.exit(subprocess.run(docker_args).returncode) + job_names = resolve_job_names(jobs, platform, job=args.job) + failed = 0 + + for job_name in job_names: + job = jobs[job_name] + all_stages = job.get("stages", []) + + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + + if not stages: + print(f"error: stage {args.stage!r} not found in {job_name}", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + 
job_platform = job.get("platform", platform) + commit = get_git_commit() + results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) + + docker_args = build_docker_args( + config, + job_name, + repo_url, + branch, + stages, + "/workspace", + args.image_tag, + gpu_id_override=args.gpu_id, + results_dir=results_dir, + ) + + if args.dry_run: + print(shlex.join(docker_args)) + continue + + print(f"==> running job: {job_name}", file=sys.stderr) + results_dir.mkdir(parents=True, exist_ok=True) + returncode = subprocess.run(docker_args).returncode + + if returncode != 0: + print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr) + failed += 1 + + sys.exit(1 if failed else 0) if __name__ == "__main__": diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py index 5741385..aa181c4 100644 --- a/.ci/tests/test_agent.py +++ b/.ci/tests/test_agent.py @@ -115,38 +115,6 @@ def test_select_jobs_invalid_name(agent_config): agent.select_jobs(agent_config, job_name="not_exist") -# --------------------------------------------------------------------------- -# route_jobs -# --------------------------------------------------------------------------- - - -def test_route_jobs_local(agent_config): - local, remote = agent.route_jobs(agent_config, ["nvidia_gpu"], local_platform="nvidia") - assert local == ["nvidia_gpu"] - assert remote == [] - - -def test_route_jobs_remote(agent_config): - local, remote = agent.route_jobs(agent_config, ["iluvatar_gpu"], local_platform="nvidia") - assert local == [] - assert len(remote) == 1 - assert remote[0][0] == "iluvatar_gpu" - assert remote[0][1] == "http://iluvatar-host:8080" - - -def test_route_jobs_mixed(agent_config): - local, remote = agent.route_jobs( - agent_config, ["nvidia_gpu", "iluvatar_gpu"], local_platform="nvidia" - ) - assert local == ["nvidia_gpu"] - assert len(remote) == 1 - - -def test_route_jobs_no_platform(agent_config): - local, remote = agent.route_jobs(agent_config, ["nvidia_gpu", 
"iluvatar_gpu"]) - assert len(local) == 2 - assert remote == [] - # --------------------------------------------------------------------------- # verify_signature diff --git a/.ci/utils.py b/.ci/utils.py index 7932ba6..07dec87 100644 --- a/.ci/utils.py +++ b/.ci/utils.py @@ -61,6 +61,7 @@ def normalize_config(raw): full_name = f"{platform}_{job_name}" flat = { "platform": platform, + "short_name": job_name, "image": defaults.get("image_tag", "latest"), } @@ -74,6 +75,16 @@ def normalize_config(raw): config["jobs"][full_name] = flat + # Warn on mismatched agent/platform keys (catches typos like 'nvdia'). + agent_keys = set(config.get("agents", {}).keys()) + platform_keys = set(raw.get("platforms", {}).keys()) + + for key in agent_keys - platform_keys: + print( + f"warning: agents.{key} has no matching platform in platforms.*", + file=sys.stderr, + ) + return config From 038f884e1873908ffb61f64f9b4d73e851f6d0b9 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 15:55:27 +0000 Subject: [PATCH 07/16] feat(ci): add MetaX platform CI support Add Dockerfile, config, and mx-smi GPU detection for MetaX (MACA) platform. Co-Authored-By: Claude Opus 4.6 --- .ci/ci_resource.py | 83 ++++++++++++++++++++++++++++++++++++- .ci/config.yaml | 24 +++++++++++ .ci/images/metax/Dockerfile | 46 ++++++++++++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 .ci/images/metax/Dockerfile diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index 47b9737..a24041f 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -30,13 +30,14 @@ class SystemResources: class ResourcePool: """Thread-safe GPU and system resource manager. - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi) + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi) and tracks allocations to enable dynamic parallel scheduling. 
""" GPU_QUERY_TOOLS = { "nvidia": "nvidia-smi", "iluvatar": "ixsmi", + "metax": "mx-smi", } def __init__(self, platform, utilization_threshold=10): @@ -56,6 +57,9 @@ def allocated(self): def detect_gpus(self) -> list[GpuInfo]: """Query GPU status via platform-specific CLI tool.""" + if self._platform == "metax": + return self._detect_gpus_metax() + tool = self.GPU_QUERY_TOOLS.get(self._platform) if not tool: @@ -100,6 +104,83 @@ def detect_gpus(self) -> list[GpuInfo]: return gpus + def _detect_gpus_metax(self) -> list[GpuInfo]: + """Parse mx-smi output for MetaX GPUs. + + Runs --show-memory and --show-usage separately and merges results. + Output format example: + GPU#0 MXC550 0000:1a:00.0 + Memory + vis_vram total : 67108864 KB + vis_vram used : 879032 KB + Utilization + GPU : 0 % + """ + import re + + def run_mxsmi(flag): + try: + r = subprocess.run( + ["mx-smi", flag], + capture_output=True, text=True, timeout=10, + ) + return r.stdout if r.returncode == 0 else "" + except (FileNotFoundError, subprocess.TimeoutExpired): + return "" + + mem_out = run_mxsmi("--show-memory") + util_out = run_mxsmi("--show-usage") + + # Parse memory: collect {index: (used_kb, total_kb)} + mem = {} + current = None + for line in mem_out.splitlines(): + m = re.match(r"GPU#(\d+)", line.strip()) + if m: + current = int(m.group(1)) + mem[current] = [0.0, 0.0] + continue + if current is None: + continue + m = re.search(r"vis_vram total\s*:\s*([\d.]+)\s*KB", line) + if m: + mem[current][1] = float(m.group(1)) / 1024 # KB -> MB + m = re.search(r"vis_vram used\s*:\s*([\d.]+)\s*KB", line) + if m: + mem[current][0] = float(m.group(1)) / 1024 # KB -> MB + + # Parse utilization: collect {index: utilization_pct} + util = {} + current = None + in_util = False + for line in util_out.splitlines(): + m = re.match(r"GPU#(\d+)", line.strip()) + if m: + current = int(m.group(1)) + in_util = False + continue + if current is None: + continue + if "Utilization" in line: + in_util = True + continue + if 
in_util: + m = re.match(r"\s*GPU\s*:\s*([\d.]+)\s*%", line) + if m: + util[current] = float(m.group(1)) + in_util = False + + gpus = [] + for idx in sorted(mem): + used_mb, total_mb = mem[idx] + gpus.append(GpuInfo( + index=idx, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util.get(idx, 0.0), + )) + return gpus + def detect_system_resources(self) -> SystemResources: """Read system memory from /proc/meminfo and CPU count.""" total_mb = 0.0 diff --git a/.ci/config.yaml b/.ci/config.yaml index e62bc07..171c9e9 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -61,6 +61,30 @@ platforms: - name: test run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + metax: + image: + dockerfile: .ci/images/metax/ + build_args: + BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + - "--ulimit=memlock=-1" + - "--ulimit=stack=67108864" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: none # MetaX 设备通过 --privileged 透传,无需 CUDA_VISIBLE_DEVICES + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet image: dockerfile: .ci/images/ascend/ diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile new file mode 100644 index 0000000..fda527c --- /dev/null +++ b/.ci/images/metax/Dockerfile @@ -0,0 +1,46 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# conda Python is used in this image +ENV PATH=/opt/conda/bin:${PATH} + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" 
/etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + coreutils \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest-cov \ + pytest-xdist \ + pyyaml \ + ruff==0.15.7 + +# Pin pre-installed MetaX torch to prevent pip from replacing it with upstream version +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace From 78deba2189d7e1cc17891b1ba55c20a9e89d403c Mon Sep 17 00:00:00 2001 From: zhangyue Date: Mon, 23 Mar 2026 16:27:00 +0000 Subject: [PATCH 08/16] feat(ci): improve job dispatch logging and handle job results more effectively --- .ci/agent.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.ci/agent.py b/.ci/agent.py index 8c53814..0fa3715 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -24,7 +24,7 @@ import urllib.error import urllib.request import uuid -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path @@ -717,13 +717,18 @@ def cmd_run(args): if args.dry_run: for name, agent_url in jobs_to_dispatch: - print(f"[dry-run] dispatch {name} to {agent_url}") + platform, _, job = name.partition("_") + print(f"[dry-run] dispatch {platform} {job} job to {agent_url}") else: - # Dispatch all jobs, then poll concurrently + # Dispatch all jobs, then poll concurrently. 
dispatched = [] # [(name, agent_url, job_id)] for name, agent_url in jobs_to_dispatch: - print(f"==> dispatching {name} to {agent_url}", file=sys.stderr) + platform, _, job = name.partition("_") + print( + f"==> dispatching {platform} {job} job to {agent_url}", + file=sys.stderr, + ) job_id = dispatch_remote_job( agent_url, name, branch, commit_sha, args.image_tag, api_token=api_token or None, @@ -743,14 +748,21 @@ def cmd_run(args): for name, url, jid in dispatched } - for future in futures: + for future in as_completed(futures): name, _, _ = futures[future] result = future.result() if result: + state = result.get("state", "unknown") + duration = result.get("duration_seconds", 0) + tag = "PASS" if state == STATE_SUCCESS else "FAIL" + print( + f"<== {tag} {name} ({duration:.0f}s)", + file=sys.stderr, + ) results.append(result) else: - print(f" timeout waiting for {name}", file=sys.stderr) + print(f"<== TIMEOUT {name}", file=sys.stderr) results.append({"job_name": name, "state": "timeout"}) # Summary From a599ba9615e08f5e730ec7745e1bc98c276edb1f Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 17:45:27 +0800 Subject: [PATCH 09/16] feat(ci): add Moore Threads (MUSA) platform CI support Add GPU detection via mthreads-gmi, Dockerfile, config, and update docs with Moore and MetaX platform deployment instructions. 
Co-Authored-By: Claude Opus 4.6 --- .ci/README.md | 65 ++++++++++++++++++++++++++++--- .ci/ci_resource.py | 76 ++++++++++++++++++++++++++++++++++++- .ci/config.yaml | 22 +++++++++++ .ci/images/moore/Dockerfile | 38 +++++++++++++++++++ 4 files changed, 194 insertions(+), 7 deletions(-) create mode 100644 .ci/images/moore/Dockerfile diff --git a/.ci/README.md b/.ci/README.md index 1926c66..12e8094 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -12,6 +12,8 @@ ├── images/ │ ├── nvidia/Dockerfile │ ├── iluvatar/Dockerfile +│ ├── metax/Dockerfile +│ ├── moore/Dockerfile │ └── ascend/Dockerfile └── tests/ # 单元测试 ├── conftest.py @@ -119,7 +121,7 @@ platforms: | 参数 | 说明 | |---|---| -| `--platform nvidia\|iluvatar\|ascend\|all` | 构建平台,默认 `all` | +| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | 构建平台,默认 `all` | | `--commit` | 指定 commit ref 作为镜像 tag(默认 HEAD) | | `--force` | 跳过 Dockerfile 变更检测 | | `--dry-run` | 打印命令不执行 | @@ -144,7 +146,7 @@ python .ci/build.py --force ## 流水线执行 `run.py` -平台自动发现(通过检测 `nvidia-smi`/`ixsmi`),无需手动指定。 +平台自动发现(通过检测 `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`),无需手动指定。 | 参数 | 说明 | |---|---| @@ -182,6 +184,8 @@ python .ci/run.py --job gpu --stage test --dry-run |---|---|---|---| | NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | | Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | +| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA 运行时,通过 `mx-smi` 检测 | +| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA 运行时,通过 `mthreads-gmi` 检测 | | Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善,镜像和 job 尚未就绪 | --- @@ -228,6 +232,9 @@ python .ci/agent.py serve --port 8080 # Iluvatar 机器 python .ci/agent.py serve --port 8080 + +# MetaX 机器 +python .ci/agent.py serve --port 8080 ``` `serve` 子命令额外参数: @@ -261,6 +268,10 @@ agents: url: http://nvidia-host:8080 iluvatar: url: http://iluvatar-host:8080 + metax: + url: http://metax-host:8080 + moore: + url: 
http://moore-host:8080 ``` ### 资源调度 @@ -281,9 +292,9 @@ Status context 格式:`ci/infiniops/{job_name}` ## 多机部署指南 -以 NVIDIA + Iluvatar 双平台为例,说明如何在两台机器上部署 Agent 并实现跨平台并行测试。 +以 NVIDIA + Iluvatar + MetaX + Moore 多平台为例,说明如何在多台机器上部署 Agent 并实现跨平台并行测试。 -### 前置条件(两台机器共同) +### 前置条件(所有机器共同) ```bash # 1. Python 3.10+ 和依赖 @@ -323,6 +334,32 @@ docker images | grep corex # 应有 corex:qs_pj20250825 python .ci/build.py --platform iluvatar ``` +### MetaX 机器配置 + +```bash +# 1. 确认 MACA 运行时已安装 +mx-smi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep maca-pytorch # 应有 maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 + +# 3. 构建 CI 镜像 +python .ci/build.py --platform metax +``` + +### Moore 机器配置 + +```bash +# 1. 确认 MUSA 运行时已安装 +mthreads-gmi + +# 2. 确认基础镜像已导入(非公开镜像,需提前准备) +docker images | grep vllm_musa # 应有 vllm_musa:20251112_hygon + +# 3. 构建 CI 镜像 +python .ci/build.py --platform moore +``` + ### 启动 Agent 服务 在各自机器上启动 Agent: @@ -333,6 +370,12 @@ python .ci/agent.py serve --port 8080 # Iluvatar 机器(平台自动发现) python .ci/agent.py serve --port 8080 + +# MetaX 机器(平台自动发现) +python .ci/agent.py serve --port 8080 + +# Moore 机器(平台自动发现) +python .ci/agent.py serve --port 8080 ``` 验证连通性: @@ -340,6 +383,8 @@ python .ci/agent.py serve --port 8080 ```bash curl http://:8080/health curl http://:8080/health +curl http://:8080/health +curl http://:8080/health ``` ### 配置远程 Agent 地址 @@ -352,6 +397,10 @@ agents: url: http://:8080 iluvatar: url: http://:8080 + metax: + url: http://:8080 + moore: + url: http://:8080 ``` ### 触发跨平台测试 @@ -371,7 +420,7 @@ python .ci/agent.py run --platform nvidia #### GitHub Status 上报 -两台机器均设置环境变量,各自上报所属平台的测试状态: +所有机器均设置环境变量,各自上报所属平台的测试状态: ```bash export GITHUB_TOKEN=ghp_xxxxxxxxxxxx @@ -415,14 +464,20 @@ export WEBHOOK_SECRET= # 1. 各机器单独 dry-run python .ci/agent.py run --platform nvidia --dry-run python .ci/agent.py run --platform iluvatar --dry-run +python .ci/agent.py run --platform metax --dry-run +python .ci/agent.py run --platform moore --dry-run # 2. 
健康检查 curl http://:8080/health curl http://:8080/health +curl http://:8080/health +curl http://:8080/health # 3. 查看资源状态 curl http://:8080/status curl http://:8080/status +curl http://:8080/status +curl http://:8080/status # 4. 跨平台一键测试 python .ci/agent.py run --branch master diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index a24041f..a49cbff 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 """Resource detection and allocation for CI Runner Agent.""" +import json import os +import re import shutil import subprocess import threading @@ -30,7 +32,7 @@ class SystemResources: class ResourcePool: """Thread-safe GPU and system resource manager. - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi) + Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi, mthreads-gmi) and tracks allocations to enable dynamic parallel scheduling. """ @@ -38,6 +40,7 @@ class ResourcePool: "nvidia": "nvidia-smi", "iluvatar": "ixsmi", "metax": "mx-smi", + "moore": "mthreads-gmi", } def __init__(self, platform, utilization_threshold=10): @@ -60,6 +63,9 @@ def detect_gpus(self) -> list[GpuInfo]: if self._platform == "metax": return self._detect_gpus_metax() + if self._platform == "moore": + return self._detect_gpus_moore() + tool = self.GPU_QUERY_TOOLS.get(self._platform) if not tool: @@ -116,7 +122,6 @@ def _detect_gpus_metax(self) -> list[GpuInfo]: Utilization GPU : 0 % """ - import re def run_mxsmi(flag): try: @@ -181,6 +186,73 @@ def run_mxsmi(flag): )) return gpus + def _detect_gpus_moore(self) -> list[GpuInfo]: + """Parse mthreads-gmi JSON output for Moore Threads GPUs. 
+ + Uses: mthreads-gmi -q --json + Expected JSON structure: + { + "Attached GPUs": { + "GPU 00000000:3B:00.0": { + "Minor Number": "0", + "Memory Usage": { + "Total": "24576 MiB", + "Used": "512 MiB" + }, + "Utilization": { + "Gpu": "5 %" + } + } + } + } + """ + def extract_number(s): + m = re.search(r"([\d.]+)", str(s)) + return float(m.group(1)) if m else 0.0 + + try: + result = subprocess.run( + ["mthreads-gmi", "-q", "--json"], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + try: + data = json.loads(result.stdout) + except json.JSONDecodeError: + return [] + + gpus = [] + attached = data.get("Attached GPUs", {}) + + for gpu_data in attached.values(): + try: + index = int(gpu_data.get("Minor Number", len(gpus))) + + mem = gpu_data.get("Memory Usage", {}) + total_mb = extract_number(mem.get("Total", "0 MiB")) + used_mb = extract_number(mem.get("Used", "0 MiB")) + util_pct = extract_number( + gpu_data.get("Utilization", {}).get("Gpu", "0 %") + ) + + gpus.append(GpuInfo( + index=index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + )) + except (ValueError, AttributeError): + continue + + return sorted(gpus, key=lambda g: g.index) + def detect_system_resources(self) -> SystemResources: """Read system memory from /proc/meminfo and CPU count.""" total_mb = 0.0 diff --git a/.ci/config.yaml b/.ci/config.yaml index 171c9e9..24b4006 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -85,6 +85,28 @@ platforms: - name: test run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + moore: + image: + dockerfile: .ci/images/moore/ + build_args: + BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon + APT_MIRROR: http://archive.ubuntu.com/ubuntu + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + setup: pip install .[dev] --no-build-isolation + jobs: 
+ gpu: + resources: + gpu_ids: "0" + gpu_style: none # Moore 设备通过 --privileged 透传,MTHREADS_VISIBLE_DEVICES 由基础镜像设置 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet image: dockerfile: .ci/images/ascend/ diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile new file mode 100644 index 0000000..9a073ba --- /dev/null +++ b/.ci/images/moore/Dockerfile @@ -0,0 +1,38 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +# MUSA_HOME, PATH, LD_LIBRARY_PATH already set by base image + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +ARG APT_MIRROR +RUN if [ -n "$APT_MIRROR" ]; then \ + sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ + fi && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + libclang \ + pytest-cov \ + pytest-xdist \ + ruff==0.15.7 + +# Pin pre-installed torch to prevent pip from replacing it with upstream version +RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace From 3166c87b224e8678651a0b69a4e9a13d1775e44e Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 11:23:23 +0000 Subject: [PATCH 10/16] feat(ci): capture Docker error output for remote job diagnostics --- .ci/agent.py | 129 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 98 insertions(+), 31 deletions(-) diff --git a/.ci/agent.py b/.ci/agent.py index 0fa3715..485221f 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -52,6 
+52,8 @@ STATE_FAILURE = "failure" STATE_ERROR = "error" +TAIL_LINES = 50 + # urllib helpers (module-level for easier mocking in tests) urllib_request = urllib.request.Request urllib_urlopen = urllib.request.urlopen @@ -65,7 +67,9 @@ class JobRequest: """Describes a CI job to be executed.""" - def __init__(self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None): + def __init__( + self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None + ): self.job_id = str(uuid.uuid4())[:8] self.job_name = job_name self.branch = branch @@ -92,18 +96,28 @@ def to_dict(self): class JobResult: """Outcome of a completed job.""" - def __init__(self, job_id, job_name, commit_sha, returncode, results_dir, duration): + def __init__( + self, + job_id, + job_name, + commit_sha, + returncode, + results_dir, + duration, + error_tail=None, + ): self.job_id = job_id self.job_name = job_name self.commit_sha = commit_sha self.returncode = returncode self.results_dir = results_dir self.duration = duration + self.error_tail = error_tail or [] self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE def to_dict(self): - return { + d = { "job_id": self.job_id, "job_name": self.job_name, "commit_sha": self.commit_sha, @@ -113,6 +127,11 @@ def to_dict(self): "duration_seconds": round(self.duration, 1), } + if self.error_tail: + d["error_tail"] = self.error_tail + + return d + # --------------------------------------------------------------------------- # Job selection and routing @@ -130,14 +149,11 @@ def select_jobs(config, platform=None, job_name=None): return [job_name] if platform: - return [ - name for name, job in jobs.items() if job.get("platform") == platform - ] + return [name for name, job in jobs.items() if job.get("platform") == platform] return list(jobs.keys()) - # --------------------------------------------------------------------------- # Scheduler # --------------------------------------------------------------------------- @@ -211,10 
+227,7 @@ def get_job(self, job_id): def get_status(self): """Return scheduler status for the /status endpoint.""" with self._lock: - queued = [ - self._jobs[r.job_id]["request"].to_dict() - for r in self._queue - ] + queued = [self._jobs[r.job_id]["request"].to_dict() for r in self._queue] running = [] completed = [] @@ -222,7 +235,9 @@ def get_status(self): state = entry["state"] if state == STATE_RUNNING: - running.append({**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]}) + running.append( + {**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]} + ) elif state in (STATE_SUCCESS, STATE_FAILURE): completed.append(entry["result"].to_dict()) @@ -238,7 +253,8 @@ def wait_all(self): while True: with self._lock: pending = any( - e["state"] in (STATE_QUEUED, STATE_RUNNING) for e in self._jobs.values() + e["state"] in (STATE_QUEUED, STATE_RUNNING) + for e in self._jobs.values() ) if not pending: @@ -248,11 +264,7 @@ def wait_all(self): self._done_event.clear() with self._lock: - return [ - e["result"] - for e in self._jobs.values() - if e["result"] is not None - ] + return [e["result"] for e in self._jobs.values() if e["result"] is not None] def _try_schedule(self): """Try to run queued jobs that have enough resources. 
@@ -315,7 +327,9 @@ def _run_job(self, req, gpu_ids): job_cfg = self._config["jobs"][req.job_name] all_stages = job_cfg.get("stages", []) repo_url = self._config.get("repo", {}).get("url", "") - commit_short = req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + commit_short = ( + req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha + ) results_dir = run.build_results_dir( req.results_dir, req.platform, all_stages, commit_short ) @@ -338,10 +352,30 @@ def _run_job(self, req, gpu_ids): if self._dry_run: print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") returncode = 0 + error_tail = [] else: results_dir.mkdir(parents=True, exist_ok=True) - proc = subprocess.run(docker_args) - returncode = proc.returncode + proc = subprocess.Popen( + docker_args, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + tail_buf = collections.deque(maxlen=TAIL_LINES) + + for line in proc.stdout: + sys.stdout.buffer.write(line) + tail_buf.append(line) + + proc.stdout.close() + returncode = proc.wait() + + if returncode != 0: + error_tail = [ + raw.decode("utf-8", errors="replace").rstrip("\n") + for raw in tail_buf + ] + else: + error_tail = [] duration = time.monotonic() - start @@ -352,6 +386,7 @@ def _run_job(self, req, gpu_ids): returncode=returncode, results_dir=results_dir, duration=duration, + error_tail=error_tail, ) # Post final status @@ -365,7 +400,9 @@ def _run_job(self, req, gpu_ids): f"{req.job_name}: {result.state} in {duration:.0f}s", ) except Exception as e: - print(f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr) + print( + f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr + ) if result is None: result = JobResult( @@ -375,6 +412,7 @@ def _run_job(self, req, gpu_ids): returncode=-1, results_dir=req.results_dir, duration=0, + error_tail=[str(e)], ) if not self._no_status: @@ -392,7 +430,9 @@ def _run_job(self, req, gpu_ids): with self._lock: self._jobs[req.job_id]["result"] = 
result - self._jobs[req.job_id]["state"] = result.state if result else STATE_FAILURE + self._jobs[req.job_id]["state"] = ( + result.state if result else STATE_FAILURE + ) self._done_event.set() self._try_schedule() @@ -410,9 +450,9 @@ def verify_signature(secret, body, signature_header): if not signature_header: return False - expected = "sha256=" + hmac.new( - secret.encode("utf-8"), body, hashlib.sha256 - ).hexdigest() + expected = ( + "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest() + ) return hmac.compare_digest(expected, signature_header) @@ -567,7 +607,9 @@ def _parse_pull_request(self, payload): def _submit_jobs(self, branch, sha, job_name=None, image_tag=None): config = self.server.config - job_names = select_jobs(config, platform=self.server.platform, job_name=job_name) + job_names = select_jobs( + config, platform=self.server.platform, job_name=job_name + ) job_ids = [] for name in job_names: @@ -621,7 +663,9 @@ def __init__( # --------------------------------------------------------------------------- -def dispatch_remote_job(agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None): +def dispatch_remote_job( + agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None +): """Send a job to a remote agent via HTTP API. 
Returns job_id or None.""" url = f"{agent_url.rstrip('/')}/api/run" body = { @@ -707,7 +751,10 @@ def cmd_run(args): agent_url = agents.get(platform, {}).get("url", "") if not agent_url: - print(f"error: no agent URL configured for platform {platform!r} (job {name})", file=sys.stderr) + print( + f"error: no agent URL configured for platform {platform!r} (job {name})", + file=sys.stderr, + ) sys.exit(1) jobs_to_dispatch.append((name, agent_url)) @@ -730,7 +777,11 @@ def cmd_run(args): file=sys.stderr, ) job_id = dispatch_remote_job( - agent_url, name, branch, commit_sha, args.image_tag, + agent_url, + name, + branch, + commit_sha, + args.image_tag, api_token=api_token or None, ) @@ -760,6 +811,20 @@ def cmd_run(args): f"<== {tag} {name} ({duration:.0f}s)", file=sys.stderr, ) + + error_tail = result.get("error_tail", []) + + if error_tail: + print( + f"--- error output (last {len(error_tail)} lines) ---", + file=sys.stderr, + ) + + for line in error_tail: + print(f" {line}", file=sys.stderr) + + print("---", file=sys.stderr) + results.append(result) else: print(f"<== TIMEOUT {name}", file=sys.stderr) @@ -875,7 +940,9 @@ def main(): type=Path, default=Path(__file__).resolve().parent / "config.yaml", ) - run_parser.add_argument("--branch", type=str, help="Branch to test (default: config repo.branch)") + run_parser.add_argument( + "--branch", type=str, help="Branch to test (default: config repo.branch)" + ) run_parser.add_argument("--job", type=str, help="Specific job name") run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") run_parser.add_argument("--image-tag", type=str, help="Override image tag") From 04424eac9688c32b0272c316eeaeacc1a2529a58 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 11:40:56 +0000 Subject: [PATCH 11/16] feat(ci): capture error output and improve CLI result display - Capture last 50 lines of Docker output via ring buffer so failed jobs return diagnostic info to the CLI client. 
- Store raw bytes during execution; decode only on the failure path. - Align job name columns in `<==` result lines for readability. - Show summary only when jobs fail, removing redundant all-pass output. Co-Authored-By: Claude --- .ci/agent.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/.ci/agent.py b/.ci/agent.py index 485221f..2fa9971 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -799,6 +799,9 @@ def cmd_run(args): for name, url, jid in dispatched } + # Collect name lengths for column alignment. + name_width = max(len(n) for n, _, _ in dispatched) + for future in as_completed(futures): name, _, _ = futures[future] result = future.result() @@ -808,7 +811,7 @@ def cmd_run(args): duration = result.get("duration_seconds", 0) tag = "PASS" if state == STATE_SUCCESS else "FAIL" print( - f"<== {tag} {name} ({duration:.0f}s)", + f"<== {tag} {name:<{name_width}} ({duration:.0f}s)", file=sys.stderr, ) @@ -827,25 +830,28 @@ def cmd_run(args): results.append(result) else: - print(f"<== TIMEOUT {name}", file=sys.stderr) + print( + f"<== TIMEOUT {name:<{name_width}}", + file=sys.stderr, + ) results.append({"job_name": name, "state": "timeout"}) - # Summary - print("\n========== Results ==========") - all_ok = True + # Summary: only print when there are failures. 
+ failed = [r for r in results if r.get("state") != STATE_SUCCESS] - for r in results: - state = r.get("state", "unknown") - name = r.get("job_name", "?") - status = "PASS" if state == STATE_SUCCESS else "FAIL" + if failed: + print("\n========== Failed ==========", file=sys.stderr) + name_width = max(len(r.get("job_name", "?")) for r in failed) - if state != STATE_SUCCESS: - all_ok = False - - duration = r.get("duration_seconds", 0) - print(f" {status} {name} ({duration:.0f}s)") + for r in failed: + name = r.get("job_name", "?") + state = r.get("state", "unknown") + duration = r.get("duration_seconds", 0) + print( + f" FAIL {name:<{name_width}} {state} ({duration:.0f}s)", + file=sys.stderr, + ) - if not all_ok: sys.exit(1) From a7fa544a84f7aeb49a6320045374dac9ab89ae68 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Tue, 24 Mar 2026 23:35:04 +0800 Subject: [PATCH 12/16] feat(ci): add Cambricon MLU platform CI support - Add .ci/images/cambricon/Dockerfile for AnolisOS-based Cambricon image - Add cambricon platform to config.yaml with MLU-style GPU passthrough - Add GPU_STYLE_MLU constant and MLU_VISIBLE_DEVICES support in run.py - Add cnmon-based GPU detection (_detect_gpus_cambricon) in ci_resource.py - Add --test CLI flag to override pytest test path at runtime - Skip empty stage run commands instead of erroring (compilation-only mode) - Fix _torch_gemm fallback for CPU float16/bfloat16 (upcast to float32) - Skip bfloat16 on MLU (cnnlBatchMatMulEx does not support it) - Hoist _PYTEST_VALUE_FLAGS to module level; add ValueError guard in cambricon parser - Remove redundant yaml import guard in agent.py (utils.py already handles it) Co-Authored-By: Claude Sonnet 4.6 --- .ci/agent.py | 18 ++--- .ci/ci_resource.py | 106 ++++++++++++++++++++----- .ci/config.yaml | 21 +++++ .ci/github_status.py | 4 +- .ci/images/cambricon/Dockerfile | 33 ++++++++ .ci/run.py | 86 +++++++++++++++++---- .ci/tests/test_agent.py | 132 ++++++++++++++++++++++++-------- 
.ci/tests/test_github_status.py | 7 +- .ci/tests/test_resource.py | 11 ++- tests/test_gemm.py | 12 ++- 10 files changed, 341 insertions(+), 89 deletions(-) create mode 100644 .ci/images/cambricon/Dockerfile diff --git a/.ci/agent.py b/.ci/agent.py index 2fa9971..3fb5d9e 100644 --- a/.ci/agent.py +++ b/.ci/agent.py @@ -29,14 +29,6 @@ from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - import ci_resource as res import github_status as gh import run @@ -920,11 +912,11 @@ def cmd_serve(args): f"Agent serving on {args.host}:{args.port} (platform={platform})", file=sys.stderr, ) - print(f" POST /webhook — GitHub webhook", file=sys.stderr) - print(f" POST /api/run — remote job trigger", file=sys.stderr) - print(f" GET /health — health check", file=sys.stderr) - print(f" GET /status — queue & resource status", file=sys.stderr) - print(f" GET /api/job/{{id}} — job status", file=sys.stderr) + print(" POST /webhook — GitHub webhook", file=sys.stderr) + print(" POST /api/run — remote job trigger", file=sys.stderr) + print(" GET /health — health check", file=sys.stderr) + print(" GET /status — queue & resource status", file=sys.stderr) + print(" GET /api/job/{id} — job status", file=sys.stderr) try: server.serve_forever() diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index a49cbff..bbf27ae 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -2,16 +2,18 @@ """Resource detection and allocation for CI Runner Agent.""" import json +import operator import os import re import shutil import subprocess import threading -from dataclasses import dataclass, field +from dataclasses import dataclass # GPU passthrough styles GPU_STYLE_NVIDIA = "nvidia" GPU_STYLE_NONE = "none" +GPU_STYLE_MLU = "mlu" @dataclass @@ -41,6 +43,7 @@ class ResourcePool: "iluvatar": "ixsmi", "metax": "mx-smi", "moore": 
"mthreads-gmi", + "cambricon": "cnmon", } def __init__(self, platform, utilization_threshold=10): @@ -66,6 +69,9 @@ def detect_gpus(self) -> list[GpuInfo]: if self._platform == "moore": return self._detect_gpus_moore() + if self._platform == "cambricon": + return self._detect_gpus_cambricon() + tool = self.GPU_QUERY_TOOLS.get(self._platform) if not tool: @@ -127,7 +133,9 @@ def run_mxsmi(flag): try: r = subprocess.run( ["mx-smi", flag], - capture_output=True, text=True, timeout=10, + capture_output=True, + text=True, + timeout=10, ) return r.stdout if r.returncode == 0 else "" except (FileNotFoundError, subprocess.TimeoutExpired): @@ -178,12 +186,14 @@ def run_mxsmi(flag): gpus = [] for idx in sorted(mem): used_mb, total_mb = mem[idx] - gpus.append(GpuInfo( - index=idx, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util.get(idx, 0.0), - )) + gpus.append( + GpuInfo( + index=idx, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util.get(idx, 0.0), + ) + ) return gpus def _detect_gpus_moore(self) -> list[GpuInfo]: @@ -206,6 +216,7 @@ def _detect_gpus_moore(self) -> list[GpuInfo]: } } """ + def extract_number(s): m = re.search(r"([\d.]+)", str(s)) return float(m.group(1)) if m else 0.0 @@ -242,16 +253,77 @@ def extract_number(s): gpu_data.get("Utilization", {}).get("Gpu", "0 %") ) - gpus.append(GpuInfo( - index=index, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util_pct, - )) + gpus.append( + GpuInfo( + index=index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + ) + ) except (ValueError, AttributeError): continue - return sorted(gpus, key=lambda g: g.index) + return sorted(gpus, key=operator.attrgetter("index")) + + def _detect_gpus_cambricon(self) -> list[GpuInfo]: + """Parse cnmon output for Cambricon MLU cards. 
+ + Each card appears as two consecutive data rows: + Row 1: | {card} {vf} {name} {fw} | {bus_id} | {util}% {ecc} | + Row 2: | {fan}% {temp} {pwr} | {mem_used} MiB/ {mem_total} MiB | ... | + """ + try: + result = subprocess.run( + ["cnmon"], + capture_output=True, + text=True, + timeout=10, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return [] + + if result.returncode != 0: + return [] + + gpus = [] + lines = result.stdout.splitlines() + i = 0 + + while i < len(lines): + line = lines[i] + # Row 1: "| {index} ... | {bus_id} | {util}% {ecc} |" + m1 = re.match(r"^\|\s+(\d+)\s+.*\|\s*([\d.]+)%", line) + + if m1 and i + 1 < len(lines): + try: + card_index = int(m1.group(1)) + util_pct = float(m1.group(2)) + row2 = lines[i + 1] + mem_m = re.search(r"([\d.]+)\s+MiB/\s*([\d.]+)\s+MiB", row2) + + if mem_m: + used_mb = float(mem_m.group(1)) + total_mb = float(mem_m.group(2)) + else: + used_mb, total_mb = 0.0, 0.0 + + gpus.append( + GpuInfo( + index=card_index, + memory_used_mb=used_mb, + memory_total_mb=total_mb, + utilization_pct=util_pct, + ) + ) + except (ValueError, AttributeError): + pass + i += 2 + continue + + i += 1 + + return sorted(gpus, key=operator.attrgetter("index")) def detect_system_resources(self) -> SystemResources: """Read system memory from /proc/meminfo and CPU count.""" @@ -278,9 +350,7 @@ def get_free_gpus(self) -> list[int]: """Return GPU indices with utilization below threshold.""" gpus = self.detect_gpus() return [ - g.index - for g in gpus - if g.utilization_pct < self._utilization_threshold + g.index for g in gpus if g.utilization_pct < self._utilization_threshold ] def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]: diff --git a/.ci/config.yaml b/.ci/config.yaml index 24b4006..2509b40 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -107,6 +107,27 @@ platforms: - name: test run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short 
--junitxml=/workspace/results/test-results.xml + cambricon: + image: + dockerfile: .ci/images/cambricon/ + build_args: + BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 + PIP_INDEX_URL: https://pypi.org/simple + docker_args: + - "--privileged" + setup: pip install .[dev] --no-build-isolation + jobs: + gpu: + resources: + gpu_ids: "0" + gpu_style: mlu # Cambricon MLU 通过 --privileged 透传,通过 MLU_VISIBLE_DEVICES 控制可见设备 + memory: 32GB + shm_size: 16g + timeout: 3600 + stages: + - name: test + run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + ascend: # TODO: Ascend image is not ready yet image: dockerfile: .ci/images/ascend/ diff --git a/.ci/github_status.py b/.ci/github_status.py index a7abb8f..f8f017f 100644 --- a/.ci/github_status.py +++ b/.ci/github_status.py @@ -59,7 +59,9 @@ def post_commit_status( return False if not owner or not repo or not sha: - print("warning: missing owner/repo/sha, skipping status update", file=sys.stderr) + print( + "warning: missing owner/repo/sha, skipping status update", file=sys.stderr + ) return False url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile new file mode 100644 index 0000000..f1282d9 --- /dev/null +++ b/.ci/images/cambricon/Dockerfile @@ -0,0 +1,33 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Python 3.10 executables (pip-installed tools) live under /usr/local/python3.10/bin. +ENV PATH=/usr/local/python3.10/bin:${PATH} + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy +ARG https_proxy +ARG no_proxy + +# git and cmake are pre-installed; coreutils-single covers coreutils needs. 
+RUN dnf install -y ninja-build && dnf clean all + +ARG PIP_INDEX_URL +RUN pip install --no-cache-dir \ + ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ + scikit-build-core \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + ruff==0.15.7 + +# Pin pre-installed Cambricon torch to prevent pip from replacing it with upstream version. +RUN pip show torch >/dev/null 2>&1 && \ + echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ + touch /etc/pip-constraints.txt +ENV PIP_CONSTRAINT=/etc/pip-constraints.txt + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py index 6c108e4..811ba2d 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -9,9 +9,52 @@ from datetime import datetime from pathlib import Path -from ci_resource import GPU_STYLE_NVIDIA, GPU_STYLE_NONE, detect_platform +from ci_resource import ( + GPU_STYLE_NVIDIA, + GPU_STYLE_NONE, + GPU_STYLE_MLU, + ResourcePool, + detect_platform, +) from utils import get_git_commit, load_config +# Flags that consume the next token as their value (e.g. -n 4, -k expr). +_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} + + +def apply_test_override(run_cmd, test_path): + """Replace positional test path(s) in a pytest stage command. + + For example: ``pytest tests/ -n 4 ...`` becomes + ``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is + ``tests/test_gemm.py``. + """ + parts = shlex.split(run_cmd) + + if not parts or parts[0] != "pytest": + return run_cmd + + result = ["pytest", test_path] + skip_next = False + + for p in parts[1:]: + if skip_next: + result.append(p) + skip_next = False + continue + + if p.startswith("-"): + result.append(p) + if p in _PYTEST_VALUE_FLAGS: + skip_next = True + continue + + # Skip existing test paths; the override is already in result[1]. 
+ if not ("/" in p or p.endswith(".py") or "::" in p): + result.append(p) + + return shlex.join(result) + def build_results_dir(base, platform, stages, commit): """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}`.""" @@ -57,7 +100,7 @@ def build_runner_script(): name="${!name_var}" cmd="${!cmd_var}" echo "========== Stage: $name ==========" - eval "$cmd" || failed=1 + [ -n "$cmd" ] && { eval "$cmd" || failed=1; } done echo "========== Summary ==========" if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then @@ -130,7 +173,7 @@ def build_docker_args( args.append("-e") args.append(f"STAGE_{i + 1}_NAME={s['name']}") args.append("-e") - args.append(f"STAGE_{i + 1}_CMD={s['run']}") + args.append(f"STAGE_{i + 1}_CMD={s.get('run', '')}") # Platform-specific device access for flag in job.get("docker_args", []): @@ -155,6 +198,10 @@ def build_docker_args( # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, # control visible GPUs via CUDA_VISIBLE_DEVICES. args.extend(["-e", f"CUDA_VISIBLE_DEVICES={gpu_id}"]) + elif gpu_style == GPU_STYLE_MLU and gpu_id and gpu_id != "all": + # For Cambricon MLU platforms that use --privileged, + # control visible devices via MLU_VISIBLE_DEVICES. 
+ args.extend(["-e", f"MLU_VISIBLE_DEVICES={gpu_id}"]) memory = resources.get("memory") @@ -195,7 +242,8 @@ def resolve_job_names(jobs, platform, job=None): if job: matches = [ - name for name, cfg in jobs.items() + name + for name, cfg in jobs.items() if cfg.get("platform") == platform and cfg.get("short_name") == job ] @@ -208,9 +256,7 @@ def resolve_job_names(jobs, platform, job=None): return matches - matches = [ - name for name, cfg in jobs.items() if cfg.get("platform") == platform - ] + matches = [name for name, cfg in jobs.items() if cfg.get("platform") == platform] if not matches: print(f"error: no jobs for platform {platform!r}", file=sys.stderr) @@ -227,7 +273,9 @@ def main(): default=Path(__file__).resolve().parent / "config.yaml", help="Path to config.yaml", ) - parser.add_argument("--branch", type=str, help="Override repo branch (default: config repo.branch)") + parser.add_argument( + "--branch", type=str, help="Override repo branch (default: config repo.branch)" + ) parser.add_argument( "--job", type=str, @@ -254,6 +302,11 @@ def main(): default=Path("ci-results"), help="Base directory for test results (default: ./ci-results)", ) + parser.add_argument( + "--test", + type=str, + help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', + ) parser.add_argument( "--dry-run", action="store_true", @@ -269,10 +322,8 @@ def main(): platform = detect_platform() if not platform: - print( - "error: could not detect platform (no nvidia-smi or ixsmi found)", - file=sys.stderr, - ) + tools = ", ".join(ResourcePool.GPU_QUERY_TOOLS.values()) + print(f"error: could not detect platform (no {tools} found)", file=sys.stderr) sys.exit(1) print(f"platform: {platform}", file=sys.stderr) @@ -294,11 +345,20 @@ def main(): stages = [s for s in all_stages if s["name"] == args.stage] if not stages: - print(f"error: stage {args.stage!r} not found in {job_name}", file=sys.stderr) + print( + f"error: stage {args.stage!r} not found in {job_name}", + file=sys.stderr, + ) sys.exit(1) else: stages = all_stages + if args.test: + stages = [ + {**s, "run": apply_test_override(s.get("run", ""), args.test)} + for s in stages + ] + job_platform = job.get("platform", platform) commit = get_git_commit() results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py index aa181c4..e51af2a 100644 --- a/.ci/tests/test_agent.py +++ b/.ci/tests/test_agent.py @@ -2,9 +2,8 @@ import hmac import json import threading -import time from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -81,7 +80,12 @@ def mock_resource_pool(): pool.platform = "nvidia" pool.allocate.return_value = ([0], True) pool.release.return_value = None - pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + pool.get_status.return_value = { + "platform": "nvidia", + "gpus": [], + "allocated_gpu_ids": [], + "system": {}, + } return pool @@ -115,7 +119,6 @@ def test_select_jobs_invalid_name(agent_config): agent.select_jobs(agent_config, job_name="not_exist") - # 
--------------------------------------------------------------------------- # verify_signature # --------------------------------------------------------------------------- @@ -171,13 +174,21 @@ def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch) monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, + agent_config, + "nvidia", + mock_resource_pool, results_dir=Path("/tmp/test-results"), - no_status=True, dry_run=True, + no_status=True, + dry_run=True, ) - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config, - results_dir=Path("/tmp/test-results")) - jid = scheduler.submit(req) + req = agent.JobRequest( + "nvidia_gpu", + "master", + "abc123", + agent_config, + results_dir=Path("/tmp/test-results"), + ) + scheduler.submit(req) results = scheduler.wait_all() assert len(results) == 1 assert results[0].state == "success" @@ -186,11 +197,19 @@ def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch) def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): pool = MagicMock(spec=res.ResourcePool) pool.allocate.return_value = ([], False) - pool.get_status.return_value = {"platform": "nvidia", "gpus": [], "allocated_gpu_ids": [], "system": {}} + pool.get_status.return_value = { + "platform": "nvidia", + "gpus": [], + "allocated_gpu_ids": [], + "system": {}, + } scheduler = agent.Scheduler( - agent_config, "nvidia", pool, - no_status=True, dry_run=False, + agent_config, + "nvidia", + pool, + no_status=True, + dry_run=False, ) req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) @@ -202,8 +221,11 @@ def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): def test_scheduler_get_status(agent_config, mock_resource_pool): scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + 
mock_resource_pool, + no_status=True, + dry_run=True, ) status = scheduler.get_status() @@ -256,11 +278,17 @@ def _urlopen_no_proxy(url_or_req, **kwargs): def test_health_endpoint(agent_config, mock_resource_pool): scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, + agent_config, + "nvidia", + mock_resource_pool, no_status=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", ) port = server.server_address[1] @@ -280,11 +308,18 @@ def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", results_dir=Path("/tmp/test-results"), ) port = server.server_address[1] @@ -314,12 +349,19 @@ def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) secret = "test-secret" server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", webhook_secret=secret, results_dir=Path("/tmp/test-results"), ) @@ -330,10 +372,12 @@ def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): import urllib.request - payload = json.dumps({ - "ref": "refs/heads/master", - "after": "abc123def456", - }).encode() + payload = json.dumps( + { + "ref": "refs/heads/master", + "after": 
"abc123def456", + } + ).encode() sig = "sha256=" + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() req = urllib.request.Request( @@ -356,11 +400,17 @@ def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): def test_webhook_invalid_signature(agent_config, mock_resource_pool): scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, + agent_config, + "nvidia", + mock_resource_pool, no_status=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", webhook_secret="real-secret", ) port = server.server_address[1] @@ -401,11 +451,18 @@ def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", api_token="my-secret-token", results_dir=Path("/tmp/test-results"), ) @@ -438,11 +495,18 @@ def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypat monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( - agent_config, "nvidia", mock_resource_pool, - no_status=True, dry_run=True, + agent_config, + "nvidia", + mock_resource_pool, + no_status=True, + dry_run=True, ) server = agent.AgentServer( - "127.0.0.1", 0, agent_config, scheduler, "nvidia", + "127.0.0.1", + 0, + agent_config, + scheduler, + "nvidia", api_token="my-secret-token", results_dir=Path("/tmp/test-results"), ) diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py index 0efa36e..edb2915 100644 --- a/.ci/tests/test_github_status.py +++ 
b/.ci/tests/test_github_status.py @@ -1,7 +1,6 @@ import json -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock -import pytest import github_status as gh @@ -57,7 +56,9 @@ def test_post_status_no_token(monkeypatch): def test_post_status_missing_owner(): - result = gh.post_commit_status("", "repo", "abc123", "success", "ctx", "desc", token="tok") + result = gh.post_commit_status( + "", "repo", "abc123", "success", "ctx", "desc", token="tok" + ) assert result is False diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py index b75043c..ac15b7e 100644 --- a/.ci/tests/test_resource.py +++ b/.ci/tests/test_resource.py @@ -1,6 +1,5 @@ import threading -import pytest import ci_resource as res @@ -11,13 +10,17 @@ def test_gpu_info_fields(): - g = res.GpuInfo(index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50) + g = res.GpuInfo( + index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50 + ) assert g.index == 0 assert g.memory_total_mb == 8000 def test_system_resources_fields(): - s = res.SystemResources(total_memory_mb=32000, available_memory_mb=16000, cpu_count=8) + s = res.SystemResources( + total_memory_mb=32000, available_memory_mb=16000, cpu_count=8 + ) assert s.cpu_count == 8 @@ -90,7 +93,7 @@ def test_detect_system_resources(monkeypatch, tmp_path): "MemAvailable: 20000000 kB\n" ) - import io + _real_open = open def fake_open(path, **kw): diff --git a/tests/test_gemm.py b/tests/test_gemm.py index 43a47b6..491fb47 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -48,6 +48,10 @@ def test_gemm( if device == "mlu" and (trans_a or trans_b): pytest.skip("transposing is not currently supported on MLU") + # cnnlBatchMatMulEx does not accept bfloat16 inputs on MLU. 
+ if device == "mlu" and dtype == torch.bfloat16: + pytest.skip("bfloat16 is not supported by cnnlBatchMatMulEx") + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device) @@ -97,8 +101,10 @@ def _torch_gemm(a, b, alpha=1.0, beta=1.0, trans_a=False, trans_b=False, c=None) return torch.baddbmm(c, a, b, beta=beta, alpha=alpha, out=c) except RuntimeError: - c_original = c.clone() - torch.matmul(a, b, out=c) - c.mul_(alpha).add_(c_original, alpha=beta) + # Fallback for backends that don't support addmm/baddbmm (e.g. CPU float16/bfloat16): + # compute in float32 and cast back. + c_original = c.float() + result = torch.matmul(a.float(), b.float()) + c.copy_((alpha * result + beta * c_original).to(c.dtype)) return c From 7253bcd47871c359271a69385ac33589c40c0fe4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 02:53:51 +0000 Subject: [PATCH 13/16] docs(ci): translate README and comments to English, use ngpus for NVIDIA scheduler - Rewrite README.md entirely in English; add Cambricon to platform table and directory tree. - Translate all inline comments in config.yaml to English. - Replace `gpu_ids: "0"` with `ngpus: 1` for NVIDIA platform so the scheduler auto-picks a free GPU rather than pinning to device 0. - Add `ngpus` support to `parse_gpu_requirement` in ci_resource.py so scheduler correctly counts NVIDIA GPU demand. - Replace deprecated `gpu_count` fallback with `ngpus` in run.py `build_docker_args`. 
Co-Authored-By: Claude --- .ci/README.md | 350 +++++++++++++++++++++++---------------------- .ci/ci_resource.py | 4 + .ci/config.yaml | 26 ++-- .ci/run.py | 6 +- 4 files changed, 199 insertions(+), 187 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index 12e8094..f468d90 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -1,21 +1,22 @@ -# .ci — CI 镜像与流水线 +# .ci — CI Images and Pipeline ``` .ci/ -├── config.yaml # 统一配置(镜像、job、Agent 定义) -├── utils.py # 共享工具(load_config、normalize_config、get_git_commit) -├── agent.py # Runner Agent(调度、Webhook、远程触发) -├── build.py # 镜像构建 -├── run.py # CI 流水线执行(Docker 层) -├── ci_resource.py # GPU/内存资源检测与分配 -├── github_status.py # GitHub Commit Status 上报 +├── config.yaml # Unified config (images, jobs, agent definitions) +├── utils.py # Shared utilities (load_config, normalize_config, get_git_commit) +├── agent.py # Runner Agent (scheduler, webhooks, remote dispatch) +├── build.py # Image builder +├── run.py # CI pipeline runner (Docker layer) +├── ci_resource.py # GPU/memory detection and allocation +├── github_status.py # GitHub Commit Status reporting ├── images/ │ ├── nvidia/Dockerfile │ ├── iluvatar/Dockerfile │ ├── metax/Dockerfile │ ├── moore/Dockerfile +│ ├── cambricon/Dockerfile │ └── ascend/Dockerfile -└── tests/ # 单元测试 +└── tests/ # Unit tests ├── conftest.py ├── test_agent.py ├── test_build.py @@ -25,14 +26,14 @@ └── test_utils.py ``` -**前置依赖**:Docker、Python 3.10+、`pip install pyyaml` +**Prerequisites**: Docker, Python 3.10+, `pip install pyyaml` --- -## 配置文件 `config.yaml` +## Configuration `config.yaml` -配置以 **platform** 为顶级结构,每个平台包含镜像定义、平台级默认值和 job 列表。 -加载时自动展平为 `{platform}_{job}` 格式(如 `nvidia_gpu`)。 +Config uses a **platform-centric** top-level structure. Each platform defines its image, platform-level defaults, and job list. +At load time, jobs are flattened to `{platform}_{job}` format (e.g., `nvidia_gpu`). 
```yaml repo: @@ -42,7 +43,7 @@ repo: github: status_context_prefix: "ci/infiniops" -agents: # 远程 Agent 地址(CLI 跨机器触发用) +agents: # Remote agent URLs (used by CLI for cross-machine dispatch) nvidia: url: http://nvidia-host:8080 iluvatar: @@ -50,13 +51,13 @@ agents: # 远程 Agent 地址(CLI 跨机器 platforms: nvidia: - image: # 镜像定义 + image: # Image definition dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 setup: pip install .[dev] --no-build-isolation jobs: - gpu: # 展平后为 nvidia_gpu + gpu: # Flattened as nvidia_gpu resources: gpu_ids: "0" # "0" | "0,2" | "all" memory: 32GB @@ -73,7 +74,7 @@ platforms: BASE_IMAGE: corex:qs_pj20250825 APT_MIRROR: http://archive.ubuntu.com/ubuntu PIP_INDEX_URL: https://pypi.org/simple - docker_args: # 平台级 docker 参数,所有 job 继承 + docker_args: # Platform-level docker args, inherited by all jobs - "--privileged" - "--cap-add=ALL" - "--pid=host" @@ -85,10 +86,10 @@ platforms: - /lib/modules:/lib/modules setup: pip install .[dev] --no-build-isolation jobs: - gpu: # 展平后为 iluvatar_gpu + gpu: # Flattened as iluvatar_gpu resources: gpu_ids: "0" - gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载 + gpu_style: none # CoreX: passthrough via --privileged + /dev mount memory: 32GB shm_size: 16g timeout: 3600 @@ -97,170 +98,171 @@ platforms: run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml ``` -### 配置层级说明 +### Config hierarchy -| 层级 | 字段 | 说明 | +| Level | Field | Description | |---|---|---| -| **平台级** | `image` | 镜像定义(dockerfile、build_args) | -| | `image_tag` | 默认镜像 tag(默认 `latest`) | -| | `docker_args` | 额外 docker run 参数(如 `--privileged`) | -| | `volumes` | 额外挂载卷 | -| | `setup` | 容器内 setup 命令 | -| | `env` | 注入容器环境变量 | -| **Job 级** | `resources.gpu_ids` | GPU 设备 ID | -| | `resources.gpu_style` | GPU 透传方式:`nvidia`(默认)或 `none` | -| | `resources.memory` | 容器内存限制 | -| | `resources.shm_size` | 共享内存大小 | -| | `resources.timeout` | 容器内脚本最大运行秒数 | -| | `stages` | 执行阶段列表 | -| | 以上平台级字段 | Job 
可覆盖任意平台级默认值 | +| **Platform** | `image` | Image definition (dockerfile, build_args) | +| | `image_tag` | Default image tag (defaults to `latest`) | +| | `docker_args` | Extra `docker run` args (e.g., `--privileged`) | +| | `volumes` | Extra volume mounts | +| | `setup` | In-container setup command | +| | `env` | Injected container env vars | +| **Job** | `resources.gpu_ids` | GPU device IDs | +| | `resources.gpu_style` | GPU passthrough: `nvidia` (default) or `none` | +| | `resources.memory` | Container memory limit | +| | `resources.shm_size` | Shared memory size | +| | `resources.timeout` | Max run time in seconds | +| | `stages` | Execution stage list | +| | Any platform field | Jobs can override any platform-level default | --- -## 镜像构建 `build.py` +## Image builder `build.py` -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | 构建平台,默认 `all` | -| `--commit` | 指定 commit ref 作为镜像 tag(默认 HEAD) | -| `--force` | 跳过 Dockerfile 变更检测 | -| `--dry-run` | 打印命令不执行 | +| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | Target platform (default: `all`) | +| `--commit` | Use specific commit ref as image tag (default: HEAD) | +| `--force` | Skip Dockerfile change detection | +| `--dry-run` | Print commands without executing | ```bash -# 检测变更后构建(无变更自动跳过) +# Build with change detection (skips if no Dockerfile changes) python .ci/build.py --platform nvidia -# 构建 Iluvatar 镜像 +# Build Iluvatar image python .ci/build.py --platform iluvatar --force -# 强制构建全部 +# Force build all platforms python .ci/build.py --force ``` -构建产物以宿主机本地镜像 tag 存储:`infiniops-ci/:` 和 `:latest`。 -代理、`no_proxy` 自动从宿主机环境变量透传到 `docker build`。 +Build artifacts are stored as local Docker image tags: `infiniops-ci/:` and `:latest`. +Proxy and `no_proxy` env vars are forwarded from the host to `docker build` automatically. 
-> `--push` 为预留功能,需在 `config.yaml` 中配置 `registry` 段后方可使用。 +> `--push` is reserved for future use; requires a `registry` section in `config.yaml`. --- -## 流水线执行 `run.py` +## Pipeline runner `run.py` -平台自动发现(通过检测 `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`),无需手动指定。 +Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi` on PATH), no manual specification needed. -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--config` | 配置文件路径(默认 `.ci/config.yaml`) | -| `--job` | job 名称:短名(`gpu`)或完整名(`nvidia_gpu`)。缺省运行当前平台所有 job | -| `--branch` | 覆盖克隆分支(默认读 config `repo.branch`) | -| `--stage` | 只运行指定 stage | -| `--image-tag` | 覆盖镜像 tag | -| `--gpu-id` | 覆盖 GPU 设备 ID(nvidia 通过 `--gpus`,其他平台通过 `CUDA_VISIBLE_DEVICES`) | -| `--results-dir` | 宿主机目录,挂载到容器 `/workspace/results` | -| `--dry-run` | 打印 docker 命令不执行 | +| `--config` | Config file path (default: `.ci/config.yaml`) | +| `--job` | Job name: short (`gpu`) or full (`nvidia_gpu`). Defaults to all jobs for the current platform | +| `--branch` | Override clone branch (default: config `repo.branch`) | +| `--stage` | Run only the specified stage | +| `--image-tag` | Override image tag | +| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) | +| `--results-dir` | Host directory mounted to `/workspace/results` inside the container | +| `--dry-run` | Print docker command without executing | ```bash -# 最简用法:自动检测平台,运行所有 job,使用 config 默认分支 +# Simplest usage: auto-detect platform, run all jobs, use config default branch python .ci/run.py -# 指定 job 短名 +# Specify short job name python .ci/run.py --job gpu -# 完整 job 名(向后兼容) +# Full job name (backward compatible) python .ci/run.py --job nvidia_gpu -# 只跑 test stage,预览命令 +# Run only the test stage, preview mode python .ci/run.py --job gpu --stage test --dry-run ``` -容器内执行流程:`git clone` → `checkout` → `setup` → stages。 -代理从宿主机透传,测试结果写入 `--results-dir`。每次运行均为干净环境(不挂载宿主机 pip 缓存)。 +Container execution flow: `git clone` → `checkout` → 
`setup` → stages. +Proxy vars are forwarded from the host. Test results are written to `--results-dir`. Each run uses a clean environment (no host pip cache mounted). --- -## 平台差异 +## Platform differences -| 平台 | GPU 透传方式 | 基础镜像 | 备注 | +| Platform | GPU passthrough | Base image | Notes | |---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | 标准 CUDA | -| Iluvatar | `--privileged` + `/dev` 挂载 | `corex:qs_pj20250825` | CoreX 运行时,CUDA 兼容 | -| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA 运行时,通过 `mx-smi` 检测 | -| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA 运行时,通过 `mthreads-gmi` 检测 | -| Ascend | TODO | `ascend-pytorch:24.0.0` | 待完善,镜像和 job 尚未就绪 | +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | Standard CUDA | +| Iluvatar | `--privileged` + `/dev` mount | `corex:qs_pj20250825` | CoreX runtime, CUDA compatible | +| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA runtime, detected via `mx-smi` | +| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA runtime, detected via `mthreads-gmi` | +| Cambricon | `--privileged` | `cambricon/pytorch:v1.25.3` | Neuware runtime, detected via `cnmon` | +| Ascend | TODO | `ascend-pytorch:24.0.0` | Not ready, image and jobs pending | --- ## Runner Agent `agent.py` -Runner Agent 支持 CLI 手动触发、GitHub Webhook 自动触发、资源感知的动态调度,以及跨机器远程触发。 +The Runner Agent supports CLI manual dispatch, GitHub webhook triggers, resource-aware dynamic scheduling, and cross-machine remote dispatch. 
-### CLI 手动执行 +### CLI manual execution ```bash -# 运行所有 job(分发到远程 Agent,使用 config 默认分支) +# Run all jobs (dispatched to remote agents, using config default branch) python .ci/agent.py run -# 指定分支 +# Specify branch python .ci/agent.py run --branch feat/xxx -# 运行指定 job +# Run a specific job python .ci/agent.py run --job nvidia_gpu -# 按平台运行 +# Filter by platform python .ci/agent.py run --platform nvidia -# 预览命令 +# Preview mode python .ci/agent.py run --dry-run ``` -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--branch` | 测试分支(默认读 config `repo.branch`) | -| `--job` | 指定 job 名称 | -| `--platform` | 按平台过滤 job | -| `--commit` | 覆盖 commit SHA | -| `--image-tag` | 覆盖镜像 tag | -| `--dry-run` | 预览模式 | +| `--branch` | Test branch (default: config `repo.branch`) | +| `--job` | Specific job name | +| `--platform` | Filter jobs by platform | +| `--commit` | Override commit SHA | +| `--image-tag` | Override image tag | +| `--dry-run` | Preview mode | -### Webhook 服务 +### Webhook server -每台平台机器部署一个 Agent 实例(平台自动发现): +Deploy one Agent instance per platform machine (platform is auto-detected): ```bash -# NVIDIA 机器 +# NVIDIA machine python .ci/agent.py serve --port 8080 -# Iluvatar 机器 +# Iluvatar machine python .ci/agent.py serve --port 8080 -# MetaX 机器 +# MetaX machine python .ci/agent.py serve --port 8080 ``` -`serve` 子命令额外参数: +Additional `serve` flags: -| 参数 | 说明 | +| Flag | Description | |---|---| -| `--port` | 监听端口(默认 8080) | -| `--host` | 监听地址(默认 `0.0.0.0`) | -| `--webhook-secret` | GitHub Webhook 签名密钥(或 `WEBHOOK_SECRET` 环境变量) | -| `--api-token` | `/api/run` Bearer 认证令牌(或 `AGENT_API_TOKEN` 环境变量) | -| `--results-dir` | 结果目录(默认 `ci-results`) | -| `--utilization-threshold` | GPU 空闲阈值百分比(默认 10) | - -| 端点 | 方法 | 说明 | +| `--port` | Listen port (default: 8080) | +| `--host` | Listen address (default: `0.0.0.0`) | +| `--webhook-secret` | GitHub webhook signing secret (or `WEBHOOK_SECRET` env var) | +| `--api-token` | `/api/run` Bearer auth token (or `AGENT_API_TOKEN` env var) | +| 
`--results-dir` | Results directory (default: `ci-results`) | +| `--utilization-threshold` | GPU idle threshold percentage (default: 10) | + +| Endpoint | Method | Description | |---|---|---| -| `/webhook` | POST | GitHub Webhook(push/pull_request) | -| `/api/run` | POST | 远程触发 job | -| `/api/job/{id}` | GET | 查询 job 状态 | -| `/health` | GET | 健康检查 | -| `/status` | GET | 队列 + 资源状态 | +| `/webhook` | POST | GitHub webhook (push/pull_request) | +| `/api/run` | POST | Remote job trigger | +| `/api/job/{id}` | GET | Query job status | +| `/health` | GET | Health check | +| `/status` | GET | Queue + resource status | -Webhook 支持 `X-Hub-Signature-256` 签名验证,通过 `--webhook-secret` 或 `WEBHOOK_SECRET` 环境变量配置。 +Webhook supports `X-Hub-Signature-256` signature verification via `--webhook-secret` or `WEBHOOK_SECRET` env var. -### 远程 Agent 配置 +### Remote agent configuration -在 `config.yaml` 中配置各平台 Agent 地址,CLI 执行时自动将远程 job 分发到对应 Agent: +Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote jobs to the corresponding agents: ```yaml agents: @@ -274,111 +276,111 @@ agents: url: http://moore-host:8080 ``` -### 资源调度 +### Resource scheduling -Agent 自动检测 GPU 利用率和系统内存,动态决定并行度: -- GPU 利用率 < 阈值(默认 10%)且未被 Agent 分配 → 可用 -- 资源不足时 job 自动排队,已完成 job 释放资源后自动调度排队任务 +The Agent auto-detects GPU utilization and system memory to dynamically determine parallelism: +- GPU utilization < threshold (default 10%) and not allocated by Agent → available +- When resources are insufficient, jobs are queued automatically; completed jobs release resources and trigger scheduling of queued tasks ### GitHub Status -设置 `GITHUB_TOKEN` 环境变量后,Agent 会自动上报 commit status: -- `pending` — job 开始执行 -- `success` / `failure` — job 执行完成 +Set the `GITHUB_TOKEN` env var and the Agent will automatically report commit status: +- `pending` — job started +- `success` / `failure` — job completed -Status context 格式:`ci/infiniops/{job_name}` +Status context format: `ci/infiniops/{job_name}` --- -## 多机部署指南 +## 
Multi-machine deployment guide -以 NVIDIA + Iluvatar + MetaX + Moore 多平台为例,说明如何在多台机器上部署 Agent 并实现跨平台并行测试。 +Example with NVIDIA + Iluvatar + MetaX + Moore multi-platform setup, showing how to deploy agents across machines for cross-platform parallel testing. -### 前置条件(所有机器共同) +### Prerequisites (all machines) ```bash -# 1. Python 3.10+ 和依赖 +# 1. Python 3.10+ and dependencies pip install pyyaml -# 2. Docker 已安装 +# 2. Docker installed docker --version -# 3. 克隆仓库 +# 3. Clone the repository git clone https://github.com/InfiniTensor/InfiniOps.git cd InfiniOps ``` -### NVIDIA 机器配置 +### NVIDIA machine setup ```bash -# 1. 安装 NVIDIA Container Toolkit -# 参考: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html +# 1. Install NVIDIA Container Toolkit +# See: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html -# 2. 验证 GPU 可见 +# 2. Verify GPU visibility nvidia-smi -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform nvidia ``` -### Iluvatar 机器配置 +### Iluvatar machine setup ```bash -# 1. 确认 CoreX 运行时已安装 +# 1. Verify CoreX runtime is installed ixsmi -# 2. 确认基础镜像已导入(非公开镜像,需提前准备) -docker images | grep corex # 应有 corex:qs_pj20250825 +# 2. Verify base image is imported (non-public, must be prepared in advance) +docker images | grep corex # Should show corex:qs_pj20250825 -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform iluvatar ``` -### MetaX 机器配置 +### MetaX machine setup ```bash -# 1. 确认 MACA 运行时已安装 +# 1. Verify MACA runtime is installed mx-smi -# 2. 确认基础镜像已导入(非公开镜像,需提前准备) -docker images | grep maca-pytorch # 应有 maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 +# 2. Verify base image is imported (non-public, must be prepared in advance) +docker images | grep maca-pytorch # Should show maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform metax ``` -### Moore 机器配置 +### Moore machine setup ```bash -# 1. 
确认 MUSA 运行时已安装 +# 1. Verify MUSA runtime is installed mthreads-gmi -# 2. 确认基础镜像已导入(非公开镜像,需提前准备) -docker images | grep vllm_musa # 应有 vllm_musa:20251112_hygon +# 2. Verify base image is imported (non-public, must be prepared in advance) +docker images | grep vllm_musa # Should show vllm_musa:20251112_hygon -# 3. 构建 CI 镜像 +# 3. Build CI image python .ci/build.py --platform moore ``` -### 启动 Agent 服务 +### Start Agent services -在各自机器上启动 Agent: +Start the Agent on each machine: ```bash -# NVIDIA 机器(平台自动发现) +# NVIDIA machine (platform auto-detected) python .ci/agent.py serve --port 8080 -# Iluvatar 机器(平台自动发现) +# Iluvatar machine (platform auto-detected) python .ci/agent.py serve --port 8080 -# MetaX 机器(平台自动发现) +# MetaX machine (platform auto-detected) python .ci/agent.py serve --port 8080 -# Moore 机器(平台自动发现) +# Moore machine (platform auto-detected) python .ci/agent.py serve --port 8080 ``` -验证连通性: +Verify connectivity: ```bash curl http://:8080/health @@ -387,9 +389,9 @@ curl http://:8080/health curl http://:8080/health ``` -### 配置远程 Agent 地址 +### Configure remote agent URLs -在触发端的 `config.yaml` 中添加 `agents` 段: +Add the `agents` section to `config.yaml` on the trigger machine: ```yaml agents: @@ -403,82 +405,82 @@ agents: url: http://:8080 ``` -### 触发跨平台测试 +### Trigger cross-platform tests ```bash -# 一键运行所有平台的 job(使用 config 默认分支) +# Run all platform jobs at once (using config default branch) python .ci/agent.py run -# 预览模式(不实际执行) +# Preview mode (no actual execution) python .ci/agent.py run --dry-run -# 只运行指定平台 +# Run only a specific platform python .ci/agent.py run --platform nvidia ``` -### 可选配置 +### Optional configuration -#### GitHub Status 上报 +#### GitHub Status reporting -所有机器均设置环境变量,各自上报所属平台的测试状态: +Set the env var on all machines so each reports its own platform's test status: ```bash export GITHUB_TOKEN=ghp_xxxxxxxxxxxx ``` -#### API Token 认证 +#### API Token authentication -Agent 暴露在非可信网络时,建议启用 Token 认证: +When agents are exposed on untrusted networks, enable 
token auth: ```bash -# 启动 Agent 时指定 token +# Specify token at startup python .ci/agent.py serve --port 8080 --api-token -# 或通过环境变量 +# Or via env var export AGENT_API_TOKEN= ``` -#### GitHub Webhook 自动触发 +#### GitHub Webhook auto-trigger -在 GitHub repo → Settings → Webhooks 中为每台机器添加 Webhook: +In GitHub repo → Settings → Webhooks, add a webhook for each machine: -| 字段 | 值 | +| Field | Value | |---|---| -| Payload URL | `http://<机器IP>:8080/webhook` | +| Payload URL | `http://:8080/webhook` | | Content type | `application/json` | -| Secret | 与 `--webhook-secret` 一致 | -| Events | `push` 和 `pull_request` | +| Secret | Must match `--webhook-secret` | +| Events | `push` and `pull_request` | -启动时配置 secret: +Configure the secret at startup: ```bash python .ci/agent.py serve --port 8080 --webhook-secret -# 或通过环境变量 +# Or via env var export WEBHOOK_SECRET= ``` -### 验证清单 +### Verification checklist ```bash -# 1. 各机器单独 dry-run +# 1. Dry-run each machine individually python .ci/agent.py run --platform nvidia --dry-run python .ci/agent.py run --platform iluvatar --dry-run python .ci/agent.py run --platform metax --dry-run python .ci/agent.py run --platform moore --dry-run -# 2. 健康检查 +# 2. Health checks curl http://:8080/health curl http://:8080/health curl http://:8080/health curl http://:8080/health -# 3. 查看资源状态 +# 3. Resource status curl http://:8080/status curl http://:8080/status curl http://:8080/status curl http://:8080/status -# 4. 跨平台一键测试 +# 4. 
Cross-platform test python .ci/agent.py run --branch master ``` diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py index bbf27ae..51b181f 100644 --- a/.ci/ci_resource.py +++ b/.ci/ci_resource.py @@ -429,6 +429,10 @@ def parse_gpu_requirement(job_config) -> int: if gpu_style == GPU_STYLE_NONE: return 0 + ngpus = resources.get("ngpus") + if ngpus is not None: + return int(ngpus) + gpu_ids = str(resources.get("gpu_ids", "")) if not gpu_ids: diff --git a/.ci/config.yaml b/.ci/config.yaml index 2509b40..3ac211d 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -3,13 +3,19 @@ repo: branch: master github: - status_context_prefix: "ci/infiniops" # GitHub Commit Status context 前缀 + status_context_prefix: "ci/infiniops" -# agents: # 远程 Agent 地址(CLI 跨机器触发用) +# agents: # nvidia: # url: http://nvidia-host:8080 # iluvatar: # url: http://iluvatar-host:8080 +# metax: +# url: http://metax-host:8080 +# moore: +# url: http://moore-host:8080 +# cambricon: +# url: http://cambricon-host:8080 platforms: nvidia: @@ -21,11 +27,11 @@ platforms: jobs: gpu: resources: - gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" + ngpus: 1 # Scheduler auto-picks this many free GPUs memory: 32GB - shm_size: 16g # 避免 PyTorch 默认 64MB SHMEM 不足 + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 - # env: # 可选,注入容器环境变量 + # env: # Optional: inject container env vars # MY_VAR: value stages: - name: test @@ -52,8 +58,8 @@ platforms: jobs: gpu: resources: - gpu_ids: "0" # 通过 CUDA_VISIBLE_DEVICES 控制可见 GPU - gpu_style: none # CoreX 设备通过 --privileged + /dev 挂载透传 + gpu_ids: "0" # GPU visibility via CUDA_VISIBLE_DEVICES + gpu_style: none # CoreX: passthrough via --privileged + /dev mount memory: 32GB shm_size: 16g timeout: 3600 @@ -77,7 +83,7 @@ platforms: gpu: resources: gpu_ids: "0" - gpu_style: none # MetaX 设备通过 --privileged 透传,无需 CUDA_VISIBLE_DEVICES + gpu_style: none # MetaX: passthrough via --privileged, no CUDA_VISIBLE_DEVICES memory: 32GB shm_size: 16g timeout: 3600 @@ -99,7 +105,7 
@@ platforms: gpu: resources: gpu_ids: "0" - gpu_style: none # Moore 设备通过 --privileged 透传,MTHREADS_VISIBLE_DEVICES 由基础镜像设置 + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES set by base image memory: 32GB shm_size: 16g timeout: 3600 @@ -120,7 +126,7 @@ platforms: gpu: resources: gpu_ids: "0" - gpu_style: mlu # Cambricon MLU 通过 --privileged 透传,通过 MLU_VISIBLE_DEVICES 控制可见设备 + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control memory: 32GB shm_size: 16g timeout: 3600 diff --git a/.ci/run.py b/.ci/run.py index 811ba2d..969336d 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -183,7 +183,7 @@ def build_docker_args( args.extend(["-v", vol]) gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) - gpu_count = resources.get("gpu_count", 0) + ngpus = resources.get("ngpus") gpu_style = resources.get("gpu_style", GPU_STYLE_NVIDIA) if gpu_style == GPU_STYLE_NVIDIA: @@ -192,8 +192,8 @@ def build_docker_args( args.extend(["--gpus", "all"]) else: args.extend(["--gpus", f'"device={gpu_id}"']) - elif gpu_count and gpu_count > 0: - args.extend(["--gpus", f"count={gpu_count}"]) + elif ngpus: + args.extend(["--gpus", f"count={ngpus}"]) elif gpu_style == GPU_STYLE_NONE and gpu_id and gpu_id != "all": # For platforms like Iluvatar/CoreX that use --privileged + /dev mount, # control visible GPUs via CUDA_VISIBLE_DEVICES. 
From b4a43d55742be330b4dac0147d4a34b555fb92c4 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 07:23:37 +0000 Subject: [PATCH 14/16] feat(ci): add --local flag to run.py for testing uncommitted changes - Mount current directory read-only into container via `-v cwd:/workspace/repo:ro` - Copy to writable `/tmp/src` inside container before setup runs, so host files are never modified by pip install or build artifacts - Simplify README: fix ngpus example, add gpu_style column, add --local docs Co-Authored-By: Claude --- .ci/README.md | 200 +++++++++++++------------------------------------- .ci/run.py | 23 +++++- 2 files changed, 70 insertions(+), 153 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index f468d90..190d012 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -59,7 +59,7 @@ platforms: jobs: gpu: # Flattened as nvidia_gpu resources: - gpu_ids: "0" # "0" | "0,2" | "all" + ngpus: 1 # Scheduler auto-picks this many free GPUs memory: 32GB shm_size: 16g timeout: 3600 @@ -108,8 +108,9 @@ platforms: | | `volumes` | Extra volume mounts | | | `setup` | In-container setup command | | | `env` | Injected container env vars | -| **Job** | `resources.gpu_ids` | GPU device IDs | -| | `resources.gpu_style` | GPU passthrough: `nvidia` (default) or `none` | +| **Job** | `resources.ngpus` | Number of GPUs — scheduler auto-picks free ones (NVIDIA only) | +| | `resources.gpu_ids` | Static GPU device IDs (e.g., `"0"`, `"0,2"`) | +| | `resources.gpu_style` | GPU passthrough: `nvidia` (default), `none`, or `mlu` | | | `resources.memory` | Container memory limit | | | `resources.shm_size` | Shared memory size | | | `resources.timeout` | Max run time in seconds | @@ -147,7 +148,7 @@ Proxy and `no_proxy` env vars are forwarded from the host to `docker build` auto ## Pipeline runner `run.py` -Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi` on PATH), no manual specification needed. 
+Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnmon` on PATH), no manual specification needed. | Flag | Description | |---|---| @@ -157,7 +158,9 @@ Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi` on P | `--stage` | Run only the specified stage | | `--image-tag` | Override image tag | | `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) | +| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) | | `--results-dir` | Host directory mounted to `/workspace/results` inside the container | +| `--local` | Mount current directory (read-only) instead of cloning from git | | `--dry-run` | Print docker command without executing | ```bash @@ -172,23 +175,29 @@ python .ci/run.py --job nvidia_gpu # Run only the test stage, preview mode python .ci/run.py --job gpu --stage test --dry-run + +# Test local uncommitted changes without pushing +python .ci/run.py --local ``` Container execution flow: `git clone` → `checkout` → `setup` → stages. +With `--local`, the current directory is mounted read-only at `/workspace/repo` and copied to a writable temp directory inside the container before setup runs — host files are never modified. Proxy vars are forwarded from the host. Test results are written to `--results-dir`. Each run uses a clean environment (no host pip cache mounted). 
--- ## Platform differences -| Platform | GPU passthrough | Base image | Notes | -|---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvcr.io/nvidia/pytorch:24.10-py3` | Standard CUDA | -| Iluvatar | `--privileged` + `/dev` mount | `corex:qs_pj20250825` | CoreX runtime, CUDA compatible | -| MetaX | `--privileged` | `maca-pytorch:3.2.1.4` | MACA runtime, detected via `mx-smi` | -| Moore | `--privileged` | `vllm_musa:20251112_hygon` | MUSA runtime, detected via `mthreads-gmi` | -| Cambricon | `--privileged` | `cambricon/pytorch:v1.25.3` | Neuware runtime, detected via `cnmon` | -| Ascend | TODO | `ascend-pytorch:24.0.0` | Not ready, image and jobs pending | +| Platform | GPU passthrough | `gpu_style` | Base image | Detection tool | +|---|---|---|---|---| +| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | `nvidia` (default) | `nvcr.io/nvidia/pytorch:24.10-py3` | `nvidia-smi` | +| Iluvatar | `--privileged` + `/dev` mount | `none` | `corex:qs_pj20250825` | `ixsmi` | +| MetaX | `--privileged` | `none` | `maca-pytorch:3.2.1.4-...` | `mx-smi` | +| Moore | `--privileged` | `none` | `vllm_musa:20251112_hygon` | `mthreads-gmi` | +| Cambricon | `--privileged` | `mlu` | `cambricon/pytorch:v1.25.3` | `cnmon` | +| Ascend | TODO | — | `ascend-pytorch:24.0.0` | — | + +`gpu_style` controls the Docker device injection mechanism: `nvidia` uses `--gpus`, `none` uses `CUDA_VISIBLE_DEVICES` (or skips injection for Moore), `mlu` uses `MLU_VISIBLE_DEVICES`. 
 ---
 
@@ -220,22 +229,15 @@ python .ci/agent.py run --dry-run
 | `--branch` | Test branch (default: config `repo.branch`) |
 | `--job` | Specific job name |
 | `--platform` | Filter jobs by platform |
-| `--commit` | Override commit SHA |
+| `--commit` | Override commit SHA used for GitHub status reporting |
 | `--image-tag` | Override image tag |
 | `--dry-run` | Preview mode |
 
 ### Webhook server
 
-Deploy one Agent instance per platform machine (platform is auto-detected):
+Deploy one Agent instance per platform machine (platform is auto-detected). On each machine:
 
 ```bash
-# NVIDIA machine
-python .ci/agent.py serve --port 8080
-
-# Iluvatar machine
-python .ci/agent.py serve --port 8080
-
-# MetaX machine
 python .ci/agent.py serve --port 8080
 ```
 
@@ -267,13 +269,13 @@ Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote j
 ```yaml
 agents:
   nvidia:
-    url: http://nvidia-host:8080
+    url: http://<nvidia-ip>:8080
   iluvatar:
-    url: http://iluvatar-host:8080
+    url: http://<iluvatar-ip>:8080
   metax:
-    url: http://metax-host:8080
+    url: http://<metax-ip>:8080
   moore:
-    url: http://moore-host:8080
+    url: http://<moore-ip>:8080
 ```
 
 ### Resource scheduling
@@ -294,116 +296,28 @@ Status context format: `ci/infiniops/{job_name}`
 
 ## Multi-machine deployment guide
 
-Example with NVIDIA + Iluvatar + MetaX + Moore multi-platform setup, showing how to deploy agents across machines for cross-platform parallel testing.
-
-### Prerequisites (all machines)
-
-```bash
-# 1. Python 3.10+ and dependencies
-pip install pyyaml
-
-# 2. Docker installed
-docker --version
-
-# 3. Clone the repository
-git clone https://github.com/InfiniTensor/InfiniOps.git
-cd InfiniOps
-```
-
-### NVIDIA machine setup
-
-```bash
-# 1. Install NVIDIA Container Toolkit
-# See: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
-
-# 2. Verify GPU visibility
-nvidia-smi
-
-# 3. Build CI image
-python .ci/build.py --platform nvidia
-```
-
-### Iluvatar machine setup
-
-```bash
-# 1. 
Verify CoreX runtime is installed -ixsmi - -# 2. Verify base image is imported (non-public, must be prepared in advance) -docker images | grep corex # Should show corex:qs_pj20250825 - -# 3. Build CI image -python .ci/build.py --platform iluvatar -``` - -### MetaX machine setup - -```bash -# 1. Verify MACA runtime is installed -mx-smi - -# 2. Verify base image is imported (non-public, must be prepared in advance) -docker images | grep maca-pytorch # Should show maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 - -# 3. Build CI image -python .ci/build.py --platform metax -``` +### Per-platform setup -### Moore machine setup - -```bash -# 1. Verify MUSA runtime is installed -mthreads-gmi +Each machine needs Docker installed, the platform runtime, and the base CI image built. -# 2. Verify base image is imported (non-public, must be prepared in advance) -docker images | grep vllm_musa # Should show vllm_musa:20251112_hygon - -# 3. Build CI image -python .ci/build.py --platform moore -``` +| Platform | Runtime check | Base image | Build command | +|---|---|---|---| +| NVIDIA | `nvidia-smi` (+ [Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) | `nvcr.io/nvidia/pytorch:24.10-py3` (public) | `python .ci/build.py --platform nvidia` | +| Iluvatar | `ixsmi` | `corex:qs_pj20250825` (import in advance) | `python .ci/build.py --platform iluvatar` | +| MetaX | `mx-smi` | `maca-pytorch:3.2.1.4-...` (import in advance) | `python .ci/build.py --platform metax` | +| Moore | `mthreads-gmi` | `vllm_musa:20251112_hygon` (import in advance) | `python .ci/build.py --platform moore` | ### Start Agent services -Start the Agent on each machine: +On each machine (platform is auto-detected): ```bash -# NVIDIA machine (platform auto-detected) -python .ci/agent.py serve --port 8080 - -# Iluvatar machine (platform auto-detected) -python .ci/agent.py serve --port 8080 - -# MetaX machine (platform auto-detected) -python .ci/agent.py serve 
--port 8080
-
-# Moore machine (platform auto-detected)
 python .ci/agent.py serve --port 8080
 ```
 
-Verify connectivity:
-
-```bash
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-```
-
 ### Configure remote agent URLs
 
-Add the `agents` section to `config.yaml` on the trigger machine:
-
-```yaml
-agents:
-  nvidia:
-    url: http://:8080
-  iluvatar:
-    url: http://:8080
-  metax:
-    url: http://:8080
-  moore:
-    url: http://:8080
-```
+On the trigger machine, add the `agents` section to `config.yaml` (see [Remote agent configuration](#remote-agent-configuration) above for the format).
 
 ### Trigger cross-platform tests
 
@@ -433,11 +347,8 @@ export GITHUB_TOKEN=ghp_xxxxxxxxxxxx
 When agents are exposed on untrusted networks, enable token auth:
 
 ```bash
-# Specify token at startup
-python .ci/agent.py serve --port 8080 --api-token 
-
-# Or via env var
-export AGENT_API_TOKEN=
+python .ci/agent.py serve --port 8080 --api-token <token>
+# Or: export AGENT_API_TOKEN=<token>
 ```
 
 #### GitHub Webhook auto-trigger
@@ -451,36 +362,25 @@ In GitHub repo → Settings → Webhooks, add a webhook for each machine:
 | Secret | Must match `--webhook-secret` |
 | Events | `push` and `pull_request` |
 
-Configure the secret at startup:
-
 ```bash
 python .ci/agent.py serve --port 8080 --webhook-secret 
-
-# Or via env var
-export WEBHOOK_SECRET=
+# Or: export WEBHOOK_SECRET=<secret>
 ```
 
 ### Verification checklist
 
 ```bash
 # 1. Dry-run each machine individually
-python .ci/agent.py run --platform nvidia --dry-run
-python .ci/agent.py run --platform iluvatar --dry-run
-python .ci/agent.py run --platform metax --dry-run
-python .ci/agent.py run --platform moore --dry-run
-
-# 2. Health checks
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-curl http://:8080/health
-
-# 3. Resource status
-curl http://:8080/status
-curl http://:8080/status
-curl http://:8080/status
-curl http://:8080/status
-
-# 4. 
Cross-platform test
+for platform in nvidia iluvatar metax moore; do
+    python .ci/agent.py run --platform $platform --dry-run
+done
+
+# 2. Health and resource checks
+for ip in <nvidia-ip> <iluvatar-ip> <metax-ip> <moore-ip>; do
+    curl http://$ip:8080/health
+    curl http://$ip:8080/status
+done
+
+# 3. Cross-platform test
 python .ci/agent.py run --branch master
 ```
diff --git a/.ci/run.py b/.ci/run.py
index 969336d..24a8867 100644
--- a/.ci/run.py
+++ b/.ci/run.py
@@ -87,9 +87,14 @@ def build_runner_script():
 set -e
 cd /workspace
 mkdir -p /workspace/results
-git clone "$REPO_URL" repo
-cd repo
-git checkout "$BRANCH"
+if [ -n "$LOCAL_SRC" ]; then
+    cp -r "$LOCAL_SRC" /tmp/src
+    cd /tmp/src
+else
+    git clone "$REPO_URL" repo
+    cd repo
+    git checkout "$BRANCH"
+fi
 echo "========== Setup =========="
 eval "$SETUP_CMD"
 set +e
@@ -120,6 +125,7 @@ def build_docker_args(
     image_tag_override,
     gpu_id_override=None,
     results_dir=None,
+    local_path=None,
 ):
     job = config["jobs"][job_name]
     platform = job.get("platform", "nvidia")
@@ -169,6 +175,10 @@ def build_docker_args(
     if results_dir:
         args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"])
 
+    if local_path:
+        args.extend(["-v", f"{local_path}:/workspace/repo:ro"])
+        args.extend(["-e", "LOCAL_SRC=/workspace/repo"])
+
     for i, s in enumerate(stages):
         args.append("-e")
         args.append(f"STAGE_{i + 1}_NAME={s['name']}")
@@ -307,6 +317,11 @@ def main():
         type=str,
         help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', ) + parser.add_argument( + "--local", + action="store_true", + help="Mount current directory (read-only) into the container instead of cloning from git", + ) parser.add_argument( "--dry-run", action="store_true", @@ -363,6 +378,7 @@ def main(): commit = get_git_commit() results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) + local_path = Path.cwd().resolve() if args.local else None docker_args = build_docker_args( config, job_name, @@ -373,6 +389,7 @@ def main(): args.image_tag, gpu_id_override=args.gpu_id, results_dir=results_dir, + local_path=local_path, ) if args.dry_run: From fab00e233dbd7eb9b21c72ee1b67b0a601b5d02c Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 08:46:46 +0000 Subject: [PATCH 15/16] style(ci): normalize comments to complete English sentences with markdown - Backtick-quote tool/package names (`torch`, `pip`, `git`, `cmake`, `coreutils-single`, `conda`) and paths in Dockerfile comments. - Add explanatory comment to the commented-out `agents:` block in `config.yaml` describing when to uncomment it. - Convert all section-header banners in `.ci/tests/` to "Tests for `FunctionName`." sentence form; fix three docstrings in `test_agent.py`. - Backtick-quote identifiers in `tests/test_gemm.py` inline comments. 
Co-Authored-By: Claude --- .ci/config.yaml | 5 ++++- .ci/images/cambricon/Dockerfile | 6 +++--- .ci/images/iluvatar/Dockerfile | 6 +++--- .ci/images/metax/Dockerfile | 4 ++-- .ci/images/moore/Dockerfile | 4 ++-- .ci/images/nvidia/Dockerfile | 2 +- .ci/tests/test_agent.py | 22 +++++++++++----------- .ci/tests/test_build.py | 8 ++++---- .ci/tests/test_github_status.py | 6 +++--- .ci/tests/test_resource.py | 18 +++++++++--------- .ci/tests/test_run.py | 18 +++++++++--------- .ci/tests/test_utils.py | 2 +- tests/test_gemm.py | 4 ++-- 13 files changed, 54 insertions(+), 51 deletions(-) diff --git a/.ci/config.yaml b/.ci/config.yaml index 3ac211d..b70e7df 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -5,6 +5,9 @@ repo: github: status_context_prefix: "ci/infiniops" +# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote +# machines via `agent.py run`. Required on the trigger machine when each platform's +# agent runs on a separate host. See the README for multi-machine deployment details. # agents: # nvidia: # url: http://nvidia-host:8080 @@ -31,7 +34,7 @@ platforms: memory: 32GB shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 - # env: # Optional: inject container env vars + # env: # Uncomment to inject extra env vars into the container. # MY_VAR: value stages: - name: test diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile index f1282d9..138f3cb 100644 --- a/.ci/images/cambricon/Dockerfile +++ b/.ci/images/cambricon/Dockerfile @@ -1,7 +1,7 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} -# Python 3.10 executables (pip-installed tools) live under /usr/local/python3.10/bin. +# Python 3.10 executables (`pip`-installed tools) live under `/usr/local/python3.10/bin`. ENV PATH=/usr/local/python3.10/bin:${PATH} ARG HTTP_PROXY @@ -11,7 +11,7 @@ ARG http_proxy ARG https_proxy ARG no_proxy -# git and cmake are pre-installed; coreutils-single covers coreutils needs. 
+# `git` and `cmake` are pre-installed; `coreutils-single` covers coreutils needs. RUN dnf install -y ninja-build && dnf clean all ARG PIP_INDEX_URL @@ -24,7 +24,7 @@ RUN pip install --no-cache-dir \ pytest-xdist \ ruff==0.15.7 -# Pin pre-installed Cambricon torch to prevent pip from replacing it with upstream version. +# Pin pre-installed Cambricon `torch` to prevent `pip` from replacing it with upstream version. RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile index f098e5f..79afc85 100644 --- a/.ci/images/iluvatar/Dockerfile +++ b/.ci/images/iluvatar/Dockerfile @@ -3,8 +3,8 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive -# CoreX runtime environment (base image sets these in /etc/bash.bashrc, -# but docker build RUN uses /bin/sh which doesn't source it) +# CoreX runtime environment (base image sets these in `/etc/bash.bashrc`, +# but `docker build` `RUN` uses `/bin/sh` which doesn't source it). ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib @@ -44,7 +44,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ RUN pip config set global.index-url https://pypi.org/simple -# Pin pre-installed CoreX torch to prevent pip from replacing it with upstream version +# Pin pre-installed CoreX `torch` to prevent `pip` from replacing it with upstream version. 
RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile index fda527c..540bc9d 100644 --- a/.ci/images/metax/Dockerfile +++ b/.ci/images/metax/Dockerfile @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive -# conda Python is used in this image +# `conda` Python is used in this image. ENV PATH=/opt/conda/bin:${PATH} ARG HTTP_PROXY @@ -37,7 +37,7 @@ RUN pip install --no-cache-dir \ pyyaml \ ruff==0.15.7 -# Pin pre-installed MetaX torch to prevent pip from replacing it with upstream version +# Pin pre-installed MetaX `torch` to prevent `pip` from replacing it with upstream version. RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile index 9a073ba..a95d9bd 100644 --- a/.ci/images/moore/Dockerfile +++ b/.ci/images/moore/Dockerfile @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive -# MUSA_HOME, PATH, LD_LIBRARY_PATH already set by base image +# `MUSA_HOME`, `PATH`, `LD_LIBRARY_PATH` already set by base image. ARG HTTP_PROXY ARG HTTPS_PROXY @@ -31,7 +31,7 @@ RUN pip install --no-cache-dir \ pytest-xdist \ ruff==0.15.7 -# Pin pre-installed torch to prevent pip from replacing it with upstream version +# Pin pre-installed `torch` to prevent `pip` from replacing it with upstream version. 
RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt ENV PIP_CONSTRAINT=/etc/pip-constraints.txt diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile index 05da963..b4984da 100644 --- a/.ci/images/nvidia/Dockerfile +++ b/.ci/images/nvidia/Dockerfile @@ -37,7 +37,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ pyyaml \ ruff==0.15.7 -# Pin pre-installed torch to prevent pip from replacing it with a different version +# Pin pre-installed `torch` to prevent `pip` from replacing it with a different version. RUN pip show torch >/dev/null 2>&1 && \ echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ touch /etc/pip-constraints.txt diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py index e51af2a..73708db 100644 --- a/.ci/tests/test_agent.py +++ b/.ci/tests/test_agent.py @@ -13,7 +13,7 @@ # --------------------------------------------------------------------------- -# Fixtures +# Test fixtures. # --------------------------------------------------------------------------- @@ -90,7 +90,7 @@ def mock_resource_pool(): # --------------------------------------------------------------------------- -# select_jobs +# Tests for `select_jobs`. # --------------------------------------------------------------------------- @@ -120,7 +120,7 @@ def test_select_jobs_invalid_name(agent_config): # --------------------------------------------------------------------------- -# verify_signature +# Tests for `verify_signature`. # --------------------------------------------------------------------------- @@ -140,7 +140,7 @@ def test_verify_signature_empty(): # --------------------------------------------------------------------------- -# JobRequest / JobResult +# Tests for `JobRequest` and `JobResult`. 
# --------------------------------------------------------------------------- @@ -165,7 +165,7 @@ def test_job_result_failure(): # --------------------------------------------------------------------------- -# Scheduler +# Tests for the `Scheduler` class. # --------------------------------------------------------------------------- @@ -236,7 +236,7 @@ def test_scheduler_get_status(agent_config, mock_resource_pool): # --------------------------------------------------------------------------- -# WebhookHandler — push event parsing +# Tests for `WebhookHandler` push event parsing. # --------------------------------------------------------------------------- @@ -264,12 +264,12 @@ def test_webhook_parse_pr(): # --------------------------------------------------------------------------- -# Integration-style: webhook HTTP test +# Integration-style webhook HTTP tests. # --------------------------------------------------------------------------- def _urlopen_no_proxy(url_or_req, **kwargs): - """urlopen that bypasses any HTTP_PROXY.""" + """`urlopen` mock that bypasses any `HTTP_PROXY`.""" import urllib.request opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) @@ -442,12 +442,12 @@ def test_webhook_invalid_signature(agent_config, mock_resource_pool): # --------------------------------------------------------------------------- -# API token authentication +# Tests for API token authentication. 
# --------------------------------------------------------------------------- def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): - """When api_token is set, /api/run rejects requests without valid token.""" + """When `api_token` is set, `/api/run` rejects requests without a valid token.""" monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( @@ -491,7 +491,7 @@ def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): - """When api_token is set, /api/run accepts requests with correct Bearer token.""" + """When `api_token` is set, `/api/run` accepts requests with a correct Bearer token.""" monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) scheduler = agent.Scheduler( diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py index fa2f292..4d28885 100644 --- a/.ci/tests/test_build.py +++ b/.ci/tests/test_build.py @@ -2,7 +2,7 @@ # --------------------------------------------------------------------------- -# build_image_tag +# Tests for `build_image_tag`. # --------------------------------------------------------------------------- @@ -24,7 +24,7 @@ def test_build_image_tag_commit_hash(): # --------------------------------------------------------------------------- -# has_dockerfile_changed +# Tests for `has_dockerfile_changed`. # --------------------------------------------------------------------------- @@ -54,7 +54,7 @@ def test_has_dockerfile_changed_true_on_git_error(mocker): # --------------------------------------------------------------------------- -# docker_login +# Tests for `docker_login`. 
# --------------------------------------------------------------------------- @@ -99,7 +99,7 @@ def test_docker_login_success(mocker, monkeypatch): # --------------------------------------------------------------------------- -# build_image — dry_run and proxy +# Tests for `build_image` dry-run mode and proxy forwarding. # --------------------------------------------------------------------------- diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py index edb2915..9e29c79 100644 --- a/.ci/tests/test_github_status.py +++ b/.ci/tests/test_github_status.py @@ -6,7 +6,7 @@ # --------------------------------------------------------------------------- -# parse_repo_url +# Tests for `parse_repo_url`. # --------------------------------------------------------------------------- @@ -35,7 +35,7 @@ def test_parse_repo_url_invalid(): # --------------------------------------------------------------------------- -# build_status_context +# Tests for `build_status_context`. # --------------------------------------------------------------------------- @@ -45,7 +45,7 @@ def test_build_status_context(): # --------------------------------------------------------------------------- -# post_commit_status +# Tests for `post_commit_status`. # --------------------------------------------------------------------------- diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py index ac15b7e..0db3fbb 100644 --- a/.ci/tests/test_resource.py +++ b/.ci/tests/test_resource.py @@ -5,7 +5,7 @@ # --------------------------------------------------------------------------- -# GpuInfo / SystemResources +# Tests for `GpuInfo` and `SystemResources`. # --------------------------------------------------------------------------- @@ -25,7 +25,7 @@ def test_system_resources_fields(): # --------------------------------------------------------------------------- -# detect_gpus +# Tests for `detect_gpus`. 
# --------------------------------------------------------------------------- @@ -81,7 +81,7 @@ def mock_run(cmd, **kwargs): # --------------------------------------------------------------------------- -# detect_system_resources +# Tests for `detect_system_resources`. # --------------------------------------------------------------------------- @@ -111,7 +111,7 @@ def fake_open(path, **kw): # --------------------------------------------------------------------------- -# get_free_gpus +# Tests for `get_free_gpus`. # --------------------------------------------------------------------------- @@ -135,7 +135,7 @@ class R: # --------------------------------------------------------------------------- -# allocate / release +# Tests for `allocate` and `release`. # --------------------------------------------------------------------------- @@ -200,11 +200,11 @@ class R: assert ok is True assert len(gpu_ids) == 2 - # All GPUs allocated, next allocation should fail + # All GPUs allocated; next allocation should fail. _, ok2 = pool.allocate(1) assert ok2 is False - # Release one + # Release one GPU. pool.release([gpu_ids[0]]) gpu_ids2, ok3 = pool.allocate(1) assert ok3 is True @@ -267,7 +267,7 @@ def allocate_one(): # --------------------------------------------------------------------------- -# get_status +# Tests for `get_status`. # --------------------------------------------------------------------------- @@ -291,7 +291,7 @@ class R: # --------------------------------------------------------------------------- -# parse_gpu_requirement / parse_memory_requirement +# Tests for `parse_gpu_requirement` and `parse_memory_requirement`. 
# --------------------------------------------------------------------------- diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py index 075546e..93987e5 100644 --- a/.ci/tests/test_run.py +++ b/.ci/tests/test_run.py @@ -6,7 +6,7 @@ # --------------------------------------------------------------------------- -# resolve_image +# Tests for `resolve_image`. # --------------------------------------------------------------------------- @@ -22,7 +22,7 @@ def test_resolve_image_without_registry(minimal_config): # --------------------------------------------------------------------------- -# build_runner_script +# Tests for `build_runner_script`. # --------------------------------------------------------------------------- @@ -47,7 +47,7 @@ def test_runner_script_creates_results_dir(): # --------------------------------------------------------------------------- -# build_docker_args — basic structure +# Tests for `build_docker_args` basic structure. # --------------------------------------------------------------------------- @@ -93,7 +93,7 @@ def test_docker_args_image_tag_override(minimal_config): # --------------------------------------------------------------------------- -# build_docker_args — proxy passthrough +# Tests for `build_docker_args` proxy passthrough. # --------------------------------------------------------------------------- @@ -156,7 +156,7 @@ def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): # --------------------------------------------------------------------------- -# build_docker_args — GPU flags +# Tests for `build_docker_args` GPU flags. # --------------------------------------------------------------------------- @@ -200,7 +200,7 @@ def test_docker_args_gpu_override(minimal_config): # --------------------------------------------------------------------------- -# build_docker_args — memory format +# Tests for `build_docker_args` memory format. 
# --------------------------------------------------------------------------- @@ -222,7 +222,7 @@ def test_docker_args_memory_format(minimal_config, raw, expected): # --------------------------------------------------------------------------- -# build_docker_args — stages encoding +# Tests for `build_docker_args` stages encoding. # --------------------------------------------------------------------------- @@ -249,7 +249,7 @@ def test_docker_args_multiple_stages(minimal_config): # --------------------------------------------------------------------------- -# build_docker_args — results_dir mount +# Tests for `build_docker_args` `results_dir` mount. # --------------------------------------------------------------------------- @@ -270,7 +270,7 @@ def test_docker_args_results_dir(minimal_config, tmp_path): # --------------------------------------------------------------------------- -# build_results_dir +# Tests for `build_results_dir`. # --------------------------------------------------------------------------- diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py index 2a930d3..b07011c 100644 --- a/.ci/tests/test_utils.py +++ b/.ci/tests/test_utils.py @@ -82,7 +82,7 @@ def test_normalize_preserves_top_level_keys(): def test_normalize_passthrough_flat_config(): - """Old flat format without 'platforms' key is returned as-is.""" + """Old flat format without `platforms` key is returned as-is.""" flat = { "images": {"nvidia": {}}, "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, diff --git a/tests/test_gemm.py b/tests/test_gemm.py index 491fb47..d75ac45 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -48,7 +48,7 @@ def test_gemm( if device == "mlu" and (trans_a or trans_b): pytest.skip("transposing is not currently supported on MLU") - # cnnlBatchMatMulEx does not accept bfloat16 inputs on MLU. + # `cnnlBatchMatMulEx` does not accept `bfloat16` inputs on MLU. 
if device == "mlu" and dtype == torch.bfloat16: pytest.skip("bfloat16 is not supported by cnnlBatchMatMulEx") @@ -101,7 +101,7 @@ def _torch_gemm(a, b, alpha=1.0, beta=1.0, trans_a=False, trans_b=False, c=None) return torch.baddbmm(c, a, b, beta=beta, alpha=alpha, out=c) except RuntimeError: - # Fallback for backends that don't support addmm/baddbmm (e.g. CPU float16/bfloat16): + # Fallback for backends that don't support `addmm`/`baddbmm` (e.g. CPU `float16`/`bfloat16`): # compute in float32 and cast back. c_original = c.float() result = torch.matmul(a.float(), b.float()) From bedede278c615461141cf50d9814141812246ac5 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Wed, 25 Mar 2026 08:50:51 +0000 Subject: [PATCH 16/16] style(tests): backtick-quote identifiers in test_gemm.py skip message Co-Authored-By: Claude --- tests/test_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gemm.py b/tests/test_gemm.py index d75ac45..136e991 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -50,7 +50,7 @@ def test_gemm( # `cnnlBatchMatMulEx` does not accept `bfloat16` inputs on MLU. if device == "mlu" and dtype == torch.bfloat16: - pytest.skip("bfloat16 is not supported by cnnlBatchMatMulEx") + pytest.skip("`bfloat16` is not supported by `cnnlBatchMatMulEx`") a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device)