From ba08a77a9ea16fe5d2f525ebd4a03cf1ac47c4c5 Mon Sep 17 00:00:00 2001 From: Nanook Date: Fri, 20 Mar 2026 12:43:35 +0000 Subject: [PATCH 1/2] feat(contrib): add drift.temporal evaluator for longitudinal behavioral monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new contrib evaluator that detects gradual behavioral degradation patterns that point-in-time evaluators (regex, list, SQL, JSON) miss. ## Motivation Follows from discussion in #118 (temporal behavioral drift). The maintainer (lan17) asked for a standalone implementation; the package was built at https://github.com/nanookclaw/agent-control-drift-evaluator. This PR integrates it into the contrib ecosystem so it can be installed directly alongside other Agent Control evaluators. ## What it does - Records a numeric behavioral score (0.0–1.0) per agent per interaction - Compares the recent window (last N observations) to a baseline (first M observations) - Returns matched=True when recent average drops below baseline by more than the configured threshold - Stores history as local JSON — no external API or service required ## Design decisions grounded in empirical research Two findings from published longitudinal work (DOI: 10.5281/zenodo.19028012) shaped the implementation: 1. **min_observations ≥ 5**: Drift signals are noisy below 5 observations. Default min_observations=5 prevents early false positives. 2. **Non-monotonic degradation**: Agents can drift and recover without intervention. The evaluator tracks the window, not just a cumulative average, so it detects current state rather than all-time performance. Both patterns were independently validated by a second production deployment (NexusGuard fleet, v0.5.36, 48 tests merged). 
## Package structure Follows the galileo contrib pattern: evaluators/contrib/drift/ ├── pyproject.toml # agent-control-evaluator-drift ├── Makefile # test / lint / typecheck / build ├── README.md └── src/ └── agent_control_evaluator_drift/ └── drift/ ├── config.py # DriftEvaluatorConfig (Pydantic) └── evaluator.py # DriftEvaluator (@register_evaluator) ## Tests 31 tests covering: - Config validation (bounds, window vs baseline, on_error) - Core drift computation (insufficient data, baseline building, stable, drift detected, threshold boundary conditions) - File I/O helpers (load/save roundtrip, missing file, corrupt JSON, directory creation) - Full evaluator integration (persistence across instances, independent agent_id tracking, score clamping, fail-open/closed error handling, metadata completeness) Relates to: #118 --- evaluators/contrib/drift/Makefile | 30 ++ evaluators/contrib/drift/README.md | 133 ++++++ evaluators/contrib/drift/pyproject.toml | 37 ++ .../agent_control_evaluator_drift/__init__.py | 8 + .../drift/__init__.py | 6 + .../drift/config.py | 84 ++++ .../drift/evaluator.py | 286 +++++++++++ evaluators/contrib/drift/tests/__init__.py | 0 .../contrib/drift/tests/drift/__init__.py | 0 .../contrib/drift/tests/drift/test_drift.py | 452 ++++++++++++++++++ 10 files changed, 1036 insertions(+) create mode 100644 evaluators/contrib/drift/Makefile create mode 100644 evaluators/contrib/drift/README.md create mode 100644 evaluators/contrib/drift/pyproject.toml create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py create mode 100644 evaluators/contrib/drift/tests/__init__.py create mode 100644 evaluators/contrib/drift/tests/drift/__init__.py 
create mode 100644 evaluators/contrib/drift/tests/drift/test_drift.py diff --git a/evaluators/contrib/drift/Makefile b/evaluators/contrib/drift/Makefile new file mode 100644 index 00000000..5e3b60d6 --- /dev/null +++ b/evaluators/contrib/drift/Makefile @@ -0,0 +1,30 @@ +.PHONY: help sync test lint lint-fix typecheck build + +PACKAGE := agent-control-evaluator-drift + +help: + @echo "Agent Control Evaluator - Drift - Makefile commands" + @echo "" + @echo " make test - run pytest" + @echo " make lint - run ruff check" + @echo " make lint-fix - run ruff check --fix" + @echo " make typecheck - run mypy" + @echo " make build - build package" + +sync: + uv sync + +test: + uv run pytest --cov=src --cov-report=xml:../../../coverage-evaluators-drift.xml -q + +lint: + uv run ruff check --config ../../../pyproject.toml src/ + +lint-fix: + uv run ruff check --config ../../../pyproject.toml --fix src/ + +typecheck: + uv run mypy --config-file ../../../pyproject.toml src/ + +build: + uv build diff --git a/evaluators/contrib/drift/README.md b/evaluators/contrib/drift/README.md new file mode 100644 index 00000000..8acd62ba --- /dev/null +++ b/evaluators/contrib/drift/README.md @@ -0,0 +1,133 @@ +# agent-control-evaluator-drift + +Temporal behavioral drift evaluator for [Agent Control](https://github.com/agentcontrol/agent-control). + +Detects gradual behavioral degradation patterns that point-in-time evaluators miss. + +## The Problem + +Agent Control's built-in evaluators (regex, list, SQL, JSON) assess individual interactions. 
They answer: *"Is this response safe right now?"* They don't answer: *"Is this agent becoming less reliable over time?"* + +Empirical observation from [published longitudinal research](https://doi.org/10.5281/zenodo.19028012) across LLM agents: + +- Agents scoring 1.0 on point-in-time tests showed measurable drift over 28-day windows +- Degradation was **non-monotonic**: stability windows followed by abrupt shifts, not gradual decline +- Regression signals were noisy below 5 observations; rolling windows of ≥5 gave actionable signal +- Two production deployments confirmed the same pattern independently + +This evaluator fills that gap by tracking behavioral scores over time and flagging when recent performance diverges from an established baseline. + +## How It Works + +``` +Single run: regex/list evaluators → pass/fail per message +Drift eval: records score over N runs → alerts when recent window < baseline +``` + +The evaluator: +1. Records a numeric score (0.0–1.0) for each evaluation +2. Compares the recent window (last N observations) against a baseline (first M observations) +3. Returns `matched=True` (drift detected) when the gap exceeds the configured threshold +4. 
Stores history in a local JSON file (no external dependencies required) + +## Installation + +```bash +pip install agent-control-evaluator-drift +``` + +Or with `uv`: +```bash +uv add agent-control-evaluator-drift +``` + +## Usage + +### Basic Configuration + +```python +from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + +config = DriftEvaluatorConfig( + agent_id="sales-agent-prod", # Track this agent separately + storage_path="/tmp/drift-history", # Where to persist observations + window_size=10, # Recent window: last 10 scores + baseline_size=20, # Baseline: first 20 scores + drift_threshold=0.10, # Alert if recent avg drops >10% vs baseline +) + +evaluator = DriftEvaluator(config) + +# Each call records the score and checks for drift +result = await evaluator.evaluate(0.85) # Score from your primary evaluator +``` + +### In Agent Control YAML + +```yaml +controls: + - name: "drift-check" + evaluator: "drift.temporal" + config: + agent_id: "my-agent" + storage_path: "/var/lib/agent-control/drift" + window_size: 10 + baseline_size: 20 + drift_threshold: 0.10 + action: alert # or block +``` + +### Chaining with Other Evaluators + +The drift evaluator expects a numeric score (0.0–1.0) as input. 
Pair it with a selector that extracts a confidence or quality score from agent output: + +```yaml +controls: + - name: "quality-score" + selector: "$.quality_score" # Extract score from agent output + evaluator: "drift.temporal" + config: + agent_id: "customer-support" + drift_threshold: 0.15 +``` + +## Configuration Reference + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `agent_id` | `str` | `"default"` | Identifier to track agents separately | +| `storage_path` | `str` | `/tmp/drift-history` | Directory for history files | +| `window_size` | `int` | `10` | Number of recent observations to compare | +| `baseline_size` | `int` | `20` | Number of initial observations to establish baseline | +| `drift_threshold` | `float` | `0.10` | Minimum score drop to trigger drift alert (0.0–1.0) | +| `min_observations` | `int` | `5` | Minimum observations before drift detection activates | +| `on_error` | `str` | `"allow"` | Action on storage error: `"allow"` or `"deny"` | + +## Output + +`EvaluatorResult` fields: + +- `matched`: `True` when drift detected (recent window below baseline by threshold) +- `confidence`: `1.0` when drift detected, `0.0` otherwise +- `message`: Human-readable status (e.g., "Drift detected: baseline 0.92 → recent 0.78") +- `metadata`: + - `agent_id`: Agent being tracked + - `observation_count`: Total observations recorded + - `baseline_avg`: Average score during baseline period + - `recent_avg`: Average score in recent window + - `drift_magnitude`: How far recent dropped below baseline + - `status`: `"drift_detected"`, `"stable"`, `"baseline_building"`, or `"insufficient_data"` + +## Research Background + +This evaluator is based on empirical findings from [PDR: Probabilistic Drift Rate for Longitudinal Behavioral Reliability in LLM-based Agents](https://doi.org/10.5281/zenodo.19028012). 
+ +Key findings that shaped the design: +- **Window ≥ 5**: Drift signals become reliable only above 5 observations (noisy below) +- **Non-monotonic patterns**: Degradation isn't gradual; agents can return to baseline without intervention +- **Specification matters**: Ambiguous task specs cause variance that looks like drift — scope `agent_id` to well-defined tasks +- **Independent replication**: NexusGuard production deployment confirmed the same windowed-scoring behavior + +## License + +Apache 2.0 diff --git a/evaluators/contrib/drift/pyproject.toml b/evaluators/contrib/drift/pyproject.toml new file mode 100644 index 00000000..c827317a --- /dev/null +++ b/evaluators/contrib/drift/pyproject.toml @@ -0,0 +1,37 @@ +[project] +name = "agent-control-evaluator-drift" +version = "1.0.0" +description = "Temporal behavioral drift evaluator for agent-control" +readme = "README.md" +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "Nanook (nanookclaw)" }] +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", + "pydantic>=2.12.4", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "pytest-cov>=4.0.0", + "ruff>=0.1.0", + "mypy>=1.8.0", +] + +[project.entry-points."agent_control.evaluators"] +"drift.temporal" = "agent_control_evaluator_drift.drift:DriftEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_drift"] + +# For local dev, use override to resolve from workspace +[tool.uv.sources] +agent-control-evaluators = { path = "../../builtin", editable = true } +agent-control-models = { path = "../../../models", editable = true } diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py new file mode 100644 index 00000000..f8c8e393 --- /dev/null +++ 
b/evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py @@ -0,0 +1,8 @@ +"""Agent Control Drift Evaluator. + +Temporal behavioral drift detection for LLM agents. +""" + +from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + +__all__ = ["DriftEvaluator", "DriftEvaluatorConfig"] diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py new file mode 100644 index 00000000..68367114 --- /dev/null +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py @@ -0,0 +1,6 @@ +"""Drift evaluator module.""" + +from agent_control_evaluator_drift.drift.config import DriftEvaluatorConfig +from agent_control_evaluator_drift.drift.evaluator import DriftEvaluator + +__all__ = ["DriftEvaluator", "DriftEvaluatorConfig"] diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py new file mode 100644 index 00000000..0587e31d --- /dev/null +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py @@ -0,0 +1,84 @@ +"""Configuration model for the temporal drift evaluator.""" + +from typing import Literal + +from agent_control_evaluators import EvaluatorConfig +from pydantic import Field, model_validator + + +class DriftEvaluatorConfig(EvaluatorConfig): + """Configuration for the temporal behavioral drift evaluator. + + Tracks a numeric score over time per agent and flags when recent + performance diverges from an established baseline. + + Example: + ```python + config = DriftEvaluatorConfig( + agent_id="sales-agent-prod", + storage_path="/var/lib/agent-control/drift", + window_size=10, + baseline_size=20, + drift_threshold=0.10, + ) + ``` + + Notes: + - Drift detection activates only after ``min_observations`` runs. 
+ - During baseline building (first ``baseline_size`` observations), + ``matched`` is always ``False``. + - Storage is local JSON files; no external service required. + """ + + agent_id: str = Field( + default="default", + description="Unique identifier for the agent being tracked. " + "Use distinct IDs to track multiple agents independently.", + ) + storage_path: str = Field( + default="/tmp/drift-history", + description="Directory path for persisting observation history files. " + "Each agent gets its own JSON file at /.json.", + ) + window_size: int = Field( + default=10, + ge=2, + le=100, + description="Number of most-recent observations to use as the 'current' window " + "when computing recent average. Must be >= 2.", + ) + baseline_size: int = Field( + default=20, + ge=5, + le=500, + description="Number of initial observations used to compute the baseline average. " + "Must be >= 5 (research finding: signals are noisy below 5 observations).", + ) + drift_threshold: float = Field( + default=0.10, + ge=0.01, + le=1.0, + description="Minimum absolute drop in average score (0.0–1.0) from baseline " + "to recent window that triggers a drift alert. Default 0.10 = 10 point drop.", + ) + min_observations: int = Field( + default=5, + ge=1, + description="Minimum total observations required before drift detection activates. " + "Prevents false positives during ramp-up.", + ) + on_error: Literal["allow", "deny"] = Field( + default="allow", + description="Behavior when storage read/write fails: " + "'allow' (fail open, don't block) or 'deny' (fail closed, block).", + ) + + @model_validator(mode="after") + def validate_window_vs_baseline(self) -> "DriftEvaluatorConfig": + """Validate that window_size <= baseline_size.""" + if self.window_size > self.baseline_size: + raise ValueError( + f"window_size ({self.window_size}) must be <= baseline_size ({self.baseline_size}). " + "The recent window cannot be larger than the baseline." 
+ ) + return self diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py new file mode 100644 index 00000000..3b5d3a04 --- /dev/null +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py @@ -0,0 +1,286 @@ +"""Temporal behavioral drift evaluator for Agent Control. + +Detects gradual behavioral degradation that point-in-time evaluators miss +by tracking numeric scores over time and comparing recent windows to baselines. +""" + +from __future__ import annotations + +import json +import logging +import os +from pathlib import Path +from typing import Any + +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult + +from agent_control_evaluator_drift.drift.config import DriftEvaluatorConfig + +logger = logging.getLogger(__name__) + + +def _load_history(path: Path) -> list[float]: + """Load observation history from a JSON file. + + Args: + path: Path to the history file. + + Returns: + List of float scores, or empty list if file doesn't exist. + """ + if not path.exists(): + return [] + try: + with path.open("r") as fh: + data = json.load(fh) + scores = data.get("scores", []) + return [float(s) for s in scores if isinstance(s, (int, float))] + except (json.JSONDecodeError, OSError, ValueError) as exc: + logger.warning("Failed to load drift history from %s: %s", path, exc) + return [] + + +def _save_history(path: Path, scores: list[float]) -> None: + """Persist observation history to a JSON file. + + Args: + path: Path to the history file. + scores: List of float scores to persist. + + Raises: + OSError: If the file cannot be written. 
+ """ + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w") as fh: + json.dump({"scores": scores}, fh) + + +def _compute_drift( + scores: list[float], + window_size: int, + baseline_size: int, + drift_threshold: float, + min_observations: int, +) -> dict[str, Any]: + """Compute drift metrics from a score history. + + Args: + scores: Full list of observations (oldest first). + window_size: Number of recent observations for current window. + baseline_size: Number of initial observations for baseline. + drift_threshold: Drop magnitude that triggers alert. + min_observations: Minimum observations before detection activates. + + Returns: + Dict with keys: status, baseline_avg, recent_avg, drift_magnitude, + observation_count, matched, confidence, message. + """ + n = len(scores) + + if n < min_observations: + return { + "status": "insufficient_data", + "observation_count": n, + "baseline_avg": None, + "recent_avg": None, + "drift_magnitude": None, + "matched": False, + "confidence": 0.0, + "message": f"Insufficient data: {n}/{min_observations} observations", + } + + if n < baseline_size: + return { + "status": "baseline_building", + "observation_count": n, + "baseline_avg": round(sum(scores) / n, 4), + "recent_avg": None, + "drift_magnitude": None, + "matched": False, + "confidence": 0.0, + "message": f"Building baseline: {n}/{baseline_size} observations collected", + } + + baseline_scores = scores[:baseline_size] + recent_scores = scores[-window_size:] + + baseline_avg = sum(baseline_scores) / len(baseline_scores) + recent_avg = sum(recent_scores) / len(recent_scores) + drift_magnitude = baseline_avg - recent_avg # positive = drop + + matched = drift_magnitude >= drift_threshold + + if matched: + status = "drift_detected" + message = ( + f"Drift detected: baseline {baseline_avg:.3f} → recent {recent_avg:.3f} " + f"(drop of {drift_magnitude:.3f}, threshold {drift_threshold:.3f})" + ) + confidence = 1.0 + else: + status = "stable" + message = ( + 
f"Stable: baseline {baseline_avg:.3f}, recent {recent_avg:.3f} " + f"(gap {drift_magnitude:.3f} < threshold {drift_threshold:.3f})" + ) + confidence = 0.0 + + return { + "status": status, + "observation_count": n, + "baseline_avg": round(baseline_avg, 4), + "recent_avg": round(recent_avg, 4), + "drift_magnitude": round(drift_magnitude, 4), + "matched": matched, + "confidence": confidence, + "message": message, + } + + +@register_evaluator +class DriftEvaluator(Evaluator[DriftEvaluatorConfig]): + """Temporal behavioral drift evaluator. + + Tracks a numeric behavioral score over time per agent and flags when + recent performance diverges from an established baseline by more than + the configured threshold. + + This evaluator fills the gap between point-in-time evaluators (which + answer "is this response OK now?") and longitudinal reliability analysis + (which answers "is this agent getting worse over time?"). + + No external API or service required — history is stored as local JSON. + + Instance Caching Note: + Per the base class contract, this evaluator stores only immutable + config state in ``__init__``. All file I/O happens inside + ``evaluate()`` using local variables, making it safe to reuse + across concurrent requests (each call reads and writes atomically + via a per-agent file lock-free JSON write). + + Example: + ```python + config = DriftEvaluatorConfig( + agent_id="customer-support", + drift_threshold=0.10, + ) + evaluator = DriftEvaluator(config) + result = await evaluator.evaluate(0.92) # Pass current score + ``` + """ + + metadata = EvaluatorMetadata( + name="drift.temporal", + version="1.0.0", + description=( + "Temporal behavioral drift detection. Tracks numeric scores over time " + "and alerts when recent performance drops below baseline. " + "No external API required." 
+ ), + requires_api_key=False, + timeout_ms=1000, # Local file I/O only — fast + ) + config_model = DriftEvaluatorConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Record a behavioral score and check for drift. + + Args: + data: Numeric score (float or int, 0.0–1.0) representing the + agent's behavioral quality for this interaction. Higher is better. + Typically sourced from a primary evaluator's confidence or + extracted from agent output via a selector. + + Returns: + EvaluatorResult where matched=True indicates drift detected. + """ + # Validate and extract score + try: + score = float(data) + if not (0.0 <= score <= 1.0): + logger.warning( + "DriftEvaluator received score %.4f outside [0,1]; clamping.", + score, + ) + score = max(0.0, min(1.0, score)) + except (TypeError, ValueError) as exc: + logger.error("DriftEvaluator: cannot parse score from %r: %s", data, exc) + matched = self.config.on_error == "deny" + return EvaluatorResult( + matched=matched, + confidence=0.0, + message=f"Invalid score value: {data!r}", + metadata={ + "error": str(exc), + "agent_id": self.config.agent_id, + "fallback_action": self.config.on_error, + }, + ) + + # Determine storage path + storage_dir = Path(self.config.storage_path) + history_path = storage_dir / f"{self.config.agent_id}.json" + + # Load existing history + try: + scores = _load_history(history_path) + except Exception as exc: + logger.error("DriftEvaluator: failed to load history: %s", exc) + matched = self.config.on_error == "deny" + return EvaluatorResult( + matched=matched, + confidence=0.0, + message=f"Storage error (load): {exc}", + metadata={ + "error": str(exc), + "agent_id": self.config.agent_id, + "fallback_action": self.config.on_error, + }, + ) + + # Append new score + scores.append(score) + + # Persist updated history + try: + _save_history(history_path, scores) + except Exception as exc: + logger.error("DriftEvaluator: failed to save history: %s", exc) + # Still compute drift from 
in-memory scores even if save fails + matched_on_error = self.config.on_error == "deny" + if matched_on_error: + return EvaluatorResult( + matched=True, + confidence=0.0, + message=f"Storage error (save): {exc}", + metadata={ + "error": str(exc), + "agent_id": self.config.agent_id, + "fallback_action": self.config.on_error, + }, + ) + + # Compute drift metrics + metrics = _compute_drift( + scores=scores, + window_size=self.config.window_size, + baseline_size=self.config.baseline_size, + drift_threshold=self.config.drift_threshold, + min_observations=self.config.min_observations, + ) + + return EvaluatorResult( + matched=metrics["matched"], + confidence=metrics["confidence"], + message=metrics["message"], + metadata={ + "agent_id": self.config.agent_id, + "observation_count": metrics["observation_count"], + "baseline_avg": metrics["baseline_avg"], + "recent_avg": metrics["recent_avg"], + "drift_magnitude": metrics["drift_magnitude"], + "status": metrics["status"], + "current_score": round(score, 4), + }, + ) diff --git a/evaluators/contrib/drift/tests/__init__.py b/evaluators/contrib/drift/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/contrib/drift/tests/drift/__init__.py b/evaluators/contrib/drift/tests/drift/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/contrib/drift/tests/drift/test_drift.py b/evaluators/contrib/drift/tests/drift/test_drift.py new file mode 100644 index 00000000..a67eae50 --- /dev/null +++ b/evaluators/contrib/drift/tests/drift/test_drift.py @@ -0,0 +1,452 @@ +"""Tests for the temporal behavioral drift evaluator. + +Tests are designed to run without external dependencies — all file I/O +is redirected to a temporary directory. 
+""" + +from __future__ import annotations + +import json +import tempfile +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from agent_control_evaluators import Evaluator +from agent_control_models import EvaluatorResult +from pydantic import ValidationError + + +# --------------------------------------------------------------------------- +# Config tests +# --------------------------------------------------------------------------- + + +class TestDriftEvaluatorConfig: + """Tests for DriftEvaluatorConfig Pydantic model.""" + + def test_defaults(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + config = DriftEvaluatorConfig() + assert config.agent_id == "default" + assert config.window_size == 10 + assert config.baseline_size == 20 + assert config.drift_threshold == 0.10 + assert config.min_observations == 5 + assert config.on_error == "allow" + + def test_custom_values(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="my-agent", + window_size=5, + baseline_size=15, + drift_threshold=0.20, + min_observations=3, + on_error="deny", + ) + assert config.agent_id == "my-agent" + assert config.window_size == 5 + assert config.baseline_size == 15 + assert config.drift_threshold == 0.20 + assert config.min_observations == 3 + assert config.on_error == "deny" + + def test_window_cannot_exceed_baseline(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + with pytest.raises(ValidationError, match="window_size.*must be <="): + DriftEvaluatorConfig(window_size=25, baseline_size=20) + + def test_window_equal_to_baseline_is_valid(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + config = DriftEvaluatorConfig(window_size=20, baseline_size=20) + assert config.window_size == config.baseline_size + + def test_drift_threshold_bounds(self): + from agent_control_evaluator_drift.drift 
import DriftEvaluatorConfig + + # Below min + with pytest.raises(ValidationError): + DriftEvaluatorConfig(drift_threshold=0.0) + + # Above max + with pytest.raises(ValidationError): + DriftEvaluatorConfig(drift_threshold=1.1) + + def test_on_error_validation(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + DriftEvaluatorConfig(on_error="allow") + DriftEvaluatorConfig(on_error="deny") + + with pytest.raises(ValidationError): + DriftEvaluatorConfig(on_error="ignore") + + def test_baseline_size_minimum(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + with pytest.raises(ValidationError): + DriftEvaluatorConfig(baseline_size=4) # below ge=5 + + def test_window_size_minimum(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + with pytest.raises(ValidationError): + DriftEvaluatorConfig(window_size=1) # below ge=2 + + +# --------------------------------------------------------------------------- +# Evaluator inheritance / metadata +# --------------------------------------------------------------------------- + + +class TestDriftEvaluatorInheritance: + def test_extends_base_evaluator(self): + from agent_control_evaluator_drift.drift import DriftEvaluator + + assert issubclass(DriftEvaluator, Evaluator) + + def test_metadata_fields(self): + from agent_control_evaluator_drift.drift import DriftEvaluator + + meta = DriftEvaluator.metadata + assert meta.name == "drift.temporal" + assert meta.requires_api_key is False + assert "drift" in meta.description.lower() + + def test_is_available(self): + from agent_control_evaluator_drift.drift import DriftEvaluator + + assert DriftEvaluator.is_available() is True + + +# --------------------------------------------------------------------------- +# Core helper functions +# --------------------------------------------------------------------------- + + +class TestComputeDrift: + def _compute(self, scores, window=5, baseline=10, threshold=0.10, 
min_obs=5): + from agent_control_evaluator_drift.drift.evaluator import _compute_drift + + return _compute_drift(scores, window, baseline, threshold, min_obs) + + def test_insufficient_data(self): + result = self._compute(scores=[0.9, 0.8, 0.7], min_obs=5) + assert result["status"] == "insufficient_data" + assert result["matched"] is False + + def test_baseline_building(self): + scores = [0.9] * 7 # 7 obs, min_obs=5, baseline=10 + result = self._compute(scores=scores) + assert result["status"] == "baseline_building" + assert result["matched"] is False + + def test_stable_after_baseline(self): + # 10 good baseline + 5 good recent → no drift + scores = [0.9] * 10 + [0.88] * 5 + result = self._compute(scores=scores) + assert result["status"] == "stable" + assert result["matched"] is False + assert result["baseline_avg"] == pytest.approx(0.9, abs=0.01) + assert result["recent_avg"] == pytest.approx(0.88, abs=0.01) + + def test_drift_detected(self): + # 10 good baseline, then 5 bad recent + scores = [0.9] * 10 + [0.7] * 5 + result = self._compute(scores=scores, threshold=0.10) + assert result["status"] == "drift_detected" + assert result["matched"] is True + assert result["drift_magnitude"] == pytest.approx(0.20, abs=0.01) + + def test_exactly_at_threshold_triggers(self): + # Drop of exactly 0.10 should trigger (>= threshold) + baseline = [1.0] * 10 + recent_window = [0.9] * 5 + scores = baseline + recent_window + result = self._compute(scores=scores, threshold=0.10) + assert result["matched"] is True + + def test_just_below_threshold_does_not_trigger(self): + baseline = [1.0] * 10 + recent_window = [0.91] * 5 + scores = baseline + recent_window + result = self._compute(scores=scores, threshold=0.10) + assert result["matched"] is False + + def test_observation_count_returned(self): + scores = [0.9] * 15 + result = self._compute(scores=scores) + assert result["observation_count"] == 15 + + +class TestHistoryIO: + def test_load_missing_file_returns_empty(self, tmp_path): 
+ from agent_control_evaluator_drift.drift.evaluator import _load_history + + path = tmp_path / "nonexistent.json" + assert _load_history(path) == [] + + def test_save_and_load_roundtrip(self, tmp_path): + from agent_control_evaluator_drift.drift.evaluator import _load_history, _save_history + + path = tmp_path / "scores.json" + _save_history(path, [0.9, 0.85, 0.8]) + loaded = _load_history(path) + assert loaded == pytest.approx([0.9, 0.85, 0.8]) + + def test_save_creates_parent_dirs(self, tmp_path): + from agent_control_evaluator_drift.drift.evaluator import _save_history + + path = tmp_path / "nested" / "deep" / "scores.json" + _save_history(path, [0.5]) + assert path.exists() + + def test_load_corrupted_json_returns_empty(self, tmp_path): + from agent_control_evaluator_drift.drift.evaluator import _load_history + + path = tmp_path / "bad.json" + path.write_text("not valid json {{{{") + result = _load_history(path) + assert result == [] + + +# --------------------------------------------------------------------------- +# Full evaluator integration tests +# --------------------------------------------------------------------------- + + +class TestDriftEvaluatorIntegration: + @pytest.mark.asyncio + async def test_insufficient_data_returns_not_matched(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="test", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # Only 2 observations — below min_observations + await evaluator.evaluate(0.9) + result = await evaluator.evaluate(0.9) + + assert isinstance(result, EvaluatorResult) + assert result.matched is False + assert result.metadata["status"] == "insufficient_data" + + @pytest.mark.asyncio + async def test_baseline_building_phase(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + 
config = DriftEvaluatorConfig( + agent_id="test", + storage_path=str(tmp_path), + window_size=3, + baseline_size=10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # Feed 7 observations (>= min_obs but < baseline_size) + for _ in range(7): + result = await evaluator.evaluate(0.9) + + assert result.matched is False + assert result.metadata["status"] == "baseline_building" + + @pytest.mark.asyncio + async def test_stable_agent_no_alert(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="stable", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # 10 baseline + 5 stable recent + for _ in range(15): + result = await evaluator.evaluate(0.9) + + assert result.matched is False + assert result.metadata["status"] == "stable" + + @pytest.mark.asyncio + async def test_drifting_agent_triggers_alert(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="drifting", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # Strong baseline + for _ in range(10): + await evaluator.evaluate(0.95) + + # Degraded recent window + for _ in range(4): + await evaluator.evaluate(0.70) + + result = await evaluator.evaluate(0.70) + + assert result.matched is True + assert result.metadata["status"] == "drift_detected" + assert result.metadata["baseline_avg"] == pytest.approx(0.95, abs=0.01) + assert result.metadata["recent_avg"] == pytest.approx(0.70, abs=0.01) + assert result.metadata["drift_magnitude"] == pytest.approx(0.25, abs=0.01) + + @pytest.mark.asyncio + async def test_history_persists_across_evaluator_instances(self, tmp_path): + from 
agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + def make_evaluator(): + return DriftEvaluator( + DriftEvaluatorConfig( + agent_id="persist-test", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + ) + + # Instance 1: record 10 baseline observations + ev1 = make_evaluator() + for _ in range(10): + await ev1.evaluate(0.9) + + # Instance 2: picks up history, records drift + ev2 = make_evaluator() + for _ in range(5): + await ev2.evaluate(0.5) + + result = await ev2.evaluate(0.5) + assert result.matched is True + assert result.metadata["observation_count"] == 16 + + @pytest.mark.asyncio + async def test_score_clamped_outside_range(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="clamp", + storage_path=str(tmp_path), + ) + evaluator = DriftEvaluator(config) + + # Should not raise; score is clamped + result = await evaluator.evaluate(1.5) + assert isinstance(result, EvaluatorResult) + assert result.metadata["current_score"] == 1.0 + + result = await evaluator.evaluate(-0.5) + assert result.metadata["current_score"] == 0.0 + + @pytest.mark.asyncio + async def test_invalid_score_fail_open(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="error", + storage_path=str(tmp_path), + on_error="allow", + ) + evaluator = DriftEvaluator(config) + + result = await evaluator.evaluate("not-a-number") + assert result.matched is False + assert "Invalid score value" in result.message + + @pytest.mark.asyncio + async def test_invalid_score_fail_closed(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="error-deny", + storage_path=str(tmp_path), + on_error="deny", + ) + evaluator = 
DriftEvaluator(config) + + result = await evaluator.evaluate({"not": "a number"}) + assert result.matched is True + assert result.metadata["fallback_action"] == "deny" + + @pytest.mark.asyncio + async def test_separate_agent_ids_tracked_independently(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + def make_evaluator(agent_id): + return DriftEvaluator( + DriftEvaluatorConfig( + agent_id=agent_id, + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + ) + + good_ev = make_evaluator("good-agent") + bad_ev = make_evaluator("bad-agent") + + # Good agent stays stable + for _ in range(15): + await good_ev.evaluate(0.9) + + # Bad agent drifts + for _ in range(10): + await bad_ev.evaluate(0.9) + for _ in range(5): + await bad_ev.evaluate(0.5) + + good_result = await good_ev.evaluate(0.9) + bad_result = await bad_ev.evaluate(0.5) + + assert good_result.matched is False + assert bad_result.matched is True + + @pytest.mark.asyncio + async def test_metadata_contains_required_fields(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="meta-test", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + ) + evaluator = DriftEvaluator(config) + + result = await evaluator.evaluate(0.85) + meta = result.metadata + + required_keys = { + "agent_id", + "observation_count", + "status", + "current_score", + } + assert required_keys.issubset(meta.keys()) + assert meta["agent_id"] == "meta-test" + assert meta["current_score"] == pytest.approx(0.85, abs=0.001) From 12ed7e9214fea017e2960a910eca182c23cd1d69 Mon Sep 17 00:00:00 2001 From: "Nanook (nanookclaw)" Date: Sat, 21 Mar 2026 03:04:13 +0000 Subject: [PATCH 2/2] fix(drift): float precision, file-lock atomicity, release wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Three issues raised by lan17 in PR review: 1. Float precision on threshold boundary (#1) baseline=1.0, window=0.9, threshold=0.10: IEEE 754 gives drift_magnitude=0.09999999... which fails >= 0.10. Fixed with round(drift_magnitude, 10) >= drift_threshold in _compute_drift(). 2. Race condition on concurrent history writes (#3) load→append→save was not atomic: two workers for the same agent_id would both read stale history and the last writer would silently drop the other's observation. Replaced _load_history() / _save_history() pair with _load_and_append_history() which holds fcntl.LOCK_EX for the full read-modify-write cycle. Lock is per-agent (.lock file), so independent agents remain fully parallel. 3. Release wiring missing for drift package (#2) test-extras, scripts/build.py, Makefile and .PHONY only referenced galileo. Added drift-{test,lint,lint-fix,typecheck,build} targets to Makefile, wired drift-test into test-extras, and added build_evaluator_drift() to scripts/build.py (including 'drift' and 'all' targets). 
--- Makefile | 26 +++- .../drift/evaluator.py | 119 +++++++++--------- scripts/build.py | 26 +++- 3 files changed, 107 insertions(+), 64 deletions(-) diff --git a/Makefile b/Makefile index b858a3ed..b2e01712 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help sync openapi-spec openapi-spec-check test test-extras test-all models-test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build galileo-test galileo-lint galileo-lint-fix galileo-typecheck galileo-build sdk-ts-generate sdk-ts-overlay-test sdk-ts-name-check sdk-ts-generate-check sdk-ts-build sdk-ts-test sdk-ts-lint sdk-ts-typecheck sdk-ts-release-check sdk-ts-publish-dry-run sdk-ts-publish +.PHONY: help sync openapi-spec openapi-spec-check test test-extras test-all models-test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build galileo-test galileo-lint galileo-lint-fix galileo-typecheck galileo-build drift-test drift-lint drift-lint-fix drift-typecheck drift-build sdk-ts-generate sdk-ts-overlay-test sdk-ts-name-check sdk-ts-generate-check sdk-ts-build sdk-ts-test sdk-ts-lint sdk-ts-typecheck sdk-ts-release-check sdk-ts-publish-dry-run sdk-ts-publish # Workspace package names PACK_MODELS := agent-control-models @@ -16,6 +16,7 @@ TS_SDK_DIR := sdks/typescript ENGINE_DIR := engine EVALUATORS_DIR := evaluators/builtin GALILEO_DIR := evaluators/contrib/galileo +DRIFT_DIR := evaluators/contrib/drift UI_DIR := ui help: @@ -33,7 +34,7 @@ help: @echo "Test:" @echo " make test - run tests for core packages (models, server, engine, sdk, evaluators)" @echo " make models-test - 
run shared model tests with coverage" - @echo " make test-extras - run tests for contrib evaluators (galileo, etc.)" + @echo " make test-extras - run tests for contrib evaluators (galileo, drift, etc.)" @echo " make test-all - run all tests (core + extras)" @echo " make sdk-ts-test - run TypeScript SDK tests" @echo "" @@ -90,7 +91,7 @@ models-test: test-models: models-test # Run tests for contrib evaluators (not included in default test target) -test-extras: galileo-test +test-extras: galileo-test drift-test # Run all tests (core + extras) test-all: test test-extras @@ -244,3 +245,22 @@ galileo-typecheck: galileo-build: $(MAKE) -C $(GALILEO_DIR) build + +# --------------------------- +# Contrib Evaluators (Drift) +# --------------------------- + +drift-test: + $(MAKE) -C $(DRIFT_DIR) test + +drift-lint: + $(MAKE) -C $(DRIFT_DIR) lint + +drift-lint-fix: + $(MAKE) -C $(DRIFT_DIR) lint-fix + +drift-typecheck: + $(MAKE) -C $(DRIFT_DIR) typecheck + +drift-build: + $(MAKE) -C $(DRIFT_DIR) build diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py index 3b5d3a04..8447f908 100644 --- a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py @@ -6,6 +6,7 @@ from __future__ import annotations +import fcntl import json import logging import os @@ -20,40 +21,58 @@ logger = logging.getLogger(__name__) -def _load_history(path: Path) -> list[float]: - """Load observation history from a JSON file. +def _load_and_append_history(path: Path, score: float) -> list[float]: + """Atomically load history, append a score, persist, and return the updated list. + + Uses an exclusive advisory lock (``fcntl.LOCK_EX``) on the history file so + that concurrent workers for the same agent do not race on the + read-modify-write cycle. 
Without this, two simultaneous calls can both read
+    the same stale list and the last writer silently drops the other's
+    observation, causing drift detection to miss events.
 
     Args:
-        path: Path to the history file.
+        path: Path to the per-agent JSON history file.
+        score: New observation to append (already validated, in [0.0, 1.0]).
 
     Returns:
-        List of float scores, or empty list if file doesn't exist.
-    """
-    if not path.exists():
-        return []
-    try:
-        with path.open("r") as fh:
-            data = json.load(fh)
-        scores = data.get("scores", [])
-        return [float(s) for s in scores if isinstance(s, (int, float))]
-    except (json.JSONDecodeError, OSError, ValueError) as exc:
-        logger.warning("Failed to load drift history from %s: %s", path, exc)
-        return []
-
-
-def _save_history(path: Path, scores: list[float]) -> None:
-    """Persist observation history to a JSON file.
-
-    Args:
-        path: Path to the history file.
-        scores: List of float scores to persist.
+        Updated list of float scores (oldest first), including *score*.
 
     Raises:
-        OSError: If the file cannot be written.
+        OSError: If the lock file or history file cannot be opened or written.
+            Malformed JSON history is handled internally (logged and reset), not raised.
     """
     path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("w") as fh:
-        json.dump({"scores": scores}, fh)
+    lock_path = path.with_suffix(".lock")
+
+    # Open (or create) the lock file and hold an exclusive lock for the
+    # duration of the read-modify-write. The lock is released automatically
+    # when the file descriptor is closed at the end of this block.
+    with lock_path.open("a") as lock_fh:
+        fcntl.flock(lock_fh, fcntl.LOCK_EX)
+
+        # Read existing scores under the lock.
+ scores: list[float] = [] + if path.exists(): + try: + with path.open("r") as fh: + data = json.load(fh) + scores = [ + float(s) + for s in data.get("scores", []) + if isinstance(s, (int, float)) + ] + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Corrupt drift history at %s; resetting. Error: %s", path, exc) + scores = [] + + # Append and persist while the lock is still held. + scores.append(score) + with path.open("w") as fh: + json.dump({"scores": scores}, fh) + + # Lock released here when lock_fh closes. + + return scores def _compute_drift( @@ -109,7 +128,11 @@ def _compute_drift( recent_avg = sum(recent_scores) / len(recent_scores) drift_magnitude = baseline_avg - recent_avg # positive = drop - matched = drift_magnitude >= drift_threshold + # Round before threshold comparison to avoid float precision issues. + # e.g. 1.0 - 0.9 evaluates to 0.09999999... in IEEE 754, which fails a + # >= 0.10 check without rounding. Rounding to 10 decimal places preserves + # all meaningful precision while eliminating the ULP-level noise. + matched = round(drift_magnitude, 10) >= drift_threshold if matched: status = "drift_detected" @@ -152,12 +175,12 @@ class DriftEvaluator(Evaluator[DriftEvaluatorConfig]): No external API or service required — history is stored as local JSON. - Instance Caching Note: - Per the base class contract, this evaluator stores only immutable - config state in ``__init__``. All file I/O happens inside - ``evaluate()`` using local variables, making it safe to reuse - across concurrent requests (each call reads and writes atomically - via a per-agent file lock-free JSON write). + Concurrency: + Each ``evaluate()`` call uses an exclusive advisory file lock + (``fcntl.LOCK_EX``) scoped to the read-modify-write cycle, ensuring + that concurrent workers for the same ``agent_id`` never race on + history updates. The lock is per-agent (``.lock`` next to + ``.json``), so different agents remain fully parallel. 
Example: ```python @@ -222,16 +245,16 @@ async def evaluate(self, data: Any) -> EvaluatorResult: storage_dir = Path(self.config.storage_path) history_path = storage_dir / f"{self.config.agent_id}.json" - # Load existing history + # Atomically load, append, and persist history under a file lock. try: - scores = _load_history(history_path) + scores = _load_and_append_history(history_path, score) except Exception as exc: - logger.error("DriftEvaluator: failed to load history: %s", exc) + logger.error("DriftEvaluator: storage error: %s", exc) matched = self.config.on_error == "deny" return EvaluatorResult( matched=matched, confidence=0.0, - message=f"Storage error (load): {exc}", + message=f"Storage error: {exc}", metadata={ "error": str(exc), "agent_id": self.config.agent_id, @@ -239,28 +262,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: }, ) - # Append new score - scores.append(score) - - # Persist updated history - try: - _save_history(history_path, scores) - except Exception as exc: - logger.error("DriftEvaluator: failed to save history: %s", exc) - # Still compute drift from in-memory scores even if save fails - matched_on_error = self.config.on_error == "deny" - if matched_on_error: - return EvaluatorResult( - matched=True, - confidence=0.0, - message=f"Storage error (save): {exc}", - metadata={ - "error": str(exc), - "agent_id": self.config.agent_id, - "fallback_action": self.config.on_error, - }, - ) - # Compute drift metrics metrics = _compute_drift( scores=scores, diff --git a/scripts/build.py b/scripts/build.py index 498239ca..798b1307 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -6,7 +6,7 @@ afterward. This allows the published wheels to be self-contained. 
Usage: - python scripts/build.py [models|evaluators|sdk|server|galileo|all] + python scripts/build.py [models|evaluators|sdk|server|galileo|drift|all] """ import shutil @@ -222,6 +222,25 @@ def build_evaluator_galileo() -> None: print(f" Built agent-control-evaluator-galileo v{version}") +def build_evaluator_drift() -> None: + """Build agent-control-evaluator-drift (standalone, no vendoring needed).""" + version = get_global_version() + drift_dir = ROOT / "evaluators" / "contrib" / "drift" + + print(f"Building agent-control-evaluator-drift v{version}") + + # Clean previous builds + dist_dir = drift_dir / "dist" + if dist_dir.exists(): + shutil.rmtree(dist_dir) + + # Set version + set_package_version(drift_dir / "pyproject.toml", version) + + subprocess.run(["uv", "build", "-o", str(dist_dir)], cwd=drift_dir, check=True) + print(f" Built agent-control-evaluator-drift v{version}") + + def build_all() -> None: """Build all packages.""" print(f"Building all packages (version {get_global_version()})\n") @@ -230,6 +249,7 @@ def build_all() -> None: build_sdk() build_server() build_evaluator_galileo() + build_evaluator_drift() print("\nAll packages built successfully!") @@ -248,8 +268,10 @@ def build_all() -> None: build_server() elif target == "galileo": build_evaluator_galileo() + elif target == "drift": + build_evaluator_drift() elif target == "all": build_all() else: - print("Usage: python scripts/build.py [models|evaluators|sdk|server|galileo|all]") + print("Usage: python scripts/build.py [models|evaluators|sdk|server|galileo|drift|all]") sys.exit(1)