30 changes: 30 additions & 0 deletions evaluators/contrib/drift/Makefile
@@ -0,0 +1,30 @@
.PHONY: help sync test lint lint-fix typecheck build

PACKAGE := agent-control-evaluator-drift

help:
@echo "Agent Control Evaluator - Drift - Makefile commands"
@echo ""
	@echo "  make sync      - run uv sync"
@echo " make test - run pytest"
@echo " make lint - run ruff check"
@echo " make lint-fix - run ruff check --fix"
@echo " make typecheck - run mypy"
@echo " make build - build package"

sync:
uv sync

test:
uv run pytest --cov=src --cov-report=xml:../../../coverage-evaluators-drift.xml -q

lint:
uv run ruff check --config ../../../pyproject.toml src/

lint-fix:
uv run ruff check --config ../../../pyproject.toml --fix src/

typecheck:
uv run mypy --config-file ../../../pyproject.toml src/

build:
uv build
133 changes: 133 additions & 0 deletions evaluators/contrib/drift/README.md
@@ -0,0 +1,133 @@
# agent-control-evaluator-drift

Temporal behavioral drift evaluator for [Agent Control](https://github.com/agentcontrol/agent-control).

Detects gradual behavioral degradation patterns that point-in-time evaluators miss.

## The Problem

Agent Control's built-in evaluators (regex, list, SQL, JSON) assess individual interactions. They answer: *"Is this response safe right now?"* They don't answer: *"Is this agent becoming less reliable over time?"*

Empirical observation from [published longitudinal research](https://doi.org/10.5281/zenodo.19028012) across LLM agents:

- Agents scoring 1.0 on point-in-time tests showed measurable drift over 28-day windows
- Degradation was **non-monotonic**: stability windows followed by abrupt shifts, not gradual decline
- Regression signals were noisy below 5 observations; rolling windows of ≥5 gave actionable signal
- Two production deployments confirmed the same pattern independently

This evaluator fills that gap by tracking behavioral scores over time and flagging when recent performance diverges from an established baseline.

## How It Works

```
Single run: regex/list evaluators → pass/fail per message
Drift eval: records score over N runs → alerts when recent window < baseline
```

The evaluator:
1. Records a numeric score (0.0–1.0) for each evaluation
2. Compares the recent window (last N observations) against a baseline (first M observations)
3. Returns `matched=True` (drift detected) when the gap exceeds the configured threshold
4. Stores history in a local JSON file (no external dependencies required)
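Steps 1–3 can be sketched in a few lines of Python. This is an illustrative stand-in, not the package's actual implementation; the parameter names mirror the configuration reference below:

```python
# Illustrative sketch of the windowed comparison in steps 1-3.
# Not the package's internals; names mirror DriftEvaluatorConfig fields.

def detect_drift(
    scores: list[float],
    window_size: int = 10,
    baseline_size: int = 20,
    drift_threshold: float = 0.10,
    min_observations: int = 5,
) -> bool:
    """True when the recent-window average falls below the baseline
    average by more than drift_threshold."""
    if len(scores) < min_observations or len(scores) <= baseline_size:
        return False  # still building the baseline
    baseline_avg = sum(scores[:baseline_size]) / baseline_size
    recent = scores[-window_size:]
    recent_avg = sum(recent) / len(recent)
    return (baseline_avg - recent_avg) > drift_threshold
```

A steady history (e.g. `[0.9] * 25`) reports no drift, while a history whose recent window sags more than the threshold below its baseline trips the alert.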

## Installation

```bash
pip install agent-control-evaluator-drift
```

Or with `uv`:
```bash
uv add agent-control-evaluator-drift
```

## Usage

### Basic Configuration

```python
from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig

config = DriftEvaluatorConfig(
agent_id="sales-agent-prod", # Track this agent separately
storage_path="/tmp/drift-history", # Where to persist observations
window_size=10, # Recent window: last 10 scores
baseline_size=20, # Baseline: first 20 scores
    drift_threshold=0.10,              # Alert if recent avg drops >0.10 below baseline
)

evaluator = DriftEvaluator(config)

# Each call records the score and checks for drift
result = await evaluator.evaluate(0.85) # Score from your primary evaluator
```

### In Agent Control YAML

```yaml
controls:
- name: "drift-check"
evaluator: "drift.temporal"
config:
agent_id: "my-agent"
storage_path: "/var/lib/agent-control/drift"
window_size: 10
baseline_size: 20
drift_threshold: 0.10
action: alert # or block
```

### Chaining with Other Evaluators

The drift evaluator expects a numeric score (0.0–1.0) as input. Pair it with a selector that extracts a confidence or quality score from agent output:

```yaml
controls:
- name: "quality-score"
selector: "$.quality_score" # Extract score from agent output
evaluator: "drift.temporal"
config:
agent_id: "customer-support"
drift_threshold: 0.15
```
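For agents that emit structured JSON, the selector is doing no more than pulling a numeric field out of the output. A minimal stand-in without a JSONPath library (`extract_score` is a hypothetical helper for illustration, not part of this package):

```python
import json

def extract_score(agent_output: str, field: str = "quality_score") -> float:
    """Pull a top-level numeric field out of the agent's JSON output,
    mirroring what a "$.quality_score" selector would do."""
    payload = json.loads(agent_output)
    score = float(payload[field])
    # The drift evaluator expects scores in [0.0, 1.0]; reject anything else.
    if not 0.0 <= score <= 1.0:
        raise ValueError(f"{field} must be in [0.0, 1.0], got {score}")
    return score
```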

## Configuration Reference

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `agent_id` | `str` | `"default"` | Identifier to track agents separately |
| `storage_path` | `str` | `/tmp/drift-history` | Directory for history files |
| `window_size` | `int` | `10` | Number of recent observations to compare |
| `baseline_size` | `int` | `20` | Number of initial observations to establish baseline |
| `drift_threshold` | `float` | `0.10` | Minimum score drop to trigger drift alert (0.0–1.0) |
| `min_observations` | `int` | `5` | Minimum observations before drift detection activates |
| `on_error` | `str` | `"allow"` | Action on storage error: `"allow"` or `"deny"` |

## Output

`EvaluatorResult` fields:

- `matched`: `True` when drift detected (recent window below baseline by threshold)
- `confidence`: `1.0` when drift detected, `0.0` otherwise
- `message`: Human-readable status (e.g., "Drift detected: baseline 0.92 → recent 0.78")
- `metadata`:
- `agent_id`: Agent being tracked
- `observation_count`: Total observations recorded
- `baseline_avg`: Average score during baseline period
- `recent_avg`: Average score in recent window
  - `drift_magnitude`: How far the recent average dropped below the baseline average
- `status`: `"drift_detected"`, `"stable"`, `"baseline_building"`, or `"insufficient_data"`
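The `status` values trace a simple lifecycle as observations accumulate. A sketch of how they relate to the observation count and the window averages (illustrative, mirroring the fields above rather than the package's actual code):

```python
# Illustrative mapping from a score history to the metadata fields above.
def drift_status(
    scores: list[float],
    window_size: int = 10,
    baseline_size: int = 20,
    drift_threshold: float = 0.10,
    min_observations: int = 5,
) -> dict:
    n = len(scores)
    if n < min_observations:
        return {"status": "insufficient_data", "observation_count": n}
    if n <= baseline_size:
        return {"status": "baseline_building", "observation_count": n}
    baseline_avg = sum(scores[:baseline_size]) / baseline_size
    recent = scores[-window_size:]
    recent_avg = sum(recent) / len(recent)
    drift_magnitude = max(0.0, baseline_avg - recent_avg)
    return {
        "status": "drift_detected" if drift_magnitude > drift_threshold else "stable",
        "observation_count": n,
        "baseline_avg": baseline_avg,
        "recent_avg": recent_avg,
        "drift_magnitude": drift_magnitude,
    }
```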

## Research Background

This evaluator is based on empirical findings from [PDR: Probabilistic Drift Rate for Longitudinal Behavioral Reliability in LLM-based Agents](https://doi.org/10.5281/zenodo.19028012).

Key findings that shaped the design:
- **Window ≥ 5**: Drift signals become reliable only above 5 observations (noisy below)
- **Non-monotonic patterns**: Degradation isn't gradual; agents can return to baseline without intervention
- **Specification matters**: Ambiguous task specs cause variance that looks like drift — scope `agent_id` to well-defined tasks
- **Independent replication**: NexusGuard production deployment confirmed the same windowed-scoring behavior

## License

Apache 2.0
37 changes: 37 additions & 0 deletions evaluators/contrib/drift/pyproject.toml
@@ -0,0 +1,37 @@
Contributor review comment on this file: Happy to see this added as a standalone contrib package. I think we still need the repo-level release wiring though. Right now semantic-release, scripts/build.py, test-extras, and the release workflow still only know about the Galileo contrib package, so I do not think this one will actually get versioned, tested, and published from this repo yet.

[project]

name = "agent-control-evaluator-drift"
version = "1.0.0"
description = "Temporal behavioral drift evaluator for agent-control"
readme = "README.md"
requires-python = ">=3.12"
license = { text = "Apache-2.0" }
authors = [{ name = "Nanook (nanookclaw)" }]
dependencies = [
"agent-control-evaluators>=3.0.0",
"agent-control-models>=3.0.0",
"pydantic>=2.12.4",
]

[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"pytest-asyncio>=0.23.0",
"pytest-cov>=4.0.0",
"ruff>=0.1.0",
"mypy>=1.8.0",
]

[project.entry-points."agent_control.evaluators"]
"drift.temporal" = "agent_control_evaluator_drift.drift:DriftEvaluator"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/agent_control_evaluator_drift"]

# For local dev, use override to resolve from workspace
[tool.uv.sources]
agent-control-evaluators = { path = "../../builtin", editable = true }
agent-control-models = { path = "../../../models", editable = true }
@@ -0,0 +1,8 @@
"""Agent Control Drift Evaluator.

Temporal behavioral drift detection for LLM agents.
"""

from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig

__all__ = ["DriftEvaluator", "DriftEvaluatorConfig"]
@@ -0,0 +1,6 @@
"""Drift evaluator module."""

from agent_control_evaluator_drift.drift.config import DriftEvaluatorConfig
from agent_control_evaluator_drift.drift.evaluator import DriftEvaluator

__all__ = ["DriftEvaluator", "DriftEvaluatorConfig"]
@@ -0,0 +1,84 @@
"""Configuration model for the temporal drift evaluator."""

from typing import Literal

from agent_control_evaluators import EvaluatorConfig
from pydantic import Field, model_validator


class DriftEvaluatorConfig(EvaluatorConfig):
"""Configuration for the temporal behavioral drift evaluator.

Tracks a numeric score over time per agent and flags when recent
performance diverges from an established baseline.

Example:
```python
config = DriftEvaluatorConfig(
agent_id="sales-agent-prod",
storage_path="/var/lib/agent-control/drift",
window_size=10,
baseline_size=20,
drift_threshold=0.10,
)
```

Notes:
- Drift detection activates only after ``min_observations`` runs.
- During baseline building (first ``baseline_size`` observations),
``matched`` is always ``False``.
- Storage is local JSON files; no external service required.
"""

agent_id: str = Field(
default="default",
description="Unique identifier for the agent being tracked. "
"Use distinct IDs to track multiple agents independently.",
)
storage_path: str = Field(
default="/tmp/drift-history",
description="Directory path for persisting observation history files. "
"Each agent gets its own JSON file at <storage_path>/<agent_id>.json.",
)
window_size: int = Field(
default=10,
ge=2,
le=100,
description="Number of most-recent observations to use as the 'current' window "
"when computing recent average. Must be >= 2.",
)
baseline_size: int = Field(
default=20,
ge=5,
le=500,
description="Number of initial observations used to compute the baseline average. "
"Must be >= 5 (research finding: signals are noisy below 5 observations).",
)
drift_threshold: float = Field(
default=0.10,
ge=0.01,
le=1.0,
description="Minimum absolute drop in average score (0.0–1.0) from baseline "
"to recent window that triggers a drift alert. Default 0.10 = 10 point drop.",
)
min_observations: int = Field(
default=5,
ge=1,
description="Minimum total observations required before drift detection activates. "
"Prevents false positives during ramp-up.",
)
on_error: Literal["allow", "deny"] = Field(
default="allow",
description="Behavior when storage read/write fails: "
"'allow' (fail open, don't block) or 'deny' (fail closed, block).",
)

@model_validator(mode="after")
def validate_window_vs_baseline(self) -> "DriftEvaluatorConfig":
"""Validate that window_size <= baseline_size."""
if self.window_size > self.baseline_size:
raise ValueError(
f"window_size ({self.window_size}) must be <= baseline_size ({self.baseline_size}). "
"The recent window cannot be larger than the baseline."
)
return self