From ba08a77a9ea16fe5d2f525ebd4a03cf1ac47c4c5 Mon Sep 17 00:00:00 2001 From: Nanook Date: Fri, 20 Mar 2026 12:43:35 +0000 Subject: [PATCH 1/2] feat(contrib): add drift.temporal evaluator for longitudinal behavioral monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new contrib evaluator that detects gradual behavioral degradation patterns that point-in-time evaluators (regex, list, SQL, JSON) miss. ## Motivation Follows from discussion in #118 (temporal behavioral drift). The maintainer (lan17) asked for a standalone implementation; the package was built at https://github.com/nanookclaw/agent-control-drift-evaluator. This PR integrates it into the contrib ecosystem so it can be installed directly alongside other Agent Control evaluators. ## What it does - Records a numeric behavioral score (0.0–1.0) per agent per interaction - Compares the recent window (last N observations) to a baseline (first M observations) - Returns matched=True when recent average drops below baseline by more than the configured threshold - Stores history as local JSON — no external API or service required ## Design decisions grounded in empirical research Two findings from published longitudinal work (DOI: 10.5281/zenodo.19028012) shaped the implementation: 1. **min_observations ≥ 5**: Drift signals are noisy below 5 observations. Default min_observations=5 prevents early false positives. 2. **Non-monotonic degradation**: Agents can drift and recover without intervention. The evaluator tracks the window, not just a cumulative average, so it detects current state rather than all-time performance. Both patterns were independently validated by a second production deployment (NexusGuard fleet, v0.5.36, 48 tests merged). 
## Package structure Follows the galileo contrib pattern: evaluators/contrib/drift/ ├── pyproject.toml # agent-control-evaluator-drift ├── Makefile # test / lint / typecheck / build ├── README.md └── src/ └── agent_control_evaluator_drift/ └── drift/ ├── config.py # DriftEvaluatorConfig (Pydantic) └── evaluator.py # DriftEvaluator (@register_evaluator) ## Tests 31 tests covering: - Config validation (bounds, window vs baseline, on_error) - Core drift computation (insufficient data, baseline building, stable, drift detected, threshold boundary conditions) - File I/O helpers (load/save roundtrip, missing file, corrupt JSON, directory creation) - Full evaluator integration (persistence across instances, independent agent_id tracking, score clamping, fail-open/closed error handling, metadata completeness) Relates to: #118 --- evaluators/contrib/drift/Makefile | 30 ++ evaluators/contrib/drift/README.md | 133 ++++++ evaluators/contrib/drift/pyproject.toml | 37 ++ .../agent_control_evaluator_drift/__init__.py | 8 + .../drift/__init__.py | 6 + .../drift/config.py | 84 ++++ .../drift/evaluator.py | 286 +++++++++++ evaluators/contrib/drift/tests/__init__.py | 0 .../contrib/drift/tests/drift/__init__.py | 0 .../contrib/drift/tests/drift/test_drift.py | 452 ++++++++++++++++++ 10 files changed, 1036 insertions(+) create mode 100644 evaluators/contrib/drift/Makefile create mode 100644 evaluators/contrib/drift/README.md create mode 100644 evaluators/contrib/drift/pyproject.toml create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py create mode 100644 evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py create mode 100644 evaluators/contrib/drift/tests/__init__.py create mode 100644 evaluators/contrib/drift/tests/drift/__init__.py 
create mode 100644 evaluators/contrib/drift/tests/drift/test_drift.py diff --git a/evaluators/contrib/drift/Makefile b/evaluators/contrib/drift/Makefile new file mode 100644 index 00000000..5e3b60d6 --- /dev/null +++ b/evaluators/contrib/drift/Makefile @@ -0,0 +1,30 @@ +.PHONY: help sync test lint lint-fix typecheck build + +PACKAGE := agent-control-evaluator-drift + +help: + @echo "Agent Control Evaluator - Drift - Makefile commands" + @echo "" + @echo " make test - run pytest" + @echo " make lint - run ruff check" + @echo " make lint-fix - run ruff check --fix" + @echo " make typecheck - run mypy" + @echo " make build - build package" + +sync: + uv sync + +test: + uv run pytest --cov=src --cov-report=xml:../../../coverage-evaluators-drift.xml -q + +lint: + uv run ruff check --config ../../../pyproject.toml src/ + +lint-fix: + uv run ruff check --config ../../../pyproject.toml --fix src/ + +typecheck: + uv run mypy --config-file ../../../pyproject.toml src/ + +build: + uv build diff --git a/evaluators/contrib/drift/README.md b/evaluators/contrib/drift/README.md new file mode 100644 index 00000000..8acd62ba --- /dev/null +++ b/evaluators/contrib/drift/README.md @@ -0,0 +1,133 @@ +# agent-control-evaluator-drift + +Temporal behavioral drift evaluator for [Agent Control](https://github.com/agentcontrol/agent-control). + +Detects gradual behavioral degradation patterns that point-in-time evaluators miss. + +## The Problem + +Agent Control's built-in evaluators (regex, list, SQL, JSON) assess individual interactions. 
They answer: *"Is this response safe right now?"* They don't answer: *"Is this agent becoming less reliable over time?"* + +Empirical observation from [published longitudinal research](https://doi.org/10.5281/zenodo.19028012) across LLM agents: + +- Agents scoring 1.0 on point-in-time tests showed measurable drift over 28-day windows +- Degradation was **non-monotonic**: stability windows followed by abrupt shifts, not gradual decline +- Regression signals were noisy below 5 observations; rolling windows of ≥5 gave actionable signal +- Two production deployments confirmed the same pattern independently + +This evaluator fills that gap by tracking behavioral scores over time and flagging when recent performance diverges from an established baseline. + +## How It Works + +``` +Single run: regex/list evaluators → pass/fail per message +Drift eval: records score over N runs → alerts when recent window < baseline +``` + +The evaluator: +1. Records a numeric score (0.0–1.0) for each evaluation +2. Compares the recent window (last N observations) against a baseline (first M observations) +3. Returns `matched=True` (drift detected) when the gap exceeds the configured threshold +4. 
Stores history in a local JSON file (no external dependencies required) + +## Installation + +```bash +pip install agent-control-evaluator-drift +``` + +Or with `uv`: +```bash +uv add agent-control-evaluator-drift +``` + +## Usage + +### Basic Configuration + +```python +from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + +config = DriftEvaluatorConfig( + agent_id="sales-agent-prod", # Track this agent separately + storage_path="/tmp/drift-history", # Where to persist observations + window_size=10, # Recent window: last 10 scores + baseline_size=20, # Baseline: first 20 scores + drift_threshold=0.10, # Alert if recent avg drops >10% vs baseline +) + +evaluator = DriftEvaluator(config) + +# Each call records the score and checks for drift +result = await evaluator.evaluate(0.85) # Score from your primary evaluator +``` + +### In Agent Control YAML + +```yaml +controls: + - name: "drift-check" + evaluator: "drift.temporal" + config: + agent_id: "my-agent" + storage_path: "/var/lib/agent-control/drift" + window_size: 10 + baseline_size: 20 + drift_threshold: 0.10 + action: alert # or block +``` + +### Chaining with Other Evaluators + +The drift evaluator expects a numeric score (0.0–1.0) as input. 
Pair it with a selector that extracts a confidence or quality score from agent output: + +```yaml +controls: + - name: "quality-score" + selector: "$.quality_score" # Extract score from agent output + evaluator: "drift.temporal" + config: + agent_id: "customer-support" + drift_threshold: 0.15 +``` + +## Configuration Reference + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `agent_id` | `str` | `"default"` | Identifier to track agents separately | +| `storage_path` | `str` | `/tmp/drift-history` | Directory for history files | +| `window_size` | `int` | `10` | Number of recent observations to compare | +| `baseline_size` | `int` | `20` | Number of initial observations to establish baseline | +| `drift_threshold` | `float` | `0.10` | Minimum score drop to trigger drift alert (0.0–1.0) | +| `min_observations` | `int` | `5` | Minimum observations before drift detection activates | +| `on_error` | `str` | `"allow"` | Action on storage error: `"allow"` or `"deny"` | + +## Output + +`EvaluatorResult` fields: + +- `matched`: `True` when drift detected (recent window below baseline by threshold) +- `confidence`: `1.0` when drift detected, `0.0` otherwise +- `message`: Human-readable status (e.g., "Drift detected: baseline 0.92 → recent 0.78") +- `metadata`: + - `agent_id`: Agent being tracked + - `observation_count`: Total observations recorded + - `baseline_avg`: Average score during baseline period + - `recent_avg`: Average score in recent window + - `drift_magnitude`: How far recent dropped below baseline + - `status`: `"drift_detected"`, `"stable"`, `"baseline_building"`, or `"insufficient_data"` + +## Research Background + +This evaluator is based on empirical findings from [PDR: Probabilistic Drift Rate for Longitudinal Behavioral Reliability in LLM-based Agents](https://doi.org/10.5281/zenodo.19028012). 
+ +Key findings that shaped the design: +- **Window ≥ 5**: Drift signals become reliable only above 5 observations (noisy below) +- **Non-monotonic patterns**: Degradation isn't gradual; agents can return to baseline without intervention +- **Specification matters**: Ambiguous task specs cause variance that looks like drift — scope `agent_id` to well-defined tasks +- **Independent replication**: NexusGuard production deployment confirmed the same windowed-scoring behavior + +## License + +Apache 2.0 diff --git a/evaluators/contrib/drift/pyproject.toml b/evaluators/contrib/drift/pyproject.toml new file mode 100644 index 00000000..c827317a --- /dev/null +++ b/evaluators/contrib/drift/pyproject.toml @@ -0,0 +1,37 @@ +[project] +name = "agent-control-evaluator-drift" +version = "1.0.0" +description = "Temporal behavioral drift evaluator for agent-control" +readme = "README.md" +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "Nanook (nanookclaw)" }] +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", + "pydantic>=2.12.4", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "pytest-cov>=4.0.0", + "ruff>=0.1.0", + "mypy>=1.8.0", +] + +[project.entry-points."agent_control.evaluators"] +"drift.temporal" = "agent_control_evaluator_drift.drift:DriftEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_drift"] + +# For local dev, use override to resolve from workspace +[tool.uv.sources] +agent-control-evaluators = { path = "../../builtin", editable = true } +agent-control-models = { path = "../../../models", editable = true } diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py new file mode 100644 index 00000000..f8c8e393 --- /dev/null +++ 
b/evaluators/contrib/drift/src/agent_control_evaluator_drift/__init__.py @@ -0,0 +1,8 @@ +"""Agent Control Drift Evaluator. + +Temporal behavioral drift detection for LLM agents. +""" + +from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + +__all__ = ["DriftEvaluator", "DriftEvaluatorConfig"] diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py new file mode 100644 index 00000000..68367114 --- /dev/null +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/__init__.py @@ -0,0 +1,6 @@ +"""Drift evaluator module.""" + +from agent_control_evaluator_drift.drift.config import DriftEvaluatorConfig +from agent_control_evaluator_drift.drift.evaluator import DriftEvaluator + +__all__ = ["DriftEvaluator", "DriftEvaluatorConfig"] diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py new file mode 100644 index 00000000..0587e31d --- /dev/null +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/config.py @@ -0,0 +1,84 @@ +"""Configuration model for the temporal drift evaluator.""" + +from typing import Literal + +from agent_control_evaluators import EvaluatorConfig +from pydantic import Field, model_validator + + +class DriftEvaluatorConfig(EvaluatorConfig): + """Configuration for the temporal behavioral drift evaluator. + + Tracks a numeric score over time per agent and flags when recent + performance diverges from an established baseline. + + Example: + ```python + config = DriftEvaluatorConfig( + agent_id="sales-agent-prod", + storage_path="/var/lib/agent-control/drift", + window_size=10, + baseline_size=20, + drift_threshold=0.10, + ) + ``` + + Notes: + - Drift detection activates only after ``min_observations`` runs. 
+ - During baseline building (first ``baseline_size`` observations), + ``matched`` is always ``False``. + - Storage is local JSON files; no external service required. + """ + + agent_id: str = Field( + default="default", + description="Unique identifier for the agent being tracked. " + "Use distinct IDs to track multiple agents independently.", + ) + storage_path: str = Field( + default="/tmp/drift-history", + description="Directory path for persisting observation history files. " + "Each agent gets its own JSON file at /.json.", + ) + window_size: int = Field( + default=10, + ge=2, + le=100, + description="Number of most-recent observations to use as the 'current' window " + "when computing recent average. Must be >= 2.", + ) + baseline_size: int = Field( + default=20, + ge=5, + le=500, + description="Number of initial observations used to compute the baseline average. " + "Must be >= 5 (research finding: signals are noisy below 5 observations).", + ) + drift_threshold: float = Field( + default=0.10, + ge=0.01, + le=1.0, + description="Minimum absolute drop in average score (0.0–1.0) from baseline " + "to recent window that triggers a drift alert. Default 0.10 = 10 point drop.", + ) + min_observations: int = Field( + default=5, + ge=1, + description="Minimum total observations required before drift detection activates. " + "Prevents false positives during ramp-up.", + ) + on_error: Literal["allow", "deny"] = Field( + default="allow", + description="Behavior when storage read/write fails: " + "'allow' (fail open, don't block) or 'deny' (fail closed, block).", + ) + + @model_validator(mode="after") + def validate_window_vs_baseline(self) -> "DriftEvaluatorConfig": + """Validate that window_size <= baseline_size.""" + if self.window_size > self.baseline_size: + raise ValueError( + f"window_size ({self.window_size}) must be <= baseline_size ({self.baseline_size}). " + "The recent window cannot be larger than the baseline." 
+ ) + return self diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py new file mode 100644 index 00000000..3b5d3a04 --- /dev/null +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py @@ -0,0 +1,286 @@ +"""Temporal behavioral drift evaluator for Agent Control. + +Detects gradual behavioral degradation that point-in-time evaluators miss +by tracking numeric scores over time and comparing recent windows to baselines. +""" + +from __future__ import annotations + +import json +import logging +import os +from pathlib import Path +from typing import Any + +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult + +from agent_control_evaluator_drift.drift.config import DriftEvaluatorConfig + +logger = logging.getLogger(__name__) + + +def _load_history(path: Path) -> list[float]: + """Load observation history from a JSON file. + + Args: + path: Path to the history file. + + Returns: + List of float scores, or empty list if file doesn't exist. + """ + if not path.exists(): + return [] + try: + with path.open("r") as fh: + data = json.load(fh) + scores = data.get("scores", []) + return [float(s) for s in scores if isinstance(s, (int, float))] + except (json.JSONDecodeError, OSError, ValueError) as exc: + logger.warning("Failed to load drift history from %s: %s", path, exc) + return [] + + +def _save_history(path: Path, scores: list[float]) -> None: + """Persist observation history to a JSON file. + + Args: + path: Path to the history file. + scores: List of float scores to persist. + + Raises: + OSError: If the file cannot be written. 
+ """ + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w") as fh: + json.dump({"scores": scores}, fh) + + +def _compute_drift( + scores: list[float], + window_size: int, + baseline_size: int, + drift_threshold: float, + min_observations: int, +) -> dict[str, Any]: + """Compute drift metrics from a score history. + + Args: + scores: Full list of observations (oldest first). + window_size: Number of recent observations for current window. + baseline_size: Number of initial observations for baseline. + drift_threshold: Drop magnitude that triggers alert. + min_observations: Minimum observations before detection activates. + + Returns: + Dict with keys: status, baseline_avg, recent_avg, drift_magnitude, + observation_count, matched, confidence, message. + """ + n = len(scores) + + if n < min_observations: + return { + "status": "insufficient_data", + "observation_count": n, + "baseline_avg": None, + "recent_avg": None, + "drift_magnitude": None, + "matched": False, + "confidence": 0.0, + "message": f"Insufficient data: {n}/{min_observations} observations", + } + + if n < baseline_size: + return { + "status": "baseline_building", + "observation_count": n, + "baseline_avg": round(sum(scores) / n, 4), + "recent_avg": None, + "drift_magnitude": None, + "matched": False, + "confidence": 0.0, + "message": f"Building baseline: {n}/{baseline_size} observations collected", + } + + baseline_scores = scores[:baseline_size] + recent_scores = scores[-window_size:] + + baseline_avg = sum(baseline_scores) / len(baseline_scores) + recent_avg = sum(recent_scores) / len(recent_scores) + drift_magnitude = baseline_avg - recent_avg # positive = drop + + matched = drift_magnitude >= drift_threshold + + if matched: + status = "drift_detected" + message = ( + f"Drift detected: baseline {baseline_avg:.3f} → recent {recent_avg:.3f} " + f"(drop of {drift_magnitude:.3f}, threshold {drift_threshold:.3f})" + ) + confidence = 1.0 + else: + status = "stable" + message = ( + 
f"Stable: baseline {baseline_avg:.3f}, recent {recent_avg:.3f} " + f"(gap {drift_magnitude:.3f} < threshold {drift_threshold:.3f})" + ) + confidence = 0.0 + + return { + "status": status, + "observation_count": n, + "baseline_avg": round(baseline_avg, 4), + "recent_avg": round(recent_avg, 4), + "drift_magnitude": round(drift_magnitude, 4), + "matched": matched, + "confidence": confidence, + "message": message, + } + + +@register_evaluator +class DriftEvaluator(Evaluator[DriftEvaluatorConfig]): + """Temporal behavioral drift evaluator. + + Tracks a numeric behavioral score over time per agent and flags when + recent performance diverges from an established baseline by more than + the configured threshold. + + This evaluator fills the gap between point-in-time evaluators (which + answer "is this response OK now?") and longitudinal reliability analysis + (which answers "is this agent getting worse over time?"). + + No external API or service required — history is stored as local JSON. + + Instance Caching Note: + Per the base class contract, this evaluator stores only immutable + config state in ``__init__``. All file I/O happens inside + ``evaluate()`` using local variables, making it safe to reuse + across concurrent requests (each call reads and writes atomically + via a per-agent file lock-free JSON write). + + Example: + ```python + config = DriftEvaluatorConfig( + agent_id="customer-support", + drift_threshold=0.10, + ) + evaluator = DriftEvaluator(config) + result = await evaluator.evaluate(0.92) # Pass current score + ``` + """ + + metadata = EvaluatorMetadata( + name="drift.temporal", + version="1.0.0", + description=( + "Temporal behavioral drift detection. Tracks numeric scores over time " + "and alerts when recent performance drops below baseline. " + "No external API required." 
+ ), + requires_api_key=False, + timeout_ms=1000, # Local file I/O only — fast + ) + config_model = DriftEvaluatorConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Record a behavioral score and check for drift. + + Args: + data: Numeric score (float or int, 0.0–1.0) representing the + agent's behavioral quality for this interaction. Higher is better. + Typically sourced from a primary evaluator's confidence or + extracted from agent output via a selector. + + Returns: + EvaluatorResult where matched=True indicates drift detected. + """ + # Validate and extract score + try: + score = float(data) + if not (0.0 <= score <= 1.0): + logger.warning( + "DriftEvaluator received score %.4f outside [0,1]; clamping.", + score, + ) + score = max(0.0, min(1.0, score)) + except (TypeError, ValueError) as exc: + logger.error("DriftEvaluator: cannot parse score from %r: %s", data, exc) + matched = self.config.on_error == "deny" + return EvaluatorResult( + matched=matched, + confidence=0.0, + message=f"Invalid score value: {data!r}", + metadata={ + "error": str(exc), + "agent_id": self.config.agent_id, + "fallback_action": self.config.on_error, + }, + ) + + # Determine storage path + storage_dir = Path(self.config.storage_path) + history_path = storage_dir / f"{self.config.agent_id}.json" + + # Load existing history + try: + scores = _load_history(history_path) + except Exception as exc: + logger.error("DriftEvaluator: failed to load history: %s", exc) + matched = self.config.on_error == "deny" + return EvaluatorResult( + matched=matched, + confidence=0.0, + message=f"Storage error (load): {exc}", + metadata={ + "error": str(exc), + "agent_id": self.config.agent_id, + "fallback_action": self.config.on_error, + }, + ) + + # Append new score + scores.append(score) + + # Persist updated history + try: + _save_history(history_path, scores) + except Exception as exc: + logger.error("DriftEvaluator: failed to save history: %s", exc) + # Still compute drift from 
in-memory scores even if save fails + matched_on_error = self.config.on_error == "deny" + if matched_on_error: + return EvaluatorResult( + matched=True, + confidence=0.0, + message=f"Storage error (save): {exc}", + metadata={ + "error": str(exc), + "agent_id": self.config.agent_id, + "fallback_action": self.config.on_error, + }, + ) + + # Compute drift metrics + metrics = _compute_drift( + scores=scores, + window_size=self.config.window_size, + baseline_size=self.config.baseline_size, + drift_threshold=self.config.drift_threshold, + min_observations=self.config.min_observations, + ) + + return EvaluatorResult( + matched=metrics["matched"], + confidence=metrics["confidence"], + message=metrics["message"], + metadata={ + "agent_id": self.config.agent_id, + "observation_count": metrics["observation_count"], + "baseline_avg": metrics["baseline_avg"], + "recent_avg": metrics["recent_avg"], + "drift_magnitude": metrics["drift_magnitude"], + "status": metrics["status"], + "current_score": round(score, 4), + }, + ) diff --git a/evaluators/contrib/drift/tests/__init__.py b/evaluators/contrib/drift/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/contrib/drift/tests/drift/__init__.py b/evaluators/contrib/drift/tests/drift/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluators/contrib/drift/tests/drift/test_drift.py b/evaluators/contrib/drift/tests/drift/test_drift.py new file mode 100644 index 00000000..a67eae50 --- /dev/null +++ b/evaluators/contrib/drift/tests/drift/test_drift.py @@ -0,0 +1,452 @@ +"""Tests for the temporal behavioral drift evaluator. + +Tests are designed to run without external dependencies — all file I/O +is redirected to a temporary directory. 
+""" + +from __future__ import annotations + +import json +import tempfile +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest +from agent_control_evaluators import Evaluator +from agent_control_models import EvaluatorResult +from pydantic import ValidationError + + +# --------------------------------------------------------------------------- +# Config tests +# --------------------------------------------------------------------------- + + +class TestDriftEvaluatorConfig: + """Tests for DriftEvaluatorConfig Pydantic model.""" + + def test_defaults(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + config = DriftEvaluatorConfig() + assert config.agent_id == "default" + assert config.window_size == 10 + assert config.baseline_size == 20 + assert config.drift_threshold == 0.10 + assert config.min_observations == 5 + assert config.on_error == "allow" + + def test_custom_values(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="my-agent", + window_size=5, + baseline_size=15, + drift_threshold=0.20, + min_observations=3, + on_error="deny", + ) + assert config.agent_id == "my-agent" + assert config.window_size == 5 + assert config.baseline_size == 15 + assert config.drift_threshold == 0.20 + assert config.min_observations == 3 + assert config.on_error == "deny" + + def test_window_cannot_exceed_baseline(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + with pytest.raises(ValidationError, match="window_size.*must be <="): + DriftEvaluatorConfig(window_size=25, baseline_size=20) + + def test_window_equal_to_baseline_is_valid(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + config = DriftEvaluatorConfig(window_size=20, baseline_size=20) + assert config.window_size == config.baseline_size + + def test_drift_threshold_bounds(self): + from agent_control_evaluator_drift.drift 
import DriftEvaluatorConfig + + # Below min + with pytest.raises(ValidationError): + DriftEvaluatorConfig(drift_threshold=0.0) + + # Above max + with pytest.raises(ValidationError): + DriftEvaluatorConfig(drift_threshold=1.1) + + def test_on_error_validation(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + DriftEvaluatorConfig(on_error="allow") + DriftEvaluatorConfig(on_error="deny") + + with pytest.raises(ValidationError): + DriftEvaluatorConfig(on_error="ignore") + + def test_baseline_size_minimum(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + with pytest.raises(ValidationError): + DriftEvaluatorConfig(baseline_size=4) # below ge=5 + + def test_window_size_minimum(self): + from agent_control_evaluator_drift.drift import DriftEvaluatorConfig + + with pytest.raises(ValidationError): + DriftEvaluatorConfig(window_size=1) # below ge=2 + + +# --------------------------------------------------------------------------- +# Evaluator inheritance / metadata +# --------------------------------------------------------------------------- + + +class TestDriftEvaluatorInheritance: + def test_extends_base_evaluator(self): + from agent_control_evaluator_drift.drift import DriftEvaluator + + assert issubclass(DriftEvaluator, Evaluator) + + def test_metadata_fields(self): + from agent_control_evaluator_drift.drift import DriftEvaluator + + meta = DriftEvaluator.metadata + assert meta.name == "drift.temporal" + assert meta.requires_api_key is False + assert "drift" in meta.description.lower() + + def test_is_available(self): + from agent_control_evaluator_drift.drift import DriftEvaluator + + assert DriftEvaluator.is_available() is True + + +# --------------------------------------------------------------------------- +# Core helper functions +# --------------------------------------------------------------------------- + + +class TestComputeDrift: + def _compute(self, scores, window=5, baseline=10, threshold=0.10, 
min_obs=5): + from agent_control_evaluator_drift.drift.evaluator import _compute_drift + + return _compute_drift(scores, window, baseline, threshold, min_obs) + + def test_insufficient_data(self): + result = self._compute(scores=[0.9, 0.8, 0.7], min_obs=5) + assert result["status"] == "insufficient_data" + assert result["matched"] is False + + def test_baseline_building(self): + scores = [0.9] * 7 # 7 obs, min_obs=5, baseline=10 + result = self._compute(scores=scores) + assert result["status"] == "baseline_building" + assert result["matched"] is False + + def test_stable_after_baseline(self): + # 10 good baseline + 5 good recent → no drift + scores = [0.9] * 10 + [0.88] * 5 + result = self._compute(scores=scores) + assert result["status"] == "stable" + assert result["matched"] is False + assert result["baseline_avg"] == pytest.approx(0.9, abs=0.01) + assert result["recent_avg"] == pytest.approx(0.88, abs=0.01) + + def test_drift_detected(self): + # 10 good baseline, then 5 bad recent + scores = [0.9] * 10 + [0.7] * 5 + result = self._compute(scores=scores, threshold=0.10) + assert result["status"] == "drift_detected" + assert result["matched"] is True + assert result["drift_magnitude"] == pytest.approx(0.20, abs=0.01) + + def test_exactly_at_threshold_triggers(self): + # Drop of exactly 0.10 should trigger (>= threshold) + baseline = [1.0] * 10 + recent_window = [0.9] * 5 + scores = baseline + recent_window + result = self._compute(scores=scores, threshold=0.10) + assert result["matched"] is True + + def test_just_below_threshold_does_not_trigger(self): + baseline = [1.0] * 10 + recent_window = [0.91] * 5 + scores = baseline + recent_window + result = self._compute(scores=scores, threshold=0.10) + assert result["matched"] is False + + def test_observation_count_returned(self): + scores = [0.9] * 15 + result = self._compute(scores=scores) + assert result["observation_count"] == 15 + + +class TestHistoryIO: + def test_load_missing_file_returns_empty(self, tmp_path): 
+ from agent_control_evaluator_drift.drift.evaluator import _load_history + + path = tmp_path / "nonexistent.json" + assert _load_history(path) == [] + + def test_save_and_load_roundtrip(self, tmp_path): + from agent_control_evaluator_drift.drift.evaluator import _load_history, _save_history + + path = tmp_path / "scores.json" + _save_history(path, [0.9, 0.85, 0.8]) + loaded = _load_history(path) + assert loaded == pytest.approx([0.9, 0.85, 0.8]) + + def test_save_creates_parent_dirs(self, tmp_path): + from agent_control_evaluator_drift.drift.evaluator import _save_history + + path = tmp_path / "nested" / "deep" / "scores.json" + _save_history(path, [0.5]) + assert path.exists() + + def test_load_corrupted_json_returns_empty(self, tmp_path): + from agent_control_evaluator_drift.drift.evaluator import _load_history + + path = tmp_path / "bad.json" + path.write_text("not valid json {{{{") + result = _load_history(path) + assert result == [] + + +# --------------------------------------------------------------------------- +# Full evaluator integration tests +# --------------------------------------------------------------------------- + + +class TestDriftEvaluatorIntegration: + @pytest.mark.asyncio + async def test_insufficient_data_returns_not_matched(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="test", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # Only 2 observations — below min_observations + await evaluator.evaluate(0.9) + result = await evaluator.evaluate(0.9) + + assert isinstance(result, EvaluatorResult) + assert result.matched is False + assert result.metadata["status"] == "insufficient_data" + + @pytest.mark.asyncio + async def test_baseline_building_phase(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + 
config = DriftEvaluatorConfig( + agent_id="test", + storage_path=str(tmp_path), + window_size=3, + baseline_size=10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # Feed 7 observations (>= min_obs but < baseline_size) + for _ in range(7): + result = await evaluator.evaluate(0.9) + + assert result.matched is False + assert result.metadata["status"] == "baseline_building" + + @pytest.mark.asyncio + async def test_stable_agent_no_alert(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="stable", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # 10 baseline + 5 stable recent + for _ in range(15): + result = await evaluator.evaluate(0.9) + + assert result.matched is False + assert result.metadata["status"] == "stable" + + @pytest.mark.asyncio + async def test_drifting_agent_triggers_alert(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="drifting", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + evaluator = DriftEvaluator(config) + + # Strong baseline + for _ in range(10): + await evaluator.evaluate(0.95) + + # Degraded recent window + for _ in range(4): + await evaluator.evaluate(0.70) + + result = await evaluator.evaluate(0.70) + + assert result.matched is True + assert result.metadata["status"] == "drift_detected" + assert result.metadata["baseline_avg"] == pytest.approx(0.95, abs=0.01) + assert result.metadata["recent_avg"] == pytest.approx(0.70, abs=0.01) + assert result.metadata["drift_magnitude"] == pytest.approx(0.25, abs=0.01) + + @pytest.mark.asyncio + async def test_history_persists_across_evaluator_instances(self, tmp_path): + from 
agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + def make_evaluator(): + return DriftEvaluator( + DriftEvaluatorConfig( + agent_id="persist-test", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + ) + + # Instance 1: record 10 baseline observations + ev1 = make_evaluator() + for _ in range(10): + await ev1.evaluate(0.9) + + # Instance 2: picks up history, records drift + ev2 = make_evaluator() + for _ in range(5): + await ev2.evaluate(0.5) + + result = await ev2.evaluate(0.5) + assert result.matched is True + assert result.metadata["observation_count"] == 16 + + @pytest.mark.asyncio + async def test_score_clamped_outside_range(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="clamp", + storage_path=str(tmp_path), + ) + evaluator = DriftEvaluator(config) + + # Should not raise; score is clamped + result = await evaluator.evaluate(1.5) + assert isinstance(result, EvaluatorResult) + assert result.metadata["current_score"] == 1.0 + + result = await evaluator.evaluate(-0.5) + assert result.metadata["current_score"] == 0.0 + + @pytest.mark.asyncio + async def test_invalid_score_fail_open(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="error", + storage_path=str(tmp_path), + on_error="allow", + ) + evaluator = DriftEvaluator(config) + + result = await evaluator.evaluate("not-a-number") + assert result.matched is False + assert "Invalid score value" in result.message + + @pytest.mark.asyncio + async def test_invalid_score_fail_closed(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="error-deny", + storage_path=str(tmp_path), + on_error="deny", + ) + evaluator = 
DriftEvaluator(config) + + result = await evaluator.evaluate({"not": "a number"}) + assert result.matched is True + assert result.metadata["fallback_action"] == "deny" + + @pytest.mark.asyncio + async def test_separate_agent_ids_tracked_independently(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + def make_evaluator(agent_id): + return DriftEvaluator( + DriftEvaluatorConfig( + agent_id=agent_id, + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + drift_threshold=0.10, + min_observations=5, + ) + ) + + good_ev = make_evaluator("good-agent") + bad_ev = make_evaluator("bad-agent") + + # Good agent stays stable + for _ in range(15): + await good_ev.evaluate(0.9) + + # Bad agent drifts + for _ in range(10): + await bad_ev.evaluate(0.9) + for _ in range(5): + await bad_ev.evaluate(0.5) + + good_result = await good_ev.evaluate(0.9) + bad_result = await bad_ev.evaluate(0.5) + + assert good_result.matched is False + assert bad_result.matched is True + + @pytest.mark.asyncio + async def test_metadata_contains_required_fields(self, tmp_path): + from agent_control_evaluator_drift.drift import DriftEvaluator, DriftEvaluatorConfig + + config = DriftEvaluatorConfig( + agent_id="meta-test", + storage_path=str(tmp_path), + window_size=5, + baseline_size=10, + ) + evaluator = DriftEvaluator(config) + + result = await evaluator.evaluate(0.85) + meta = result.metadata + + required_keys = { + "agent_id", + "observation_count", + "status", + "current_score", + } + assert required_keys.issubset(meta.keys()) + assert meta["agent_id"] == "meta-test" + assert meta["current_score"] == pytest.approx(0.85, abs=0.001) From 12ed7e9214fea017e2960a910eca182c23cd1d69 Mon Sep 17 00:00:00 2001 From: "Nanook (nanookclaw)" Date: Sat, 21 Mar 2026 03:04:13 +0000 Subject: [PATCH 2/2] fix(drift): float precision, file-lock atomicity, release wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Three issues raised by lan17 in PR review: 1. Float precision on threshold boundary (#1) baseline=1.0, window=0.9, threshold=0.10: IEEE 754 gives drift_magnitude=0.09999999... which fails >= 0.10. Fixed with round(drift_magnitude, 10) >= drift_threshold in _compute_drift(). 2. Race condition on concurrent history writes (#3) load→append→save was not atomic: two workers for the same agent_id would both read stale history and the last writer would silently drop the other's observation. Replaced _load_history() / _save_history() pair with _load_and_append_history() which holds fcntl.LOCK_EX for the full read-modify-write cycle. Lock is per-agent (.lock file), so independent agents remain fully parallel. 3. Release wiring missing for drift package (#2) test-extras, scripts/build.py, Makefile and .PHONY only referenced galileo. Added drift-{test,lint,lint-fix,typecheck,build} targets to Makefile, wired drift-test into test-extras, and added build_evaluator_drift() to scripts/build.py (including 'drift' and 'all' targets). 
--- Makefile | 26 +++- .../drift/evaluator.py | 119 +++++++++--------- scripts/build.py | 26 +++- 3 files changed, 107 insertions(+), 64 deletions(-) diff --git a/Makefile b/Makefile index b858a3ed..b2e01712 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help sync openapi-spec openapi-spec-check test test-extras test-all models-test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build galileo-test galileo-lint galileo-lint-fix galileo-typecheck galileo-build sdk-ts-generate sdk-ts-overlay-test sdk-ts-name-check sdk-ts-generate-check sdk-ts-build sdk-ts-test sdk-ts-lint sdk-ts-typecheck sdk-ts-release-check sdk-ts-publish-dry-run sdk-ts-publish +.PHONY: help sync openapi-spec openapi-spec-check test test-extras test-all models-test test-models test-sdk lint lint-fix typecheck check build build-models build-server build-sdk publish publish-models publish-server publish-sdk hooks-install hooks-uninstall prepush evaluators-test evaluators-lint evaluators-lint-fix evaluators-typecheck evaluators-build galileo-test galileo-lint galileo-lint-fix galileo-typecheck galileo-build drift-test drift-lint drift-lint-fix drift-typecheck drift-build sdk-ts-generate sdk-ts-overlay-test sdk-ts-name-check sdk-ts-generate-check sdk-ts-build sdk-ts-test sdk-ts-lint sdk-ts-typecheck sdk-ts-release-check sdk-ts-publish-dry-run sdk-ts-publish # Workspace package names PACK_MODELS := agent-control-models @@ -16,6 +16,7 @@ TS_SDK_DIR := sdks/typescript ENGINE_DIR := engine EVALUATORS_DIR := evaluators/builtin GALILEO_DIR := evaluators/contrib/galileo +DRIFT_DIR := evaluators/contrib/drift UI_DIR := ui help: @@ -33,7 +34,7 @@ help: @echo "Test:" @echo " make test - run tests for core packages (models, server, engine, sdk, evaluators)" @echo " make models-test - 
run shared model tests with coverage" - @echo " make test-extras - run tests for contrib evaluators (galileo, etc.)" + @echo " make test-extras - run tests for contrib evaluators (galileo, drift, etc.)" @echo " make test-all - run all tests (core + extras)" @echo " make sdk-ts-test - run TypeScript SDK tests" @echo "" @@ -90,7 +91,7 @@ models-test: test-models: models-test # Run tests for contrib evaluators (not included in default test target) -test-extras: galileo-test +test-extras: galileo-test drift-test # Run all tests (core + extras) test-all: test test-extras @@ -244,3 +245,22 @@ galileo-typecheck: galileo-build: $(MAKE) -C $(GALILEO_DIR) build + +# --------------------------- +# Contrib Evaluators (Drift) +# --------------------------- + +drift-test: + $(MAKE) -C $(DRIFT_DIR) test + +drift-lint: + $(MAKE) -C $(DRIFT_DIR) lint + +drift-lint-fix: + $(MAKE) -C $(DRIFT_DIR) lint-fix + +drift-typecheck: + $(MAKE) -C $(DRIFT_DIR) typecheck + +drift-build: + $(MAKE) -C $(DRIFT_DIR) build diff --git a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py index 3b5d3a04..8447f908 100644 --- a/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py +++ b/evaluators/contrib/drift/src/agent_control_evaluator_drift/drift/evaluator.py @@ -6,6 +6,7 @@ from __future__ import annotations +import fcntl import json import logging import os @@ -20,40 +21,58 @@ logger = logging.getLogger(__name__) -def _load_history(path: Path) -> list[float]: - """Load observation history from a JSON file. +def _load_and_append_history(path: Path, score: float) -> list[float]: + """Atomically load history, append a score, persist, and return the updated list. + + Uses an exclusive advisory lock (``fcntl.LOCK_EX``) on the history file so + that concurrent workers for the same agent do not race on the + read-modify-write cycle. 
Without this, two simultaneous calls can both read
+    the same stale list and the last writer silently drops the other's
+    observation, causing drift detection to miss events.
 
     Args:
-        path: Path to the history file.
+        path: Path to the per-agent JSON history file.
+        score: New observation to append (already validated, in [0.0, 1.0]).
 
     Returns:
-        List of float scores, or empty list if file doesn't exist.
-    """
-    if not path.exists():
-        return []
-    try:
-        with path.open("r") as fh:
-            data = json.load(fh)
-        scores = data.get("scores", [])
-        return [float(s) for s in scores if isinstance(s, (int, float))]
-    except (json.JSONDecodeError, OSError, ValueError) as exc:
-        logger.warning("Failed to load drift history from %s: %s", path, exc)
-        return []
-
-
-def _save_history(path: Path, scores: list[float]) -> None:
-    """Persist observation history to a JSON file.
-
-    Args:
-        path: Path to the history file.
-        scores: List of float scores to persist.
+        Updated list of float scores (oldest first), including *score*.
 
     Raises:
-        OSError: If the file cannot be written.
+        OSError: If the lock file or history file cannot be opened or written.
+            Malformed JSON history is handled internally (logged and reset), not raised.
     """
     path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("w") as fh:
-        json.dump({"scores": scores}, fh)
+    lock_path = path.with_suffix(".lock")
+
+    # Open (or create) the lock file and hold an exclusive lock for the
+    # duration of the read-modify-write. The lock is released automatically
+    # when the file descriptor is closed at the end of this block.
+    with lock_path.open("a") as lock_fh:
+        fcntl.flock(lock_fh, fcntl.LOCK_EX)
+
+        # Read existing scores under the lock.
+ scores: list[float] = [] + if path.exists(): + try: + with path.open("r") as fh: + data = json.load(fh) + scores = [ + float(s) + for s in data.get("scores", []) + if isinstance(s, (int, float)) + ] + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Corrupt drift history at %s; resetting. Error: %s", path, exc) + scores = [] + + # Append and persist while the lock is still held. + scores.append(score) + with path.open("w") as fh: + json.dump({"scores": scores}, fh) + + # Lock released here when lock_fh closes. + + return scores def _compute_drift( @@ -109,7 +128,11 @@ def _compute_drift( recent_avg = sum(recent_scores) / len(recent_scores) drift_magnitude = baseline_avg - recent_avg # positive = drop - matched = drift_magnitude >= drift_threshold + # Round before threshold comparison to avoid float precision issues. + # e.g. 1.0 - 0.9 evaluates to 0.09999999... in IEEE 754, which fails a + # >= 0.10 check without rounding. Rounding to 10 decimal places preserves + # all meaningful precision while eliminating the ULP-level noise. + matched = round(drift_magnitude, 10) >= drift_threshold if matched: status = "drift_detected" @@ -152,12 +175,12 @@ class DriftEvaluator(Evaluator[DriftEvaluatorConfig]): No external API or service required — history is stored as local JSON. - Instance Caching Note: - Per the base class contract, this evaluator stores only immutable - config state in ``__init__``. All file I/O happens inside - ``evaluate()`` using local variables, making it safe to reuse - across concurrent requests (each call reads and writes atomically - via a per-agent file lock-free JSON write). + Concurrency: + Each ``evaluate()`` call uses an exclusive advisory file lock + (``fcntl.LOCK_EX``) scoped to the read-modify-write cycle, ensuring + that concurrent workers for the same ``agent_id`` never race on + history updates. The lock is per-agent (``.lock`` next to + ``.json``), so different agents remain fully parallel. 
Example: ```python @@ -222,16 +245,16 @@ async def evaluate(self, data: Any) -> EvaluatorResult: storage_dir = Path(self.config.storage_path) history_path = storage_dir / f"{self.config.agent_id}.json" - # Load existing history + # Atomically load, append, and persist history under a file lock. try: - scores = _load_history(history_path) + scores = _load_and_append_history(history_path, score) except Exception as exc: - logger.error("DriftEvaluator: failed to load history: %s", exc) + logger.error("DriftEvaluator: storage error: %s", exc) matched = self.config.on_error == "deny" return EvaluatorResult( matched=matched, confidence=0.0, - message=f"Storage error (load): {exc}", + message=f"Storage error: {exc}", metadata={ "error": str(exc), "agent_id": self.config.agent_id, @@ -239,28 +262,6 @@ async def evaluate(self, data: Any) -> EvaluatorResult: }, ) - # Append new score - scores.append(score) - - # Persist updated history - try: - _save_history(history_path, scores) - except Exception as exc: - logger.error("DriftEvaluator: failed to save history: %s", exc) - # Still compute drift from in-memory scores even if save fails - matched_on_error = self.config.on_error == "deny" - if matched_on_error: - return EvaluatorResult( - matched=True, - confidence=0.0, - message=f"Storage error (save): {exc}", - metadata={ - "error": str(exc), - "agent_id": self.config.agent_id, - "fallback_action": self.config.on_error, - }, - ) - # Compute drift metrics metrics = _compute_drift( scores=scores, diff --git a/scripts/build.py b/scripts/build.py index 498239ca..798b1307 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -6,7 +6,7 @@ afterward. This allows the published wheels to be self-contained. 
Usage: - python scripts/build.py [models|evaluators|sdk|server|galileo|all] + python scripts/build.py [models|evaluators|sdk|server|galileo|drift|all] """ import shutil @@ -222,6 +222,25 @@ def build_evaluator_galileo() -> None: print(f" Built agent-control-evaluator-galileo v{version}") +def build_evaluator_drift() -> None: + """Build agent-control-evaluator-drift (standalone, no vendoring needed).""" + version = get_global_version() + drift_dir = ROOT / "evaluators" / "contrib" / "drift" + + print(f"Building agent-control-evaluator-drift v{version}") + + # Clean previous builds + dist_dir = drift_dir / "dist" + if dist_dir.exists(): + shutil.rmtree(dist_dir) + + # Set version + set_package_version(drift_dir / "pyproject.toml", version) + + subprocess.run(["uv", "build", "-o", str(dist_dir)], cwd=drift_dir, check=True) + print(f" Built agent-control-evaluator-drift v{version}") + + def build_all() -> None: """Build all packages.""" print(f"Building all packages (version {get_global_version()})\n") @@ -230,6 +249,7 @@ def build_all() -> None: build_sdk() build_server() build_evaluator_galileo() + build_evaluator_drift() print("\nAll packages built successfully!") @@ -248,8 +268,10 @@ def build_all() -> None: build_server() elif target == "galileo": build_evaluator_galileo() + elif target == "drift": + build_evaluator_drift() elif target == "all": build_all() else: - print("Usage: python scripts/build.py [models|evaluators|sdk|server|galileo|all]") + print("Usage: python scripts/build.py [models|evaluators|sdk|server|galileo|drift|all]") sys.exit(1)