diff --git a/evaluators/contrib/financial-governance/README.md b/evaluators/contrib/financial-governance/README.md new file mode 100644 index 00000000..78ea606c --- /dev/null +++ b/evaluators/contrib/financial-governance/README.md @@ -0,0 +1,220 @@ +# Financial Governance Evaluators for Agent Control + +Evaluators that enforce financial spend limits and transaction policies for autonomous AI agents. + +As agents transact autonomously via protocols like [x402](https://github.com/coinbase/x402) and payment layers like [agentpay-mcp](https://github.com/AI-Agent-Economy/agentpay-mcp), enterprises need governance over what agents spend. These evaluators bring financial policy enforcement into the Agent Control framework. + +## Evaluators + +### `financial_governance.spend_limit` + +Tracks cumulative agent spend and enforces rolling budget limits. Stateful — records approved transactions and checks new ones against accumulated spend. + +- **Per-transaction cap** — reject any single payment above a threshold +- **Rolling period budget** — reject payments that would exceed a time-windowed budget +- **Context-aware overrides** — different limits per channel, agent, or session via evaluate metadata +- **Pluggable storage** — abstract `SpendStore` protocol with built-in `InMemorySpendStore`; bring your own PostgreSQL, Redis, etc. + +### `financial_governance.transaction_policy` + +Static policy checks with no state tracking. Enforces structural rules on individual transactions. + +- **Currency allowlist** — only permit specific currencies (e.g., `["USDC", "USDT"]`) +- **Recipient blocklist/allowlist** — control which addresses an agent can pay +- **Amount bounds** — minimum and maximum per-transaction limits + +## Installation + +```bash +# From the repo root (development) — install directly from contrib path +cd evaluators/contrib/financial-governance +pip install -e ".[dev]" +``` + +> **Note:** This package is not yet wired into `agent-control-evaluators` extras. 
+> Install directly from the contrib path as shown above. + +## Configuration + +### Spend Limit + +```yaml +controls: + - name: spend-limit + evaluator: + type: financial_governance.spend_limit + config: + max_per_transaction: "100.00" # Max USDC per single payment (Decimal string) + max_per_period: "1000.00" # Rolling 24h budget + period_seconds: 86400 # Budget window (default: 24 hours) + currency: USDC # Currency to govern + selector: + path: input # Extract step.input (transaction dict) + action: deny +``` + +### Transaction Policy + +```yaml +controls: + - name: transaction-policy + evaluator: + type: financial_governance.transaction_policy + config: + allowed_currencies: [USDC, USDT] + blocked_recipients: ["0xDEAD..."] + allowed_recipients: ["0xALICE...", "0xBOB..."] + min_amount: "0.01" + max_amount: "5000.00" + selector: + path: input + action: deny +``` + +## Selector Paths + +Both evaluators support two selector configurations: + +- **`selector.path: "input"`** (recommended) — The evaluator receives `step.input` directly, which should be the transaction dict. Context fields (`channel`, `agent_id`, `session_id`) are merged from `step.context` into the transaction dict by the engine before evaluation. +- **`selector.path: "*"`** — The evaluator receives the full Step object. It automatically extracts `step.input` for transaction fields and `step.context` for channel/agent/session metadata. 
+ +## Input Data Schema + +The transaction dict (from `step.input`) should contain: + +```python +# step.input — transaction payload +{ + "amount": "50.00", # required — transaction amount (Decimal-compatible) + "currency": "USDC", # required — payment currency + "recipient": "0xABC...", # required — payment recipient +} +``` + +## Context-Aware Limits + +Context fields (`channel`, `agent_id`, `session_id`) and per-context limit overrides can be provided in two ways: + +**Option A: Via `step.context`** (recommended for engine integration) + +```python +step = Step( + type="tool", + name="payment", + input={"amount": "75.00", "currency": "USDC", "recipient": "0xABC"}, + context={ + "channel": "experimental", + "agent_id": "agent-42", + "channel_max_per_transaction": "50.00", + "channel_max_per_period": "200.00", + }, +) +``` + +When using `selector.path: "input"`, context fields (channel, agent_id, session_id) are merged from `step.context` into the transaction dict by the engine. When using `selector.path: "*"`, the evaluator merges `step.context` fields itself. + +**Option B: Inline in the transaction dict** (simpler, for direct SDK use) + +```python +result = await evaluator.evaluate({ + "amount": "75.00", + "currency": "USDC", + "recipient": "0xABC", + "channel": "experimental", + "channel_max_per_transaction": "50.00", + "channel_max_per_period": "200.00", +}) +``` + +Spend budgets are **scoped by context** — spend in channel A does not count against channel B's budget. When no context fields are present, budgets are global. + +## Custom SpendStore + +The `SpendStore` protocol requires two methods. 
Implement them for your backend: + +```python +from decimal import Decimal +from agent_control_evaluator_financial_governance.spend_limit import ( + SpendStore, + SpendLimitConfig, + SpendLimitEvaluator, +) + +class PostgresSpendStore: + """Example: PostgreSQL-backed spend tracking.""" + + def __init__(self, connection_string: str): + self._conn = connect(connection_string) + + def record_spend(self, amount: Decimal, currency: str, metadata: dict | None = None) -> None: + self._conn.execute( + "INSERT INTO agent_spend (amount, currency, metadata, recorded_at) VALUES (%s, %s, %s, NOW())", + (str(amount), currency, json.dumps(metadata)), + ) + + def get_spend( + self, + currency: str, + start: float, + end: float | None = None, + scope: dict[str, str] | None = None, + ) -> Decimal: + end_clause = "AND recorded_at <= to_timestamp(%s)" if end is not None else "" + params = [currency, start] + if end is not None: + params.append(end) + row = self._conn.execute( + f"SELECT COALESCE(SUM(amount), 0) FROM agent_spend " + f"WHERE currency = %s AND recorded_at >= to_timestamp(%s) {end_clause}", + params, + ).fetchone() + return Decimal(str(row[0])) + +# Use it: +store = PostgresSpendStore("postgresql://...") +evaluator = SpendLimitEvaluator(config, store=store) +``` + +## Error Handling + +Malformed or incomplete runtime payloads (missing `amount`, missing `currency`, non-numeric values, etc.) return `matched=False, error=None` — they are treated as non-matching transactions, not evaluator errors. The `error` field is reserved for evaluator infrastructure failures (crashes, timeouts, missing dependencies). + +## Running Tests + +```bash +cd evaluators/contrib/financial-governance +pip install -e ".[dev]" +pytest tests/ -v +``` + +## Design Decisions + +1. **Decimal for money** — All monetary amounts use `Decimal` to avoid float precision errors in financial calculations. +2. **Decoupled from data source** — The `SpendStore` protocol means no new tables in core Agent Control. 
Bring your own persistence. +3. **Context-aware limits** — Override keys in the evaluate data dict allow per-channel, per-agent, or per-session limits without multiple evaluator instances. +4. **Python SDK compatible** — Uses the standard evaluator interface; works with both the server and the Python SDK evaluation engine. +5. **Fail-open on malformed data** — Missing or invalid fields return `matched=False` with `error=None`, following Agent Control conventions. + +## Known Limitations + +### Race Condition (read-then-write is not atomic) +The spend-limit evaluator reads current period spend and then writes a new record as two separate operations. Under concurrent load this can allow transactions to slip through just above the budget. For hard enforcement use a `SpendStore` implementation that provides atomic `check_and_record` semantics (e.g., a Redis `MULTI`/`EXEC` block or a PostgreSQL `SELECT ... FOR UPDATE`). The `InMemorySpendStore` is thread-safe within a single process but does not provide atomic check-and-record. + +### Tuple-Scoped Budgets +When context fields (`channel`, `agent_id`, `session_id`) are all present, they form a **single composite scope key** — not independent per-dimension budgets. For example, a scope of `{"channel": "A", "agent_id": "bot-1"}` matches only records that have *both* `channel=="A"` AND `agent_id=="bot-1"`. To enforce truly independent per-channel and per-agent budgets you would need separate `get_spend()` calls with separate scope dicts. + +### Package Not Yet in Extras +This package is not yet wired into the `agent-control-evaluators` extras install target. 
Install directly from the contrib path: + +```bash +pip install -e "evaluators/contrib/financial-governance" +``` + +## Related Projects + +- [x402](https://github.com/coinbase/x402) — HTTP 402 payment protocol +- [agentpay-mcp](https://github.com/up2itnow0822/agentpay-mcp) — MCP server for non-custodial agent payments + +## License + +Apache-2.0 — see [LICENSE](../../../LICENSE). diff --git a/evaluators/contrib/financial-governance/pyproject.toml b/evaluators/contrib/financial-governance/pyproject.toml new file mode 100644 index 00000000..c833a911 --- /dev/null +++ b/evaluators/contrib/financial-governance/pyproject.toml @@ -0,0 +1,55 @@ +[project] +name = "agent-control-evaluator-financial-governance" +version = "0.1.0" +description = "Financial governance evaluators for agent-control — spend limits and transaction policy enforcement" +readme = "README.md" +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "agent-control contributors" }] +keywords = ["agent-control", "evaluator", "financial", "spend-limit", "x402", "agentpay"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries", +] +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "pytest-cov>=4.0.0", + "ruff>=0.1.0", + "mypy>=1.8.0", +] + +[project.entry-points."agent_control.evaluators"] +"financial_governance.spend_limit" = "agent_control_evaluator_financial_governance.spend_limit:SpendLimitEvaluator" +"financial_governance.transaction_policy" = "agent_control_evaluator_financial_governance.transaction_policy:TransactionPolicyEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + 
+[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_financial_governance"] + +[tool.ruff] +line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" + +[tool.uv.sources] +agent-control-evaluators = { path = "../../builtin", editable = true } +agent-control-models = { path = "../../../models", editable = true } diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py new file mode 100644 index 00000000..fd8c87cb --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py @@ -0,0 +1,55 @@ +"""Financial governance evaluators for agent-control. + +Provides two evaluators for enforcing financial policy on AI agent transactions: + +- ``financial_governance.spend_limit``: Tracks cumulative spend against rolling + period budgets and per-transaction caps. +- ``financial_governance.transaction_policy``: Static policy checks — allowlists, + blocklists, amount bounds, and permitted currencies. + +Both evaluators are registered automatically when this package is installed and +the ``agent_control.evaluators`` entry point group is discovered. + +Example usage in an agent-control control config:: + + { + "condition": { + "selector": {"path": "input"}, + "evaluator": { + "name": "financial_governance.spend_limit", + "config": { + "max_per_transaction": "100.00", + "max_per_period": "1000.00", + "period_seconds": 86400, + "currency": "USDC" + } + } + }, + "action": {"decision": "deny"} + } + +Note on ``selector.path``: + Use ``selector.path: "input"`` (recommended) to pass ``step.input`` + directly as the transaction dict. 
Context fields (``channel``, + ``agent_id``, ``session_id``) are merged from ``step.context`` into + the transaction dict by the engine before evaluation. + + Use ``selector.path: "*"`` to pass the full Step object; the evaluator + will extract ``step.input`` and merge ``step.context`` fields itself. +""" + +from agent_control_evaluator_financial_governance.spend_limit import ( + SpendLimitConfig, + SpendLimitEvaluator, +) +from agent_control_evaluator_financial_governance.transaction_policy import ( + TransactionPolicyConfig, + TransactionPolicyEvaluator, +) + +__all__ = [ + "SpendLimitEvaluator", + "SpendLimitConfig", + "TransactionPolicyEvaluator", + "TransactionPolicyConfig", +] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py new file mode 100644 index 00000000..cebe9fc7 --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py @@ -0,0 +1,12 @@ +"""Spend-limit evaluator package.""" + +from .config import SpendLimitConfig +from .evaluator import SpendLimitEvaluator +from .store import InMemorySpendStore, SpendStore + +__all__ = [ + "SpendLimitEvaluator", + "SpendLimitConfig", + "SpendStore", + "InMemorySpendStore", +] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py new file mode 100644 index 00000000..dad7de01 --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py @@ -0,0 +1,71 @@ +"""Configuration model for the spend-limit evaluator.""" + +from __future__ import annotations + +from decimal import Decimal + +from pydantic import Field, 
class SpendLimitConfig(EvaluatorConfig):
    """Configuration for :class:`~.evaluator.SpendLimitEvaluator`.

    All monetary fields are expressed in the units of *currency*.

    Attributes:
        max_per_transaction: Hard cap on any single transaction amount. A
            transaction whose ``amount`` exceeds this value is blocked
            regardless of accumulated period spend. Set to ``Decimal("0")``
            to disable.
        max_per_period: Maximum total spend allowed within the rolling
            *period_seconds* window. Set to ``Decimal("0")`` to disable.
        period_seconds: Length of the rolling budget window in seconds.
            Defaults to ``86400`` (24 hours).
        currency: Currency symbol this policy applies to (e.g. ``"USDC"``).
            Stored trimmed and upper-cased. Transactions whose currency does
            not match are passed through as *not matched* (i.e. allowed).

    Example config dict::

        {
            "max_per_transaction": "500.00",
            "max_per_period": "5000.00",
            "period_seconds": 86400,
            "currency": "USDC"
        }
    """

    max_per_transaction: Decimal = Field(
        default=Decimal("0"),
        ge=0,
        description=(
            "Per-transaction spend cap in *currency* units. "
            "0 means no per-transaction limit."
        ),
    )
    max_per_period: Decimal = Field(
        default=Decimal("0"),
        ge=0,
        description=(
            "Maximum cumulative spend allowed in the rolling period window. "
            "0 means no period limit."
        ),
    )
    period_seconds: int = Field(
        default=86_400,
        ge=1,
        description="Rolling budget window length in seconds (default: 86400 = 24 h).",
    )
    currency: str = Field(
        ...,
        min_length=1,
        description="Currency symbol this policy applies to (e.g. 'USDC', 'ETH').",
    )

    @field_validator("currency")
    @classmethod
    def normalize_currency(cls, v: str) -> str:
        """Trim and upper-case the currency symbol for consistent comparison.

        Raises:
            ValueError: If the symbol is blank once surrounding whitespace is
                removed. ``min_length=1`` alone would accept ``" "``, which
                could never equal a transaction currency and would silently
                turn the policy into a no-op.
        """
        normalized = v.strip().upper()
        if not normalized:
            raise ValueError("currency must not be blank")
        return normalized
'USDC', 'ETH').", + ) + + @field_validator("currency") + @classmethod + def normalize_currency(cls, v: str) -> str: + """Normalize currency symbol to upper-case for consistent comparison.""" + return v.upper() diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py new file mode 100644 index 00000000..531f3aec --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py @@ -0,0 +1,358 @@ +"""Spend-limit evaluator — tracks cumulative agent spend against rolling budgets.""" + +from __future__ import annotations + +import time +from decimal import Decimal, InvalidOperation +from typing import Any + +from agent_control_evaluators import ( + Evaluator, + EvaluatorMetadata, + register_evaluator, +) +from agent_control_models import EvaluatorResult + +from .config import SpendLimitConfig +from .store import InMemorySpendStore, SpendStore + + +def _extract_decimal(data: dict[str, Any], key: str) -> Decimal | None: + """Safely extract a Decimal value from *data* by *key*. + + Accepts int, float, str, and Decimal inputs. Returns None if the key is + missing or the value cannot be converted. + """ + raw = data.get(key) + if raw is None: + return None + try: + return Decimal(str(raw)) + except (InvalidOperation, TypeError, ValueError): + return None + + +@register_evaluator +class SpendLimitEvaluator(Evaluator[SpendLimitConfig]): + """Evaluator that enforces per-transaction and rolling-period spend limits. + + ``matched=True`` means the transaction **violates** the configured limits + and should be blocked. ``matched=False`` means the transaction is within + budget and may proceed. + + Thread safety: + The evaluator itself is stateless. All mutable state lives in the + injected :class:`~.store.SpendStore`. 
@register_evaluator
class SpendLimitEvaluator(Evaluator[SpendLimitConfig]):
    """Evaluator that enforces per-transaction and rolling-period spend limits.

    ``matched=True`` means the transaction **violates** the configured limits
    and should be blocked. ``matched=False`` means the transaction is within
    budget and may proceed (or could not be evaluated at all — malformed
    payloads are reported as non-matching, never via ``error``).

    Thread safety:
        The evaluator keeps no per-request state on ``self``. All mutable
        state lives in the injected :class:`~.store.SpendStore`.

    Context-aware limits:
        The transaction dict may carry ``channel_max_per_transaction`` and/or
        ``channel_max_per_period``. A valid (finite, non-negative) override
        replaces the corresponding config value for that call; an invalid
        override is ignored and the configured limit applies.

    Args:
        config: Validated :class:`SpendLimitConfig`.
        store: Optional :class:`SpendStore` implementation. Defaults to a new
            :class:`InMemorySpendStore` when not provided.

    Input ``data`` schema::

        {
            "amount":   Decimal | float | str,  # required — transaction amount
            "currency": str,                    # required — payment currency
            "recipient": str,                   # recipient address/identifier
            # optional context fields
            "channel":    str,
            "agent_id":   str,
            "session_id": str,
            # optional per-call limit overrides
            "channel_max_per_transaction": Decimal | float | str,
            "channel_max_per_period":      Decimal | float | str,
        }

    Example::

        from agent_control_evaluator_financial_governance.spend_limit import (
            SpendLimitConfig,
            SpendLimitEvaluator,
        )
        from decimal import Decimal

        config = SpendLimitConfig(
            max_per_transaction=Decimal("100"),
            max_per_period=Decimal("1000"),
            period_seconds=86400,
            currency="USDC",
        )
        evaluator = SpendLimitEvaluator(config)
        result = await evaluator.evaluate({
            "amount": "50.00",
            "currency": "USDC",
            "recipient": "0xABC...",
        })
        # result.matched == False  →  transaction is within limits
    """

    metadata = EvaluatorMetadata(
        name="financial_governance.spend_limit",
        version="0.1.0",
        description=(
            "Tracks cumulative agent spend and enforces per-transaction caps "
            "and rolling period budgets. Supports pluggable SpendStore backends."
        ),
    )
    config_model = SpendLimitConfig

    # Context keys that isolate budgets, and keys that override limits per call.
    _CONTEXT_KEYS = ("channel", "agent_id", "session_id")
    _OVERRIDE_KEYS = ("channel_max_per_transaction", "channel_max_per_period")

    def __init__(
        self,
        config: SpendLimitConfig,
        store: SpendStore | None = None,
    ) -> None:
        super().__init__(config)
        self._store: SpendStore = store if store is not None else InMemorySpendStore()

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_data(data: Any) -> tuple[dict[str, Any] | None, dict[str, Any]]:
        """Extract the transaction dict and step context from selector output.

        Two input shapes are recognised:

        - A dict containing both ``"type"`` and ``"input"`` keys is treated as
          a Step dict. ``input`` becomes the transaction; context and override
          keys found in ``context`` are copied in unless the transaction
          already defines them (transaction fields win).
        - Any other dict is treated as the transaction dict itself.

        Returns:
            ``(tx_data, step_context)``. ``tx_data`` is ``None`` when *data*
            is not a dict or the Step's ``input`` is not a dict.
            ``step_context`` is the Step's ``context`` dict, or ``{}``.
        """
        if not isinstance(data, dict):
            return None, {}

        if "type" not in data or "input" not in data:
            # Plain transaction dict (selector.path: "input").
            return data, {}

        tx = data.get("input")
        raw_ctx = data.get("context") or {}
        ctx: dict[str, Any] = raw_ctx if isinstance(raw_ctx, dict) else {}
        if not isinstance(tx, dict):
            return None, ctx

        merged = dict(tx)
        for key in SpendLimitEvaluator._CONTEXT_KEYS + SpendLimitEvaluator._OVERRIDE_KEYS:
            if key in ctx and key not in merged:
                merged[key] = ctx[key]
        return merged, ctx

    @staticmethod
    def _resolve_limit(data: dict[str, Any], key: str, default: Decimal) -> Decimal:
        """Return the override stored under *key*, or *default* if unusable.

        An override is only honoured when it parses to a finite, non-negative
        Decimal. NaN would make the later comparison raise, and a negative
        value would silently disable the limit, so both fall back to config.
        """
        override = _extract_decimal(data, key)
        if override is None or not override.is_finite() or override < Decimal("0"):
            return default
        return override

    @staticmethod
    def _build_scope(data: dict[str, Any]) -> dict[str, str] | None:
        """Build the composite budget scope from the context keys present.

        All present, non-``None`` context keys together form one scope; values
        are stringified so the scope used for querying and the metadata used
        for recording always compare equal.
        """
        scope = {
            key: str(data[key])
            for key in SpendLimitEvaluator._CONTEXT_KEYS
            if data.get(key) is not None
        }
        return scope or None

    # ------------------------------------------------------------------
    # Main evaluation entry point
    # ------------------------------------------------------------------

    async def evaluate(self, data: Any) -> EvaluatorResult:
        """Evaluate a transaction against configured spend limits.

        Args:
            data: Transaction dict, or a Step dict with ``input`` and
                ``context`` sub-keys. Transaction fields: ``amount``,
                ``currency``, ``recipient``.

        Returns:
            ``EvaluatorResult`` where ``matched=True`` indicates a limit
            violation (transaction should be denied). Approved transactions
            are recorded in the store before returning.
        """
        if data is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="No transaction data provided; skipping spend-limit check",
            )

        tx_data, _step_ctx = self._normalize_data(data)
        if tx_data is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=(
                    "Could not extract transaction data from selector output; "
                    "skipping spend-limit check"
                ),
            )
        data = tx_data

        # ---- Required fields ----
        # Malformed payloads are normal "does not match" results; ``error`` is
        # left unset because nothing went wrong inside the evaluator itself.
        amount = _extract_decimal(data, "amount")
        if amount is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'amount'; cannot evaluate",
            )
        # Checked before any ordering comparison: comparing NaN raises.
        if not amount.is_finite():
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=(
                    f"Transaction amount must be a finite number, got {amount}; "
                    "cannot evaluate"
                ),
            )
        if amount <= Decimal("0"):
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=f"Transaction amount must be positive, got {amount}; cannot evaluate",
            )

        # An explicit None must count as missing, not become the text "NONE".
        raw_currency = data.get("currency")
        tx_currency = "" if raw_currency is None else str(raw_currency).strip().upper()
        if not tx_currency:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'currency'; cannot evaluate",
            )

        raw_recipient = data.get("recipient")
        recipient = "" if raw_recipient is None else str(raw_recipient).strip()

        # ---- Currency filter — only enforce policy for configured currency ----
        if tx_currency != self.config.currency:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=(
                    f"Transaction currency '{tx_currency}' does not match policy "
                    f"currency '{self.config.currency}'; skipping"
                ),
                metadata={"tx_currency": tx_currency, "policy_currency": self.config.currency},
            )

        # ---- Resolve effective limits ----
        # NOTE(review): overrides are read from the transaction payload, so
        # whoever controls that payload can relax (or zero out) the limits —
        # confirm the payload is trusted in your deployment.
        effective_max_per_tx = self._resolve_limit(
            data, "channel_max_per_transaction", self.config.max_per_transaction
        )
        effective_max_per_period = self._resolve_limit(
            data, "channel_max_per_period", self.config.max_per_period
        )

        # ---- Per-transaction cap (0 disables) ----
        if effective_max_per_tx > Decimal("0") and amount > effective_max_per_tx:
            return EvaluatorResult(
                matched=True,
                confidence=1.0,
                message=(
                    f"Transaction amount {amount} {tx_currency} exceeds per-transaction "
                    f"cap of {effective_max_per_tx} {tx_currency}"
                ),
                metadata={
                    "violation": "per_transaction_cap",
                    "amount": float(amount),
                    "max_per_transaction": float(effective_max_per_tx),
                    "currency": tx_currency,
                    "recipient": recipient,
                },
            )

        # The same scope is used for the query below and for the recorded
        # metadata, so a later query is guaranteed to find this record.
        scope = self._build_scope(data)

        # ---- Rolling period budget (0 disables) ----
        if effective_max_per_period > Decimal("0"):
            since = time.time() - self.config.period_seconds
            period_spend = self._store.get_spend(tx_currency, since, scope=scope)
            projected = period_spend + amount

            if projected > effective_max_per_period:
                return EvaluatorResult(
                    matched=True,
                    confidence=1.0,
                    message=(
                        f"Transaction would bring period spend to "
                        f"{projected} {tx_currency}, exceeding the "
                        f"{self.config.period_seconds}s budget of "
                        f"{effective_max_per_period} {tx_currency} "
                        f"(current period spend: {period_spend})"
                    ),
                    metadata={
                        "violation": "period_budget",
                        "amount": float(amount),
                        "current_period_spend": float(period_spend),
                        "projected_period_spend": float(projected),
                        "max_per_period": float(effective_max_per_period),
                        "period_seconds": self.config.period_seconds,
                        "currency": tx_currency,
                        "recipient": recipient,
                    },
                )

        # ---- Transaction is within limits — record it ----
        # The read above and this write are separate store calls, so this
        # check-then-record sequence is not atomic.
        spend_metadata: dict[str, Any] = dict(scope) if scope else {}
        spend_metadata["recipient"] = recipient

        self._store.record_spend(
            amount=amount,
            currency=tx_currency,
            metadata=spend_metadata,
        )

        return EvaluatorResult(
            matched=False,
            confidence=1.0,
            message=(
                f"Transaction of {amount} {tx_currency} to '{recipient}' is within limits"
            ),
            metadata={
                "amount": float(amount),
                "currency": tx_currency,
                "recipient": recipient,
            },
        )
@runtime_checkable
class SpendStore(Protocol):
    """Protocol that all spend store implementations must satisfy.

    Implementations are free to choose any persistence mechanism (in-memory,
    Redis, PostgreSQL, …). Both methods must be thread-safe.
    """

    def record_spend(
        self,
        amount: Decimal,
        currency: str,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Persist a completed (or pending) spend record.

        Args:
            amount: Positive monetary amount that was spent (Decimal for
                precision — never use float for money).
            currency: ISO-4217 or token symbol (e.g. ``"USDC"``).
            metadata: Optional key-value bag for agent_id, session_id, etc.
                These are the fields that ``get_spend`` filters on via
                *scope*.
        """
        ...

    def get_spend(
        self,
        currency: str,
        start: float,
        end: float | None = None,
        scope: dict[str, str] | None = None,
    ) -> Decimal:
        """Return total spend for *currency* within the given time range.

        Args:
            currency: Currency symbol to query (case-sensitive).
            start: Unix timestamp (seconds, inclusive lower bound). Only
                records whose ``recorded_at`` is >= this value are included.
            end: Unix timestamp (seconds, inclusive upper bound). When
                ``None`` (the default), all records up to "now" are included.
                This makes the API backward-compatible: callers that only pass
                ``start`` get the same behaviour as before.
            scope: Optional key-value pairs to filter by metadata fields.
                For example, ``{"channel": "slack"}`` returns only spend
                recorded with that channel in metadata. When None, returns
                all spend regardless of metadata.

                **Scope semantics:**
                A record matches when its metadata contains *every* key-value
                pair in *scope*; metadata keys that are not named in *scope*
                are ignored. A record with
                ``{"channel": "A", "agent_id": "bot-1"}`` therefore matches
                both ``{"channel": "A", "agent_id": "bot-1"}`` and
                ``{"channel": "A"}`` alone, but not ``{"channel": "B"}``.
                This is how the built-in ``InMemorySpendStore`` filters.
                Passing several keys narrows the query to one composite
                budget; to enforce independent per-channel and per-agent
                budgets, issue separate ``get_spend`` calls with separate
                single-key scope dicts.

        Returns:
            Sum of all matching spend amounts as a Decimal. Returns
            ``Decimal("0")`` when no records match.
        """
        ...
When + ``None`` (the default), all records up to "now" are included. + This makes the API backward-compatible: callers that only pass + ``start`` get the same behaviour as before. + scope: Optional key-value pairs to filter by metadata fields. + For example, ``{"channel": "slack"}`` returns only spend + recorded with that channel in metadata. When None, returns + all spend regardless of metadata. + + **Scope semantics (tuple-scoped budgets):** + All present keys together form a single composite scope key. + A record with ``{"channel": "A", "agent_id": "bot-1"}`` will + only match a scope of ``{"channel": "A", "agent_id": "bot-1"}`` + — NOT a query for ``{"channel": "A"}`` alone. This means + channel+agent_id+session_id form one combined scope, not + independent budgets. To enforce independent per-channel and + per-agent budgets you would need separate ``get_spend`` calls + with separate scope dicts. + + Returns: + Sum of all matching spend amounts as a Decimal. Returns + ``Decimal("0")`` when no records match. + """ + ... + + +class _SpendRecord: + """Internal record stored by :class:`InMemorySpendStore`.""" + + __slots__ = ("amount", "currency", "recorded_at", "metadata") + + def __init__( + self, + amount: Decimal, + currency: str, + recorded_at: float, + metadata: dict[str, Any] | None, + ) -> None: + self.amount = amount + self.currency = currency + self.recorded_at = recorded_at + self.metadata = metadata + + def matches_scope(self, scope: dict[str, str]) -> bool: + """Check if this record's metadata matches all scope key-value pairs.""" + if not self.metadata: + return False + return all( + self.metadata.get(k) == v + for k, v in scope.items() + ) + + +class InMemorySpendStore: + """Thread-safe in-memory implementation of :class:`SpendStore`. + + Records are kept in a ``deque`` ordered by insertion time. A background + sweep prunes records older than *max_age_seconds* to prevent unbounded + memory growth. 
+ + This implementation is **not** suitable for multi-process or distributed + deployments because each process maintains an independent ledger. Use it + for single-process services, local development, and tests. + + Args: + max_age_seconds: Records older than this many seconds are eligible for + pruning. Defaults to 7 days (604 800 s). + """ + + def __init__(self, max_age_seconds: int = 604_800) -> None: + self._max_age_seconds = max_age_seconds + self._records: deque[_SpendRecord] = deque() + self._lock = Lock() + + # ------------------------------------------------------------------ + # SpendStore protocol implementation + # ------------------------------------------------------------------ + + def record_spend( + self, + amount: Decimal, + currency: str, + metadata: dict[str, Any] | None = None, + ) -> None: + """Record a spend event at the current wall-clock time. + + Args: + amount: Positive monetary amount (Decimal). + currency: Currency symbol (e.g. ``"USDC"``). + metadata: Optional context bag (agent_id, session_id, channel, …). + """ + if amount <= Decimal("0"): + raise ValueError(f"amount must be positive, got {amount!r}") + + now = time.time() + record = _SpendRecord( + amount=amount, + currency=currency, + recorded_at=now, + metadata=metadata, + ) + with self._lock: + self._records.append(record) + self._prune_locked(now) + + def get_spend( + self, + currency: str, + start: float, + end: float | None = None, + scope: dict[str, str] | None = None, + ) -> Decimal: + """Sum all spend for *currency* in the time range [start, end]. + + Args: + currency: Currency symbol (case-sensitive). + start: Unix epoch seconds (inclusive lower bound). + end: Unix epoch seconds (inclusive upper bound). ``None`` means + "up to now" — no upper bound is applied. + scope: Optional metadata filter. When provided, only records + whose metadata contains all specified key-value pairs are + included. See :class:`SpendStore` for scope tuple semantics. 
+ + Returns: + Total spend as a Decimal. + """ + with self._lock: + total = Decimal("0") + for r in self._records: + if r.currency != currency: + continue + if r.recorded_at < start: + continue + if end is not None and r.recorded_at > end: + continue + if scope is not None and not r.matches_scope(scope): + continue + total += r.amount + return total + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _prune_locked(self, now: float) -> None: + """Remove records older than *max_age_seconds* (called with lock held).""" + cutoff = now - self._max_age_seconds + while self._records and self._records[0].recorded_at < cutoff: + self._records.popleft() + + def record_count(self) -> int: + """Return the current number of stored records (useful for tests).""" + with self._lock: + return len(self._records) diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/__init__.py new file mode 100644 index 00000000..693b8ccc --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/__init__.py @@ -0,0 +1,9 @@ +"""Transaction-policy evaluator package.""" + +from .config import TransactionPolicyConfig +from .evaluator import TransactionPolicyEvaluator + +__all__ = [ + "TransactionPolicyEvaluator", + "TransactionPolicyConfig", +] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py new file mode 100644 index 00000000..ef55201f --- /dev/null +++ 
class TransactionPolicyConfig(EvaluatorConfig):
    """Configuration for :class:`~.evaluator.TransactionPolicyEvaluator`.

    All list fields default to empty lists (no restriction applied).  A field
    is only enforced when it contains at least one entry.

    Attributes:
        allowed_recipients: If non-empty, **only** recipients in this list are
            permitted.  Transactions to any other address are blocked.
        blocked_recipients: Recipients that are explicitly prohibited.  Checked
            before ``allowed_recipients``.
        min_amount: Minimum transaction amount (inclusive).  ``Decimal("0")``
            disables the lower bound check.
        max_amount: Maximum transaction amount (inclusive).  ``Decimal("0")``
            disables the upper bound check.
        allowed_currencies: If non-empty, **only** currencies in this list are
            permitted.  Entries are upper-cased on load.

    Example config dict::

        {
            "allowed_recipients": ["0xABC...", "0xDEF..."],
            "blocked_recipients": ["0xDEAD..."],
            "min_amount": "0.01",
            "max_amount": "10000.00",
            "allowed_currencies": ["USDC", "USDT"]
        }
    """

    allowed_recipients: list[str] = Field(
        default_factory=list,
        description=(
            "Allowlisted recipient addresses. When non-empty, only these "
            "recipients are permitted."
        ),
    )
    blocked_recipients: list[str] = Field(
        default_factory=list,
        description="Blocklisted recipient addresses that are always denied.",
    )
    min_amount: Decimal = Field(
        default=Decimal("0"),
        ge=0,
        description="Minimum transaction amount (inclusive). 0 = no minimum.",
    )
    max_amount: Decimal = Field(
        default=Decimal("0"),
        ge=0,
        description="Maximum transaction amount (inclusive). 0 = no maximum.",
    )
    allowed_currencies: list[str] = Field(
        default_factory=list,
        description=(
            "Permitted currency symbols. When non-empty, only these "
            "currencies are accepted."
        ),
    )

    @field_validator("allowed_currencies", mode="before")
    @classmethod
    def normalize_currencies(cls, v: Any) -> Any:
        """Upper-case every string entry of the currency list.

        This runs before field validation, so *v* is raw input.  Anything that
        is not a list, and any list entry that is not a string, is passed
        through untouched so the regular ``list[str]`` validation reports it;
        calling ``.upper()`` on such an entry would raise a bare
        ``AttributeError`` instead of a validation error.
        """
        if not isinstance(v, list):
            return v
        return [c.upper() if isinstance(c, str) else c for c in v]

    @model_validator(mode="after")
    def validate_amount_bounds(self) -> TransactionPolicyConfig:
        """Ensure max_amount >= min_amount when both are non-zero."""
        zero = Decimal("0")
        # A bound of 0 means "disabled", so the ordering is only checked when
        # both bounds are actually in force.
        both_enabled = self.max_amount > zero and self.min_amount > zero
        if both_enabled and self.max_amount < self.min_amount:
            raise ValueError(
                f"max_amount ({self.max_amount}) must be >= min_amount ({self.min_amount})"
            )
        return self
@register_evaluator
class TransactionPolicyEvaluator(Evaluator[TransactionPolicyConfig]):
    """Stateless evaluator for static transaction policy checks.

    Checks are applied in this order (first violation wins):

    1. Currency allowlist (if configured)
    2. Recipient blocklist
    3. Recipient allowlist (if configured)
    4. Minimum amount bound
    5. Maximum amount bound

    ``matched=True`` means the transaction **violates** the policy and should be
    blocked.  ``matched=False`` means the transaction passed all checks, or
    that the input was missing/malformed and the check was skipped.

    Thread safety:
        This evaluator has no mutable instance state.  Concurrent calls to
        :meth:`evaluate` are safe.

    Input ``data`` schema::

        {
            "amount": Decimal | float | str,  # required — transaction amount
            "currency": str,                  # required — payment currency
            "recipient": str,                 # required — recipient address/identifier
            # optional context fields (logged in result metadata)
            "channel": str,
            "agent_id": str,
            "session_id": str
        }

    Example::

        from agent_control_evaluator_financial_governance.transaction_policy import (
            TransactionPolicyConfig,
            TransactionPolicyEvaluator,
        )
        from decimal import Decimal

        config = TransactionPolicyConfig(
            allowed_currencies=["USDC", "USDT"],
            blocked_recipients=["0xDEAD..."],
            max_amount=Decimal("5000"),
        )
        evaluator = TransactionPolicyEvaluator(config)
        result = await evaluator.evaluate({
            "amount": "100.00",
            "currency": "USDC",
            "recipient": "0xABC...",
        })
        # result.matched == False  → transaction passes all policy checks
    """

    metadata = EvaluatorMetadata(
        name="financial_governance.transaction_policy",
        version="0.1.0",
        description=(
            "Static transaction policy enforcement: recipient allowlists/blocklists, "
            "amount bounds, and currency restrictions. No state tracking."
        ),
    )
    config_model = TransactionPolicyConfig

    @staticmethod
    def _normalize_data(data: Any) -> dict[str, Any] | None:
        """Extract transaction fields from selector output.

        Handles ``selector.path: "input"`` (data is the transaction dict)
        and ``selector.path: "*"`` (data is the full Step dict, recognised by
        the presence of both ``"type"`` and ``"input"`` keys).

        Returns:
            The transaction dict, or ``None`` when *data* (or the Step's
            ``input``) is not a dict.
        """
        if not isinstance(data, dict):
            return None
        if "type" in data and "input" in data:
            tx = data.get("input")
            ctx = data.get("context") or {}
            if not isinstance(tx, dict):
                return None
            merged = {**tx}
            if isinstance(ctx, dict):
                # Context only fills gaps: a key already present in the
                # transaction input is never overwritten.
                for k in ("channel", "agent_id", "session_id"):
                    if k in ctx and k not in merged:
                        merged[k] = ctx[k]
            return merged
        return data

    @staticmethod
    def _parse_amount(amount_raw: Any) -> Decimal | None:
        """Convert *amount_raw* to a Decimal, or return None if it is not a number.

        NaN (quiet or signaling) is rejected here: ``Decimal("NaN")`` parses
        successfully, but the ordering comparisons used by the bound checks
        raise ``InvalidOperation`` on it.  Infinite values are kept — they
        compare normally and are caught by the min/max checks when configured.
        """
        try:
            amount = Decimal(str(amount_raw))
        except (InvalidOperation, TypeError, ValueError):
            return None
        if amount.is_nan():
            return None
        return amount

    async def evaluate(self, data: Any) -> EvaluatorResult:
        """Evaluate a transaction against the static policy.

        Args:
            data: Transaction dict (when ``selector.path`` is ``"input"``)
                or full Step dict (when path is ``"*"``).

        Returns:
            ``EvaluatorResult`` where ``matched=True`` indicates a policy
            violation (transaction should be denied).  Missing or malformed
            input yields ``matched=False`` with an explanatory message.
        """
        if data is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="No transaction data provided; skipping policy check",
            )

        tx_data = self._normalize_data(data)
        if tx_data is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Could not extract transaction data from selector output; skipping",
            )

        # Use normalized transaction dict for the rest of evaluate
        data = tx_data

        # ---- Extract and validate required fields ----
        currency_raw = data.get("currency")
        if not currency_raw:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'currency'",
            )
        currency: str = str(currency_raw).upper()

        recipient_raw = data.get("recipient")
        if not recipient_raw:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'recipient'",
            )
        recipient: str = str(recipient_raw).strip()

        amount_raw = data.get("amount")
        if amount_raw is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'amount'",
            )
        amount = self._parse_amount(amount_raw)
        if amount is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=f"Transaction 'amount' is not numeric: {amount_raw!r}",
            )

        # Build shared metadata for result context
        base_meta: dict[str, Any] = {
            "amount": float(amount),
            "currency": currency,
            "recipient": recipient,
        }
        for ctx_key in ("channel", "agent_id", "session_id"):
            if ctx_key in data and data[ctx_key] is not None:
                base_meta[ctx_key] = data[ctx_key]

        # ---- Check 1: Currency allowlist ----
        if self.config.allowed_currencies:
            if currency not in self.config.allowed_currencies:
                return EvaluatorResult(
                    matched=True,
                    confidence=1.0,
                    message=(
                        f"Currency '{currency}' is not in the allowed currencies list: "
                        f"{self.config.allowed_currencies}"
                    ),
                    metadata={
                        **base_meta,
                        "violation": "currency_not_allowed",
                        "allowed_currencies": self.config.allowed_currencies,
                    },
                )

        # ---- Check 2: Recipient blocklist ----
        # NOTE(review): recipient comparison is an exact, case-sensitive string
        # match — identifiers that differ only in letter case will not match
        # a list entry.  Confirm this is intended for hex-style addresses.
        if self.config.blocked_recipients and recipient in self.config.blocked_recipients:
            return EvaluatorResult(
                matched=True,
                confidence=1.0,
                message=f"Recipient '{recipient}' is on the blocklist",
                metadata={
                    **base_meta,
                    "violation": "recipient_blocked",
                },
            )

        # ---- Check 3: Recipient allowlist ----
        if self.config.allowed_recipients:
            if recipient not in self.config.allowed_recipients:
                return EvaluatorResult(
                    matched=True,
                    confidence=1.0,
                    message=(
                        f"Recipient '{recipient}' is not in the allowed recipients list"
                    ),
                    metadata={
                        **base_meta,
                        "violation": "recipient_not_allowed",
                    },
                )

        # ---- Check 4: Minimum amount ----
        if self.config.min_amount > Decimal("0") and amount < self.config.min_amount:
            return EvaluatorResult(
                matched=True,
                confidence=1.0,
                message=(
                    f"Transaction amount {amount} {currency} is below the minimum "
                    f"of {self.config.min_amount} {currency}"
                ),
                metadata={
                    **base_meta,
                    "violation": "amount_below_minimum",
                    "min_amount": float(self.config.min_amount),
                },
            )

        # ---- Check 5: Maximum amount ----
        if self.config.max_amount > Decimal("0") and amount > self.config.max_amount:
            return EvaluatorResult(
                matched=True,
                confidence=1.0,
                message=(
                    f"Transaction amount {amount} {currency} exceeds the maximum "
                    f"of {self.config.max_amount} {currency}"
                ),
                metadata={
                    **base_meta,
                    "violation": "amount_exceeds_maximum",
                    "max_amount": float(self.config.max_amount),
                },
            )

        # ---- All checks passed ----
        return EvaluatorResult(
            matched=False,
            confidence=1.0,
            message=(
                f"Transaction of {amount} {currency} to '{recipient}' "
                "passed all policy checks"
            ),
            metadata=base_meta,
        )
# ---------------------------------------------------------------------------
# InMemorySpendStore unit tests
# ---------------------------------------------------------------------------


def test_store_record_and_query() -> None:
    """Basic record/query round-trip."""
    store = InMemorySpendStore()
    since = time.time() - 1  # slightly in the past

    store.record_spend(Decimal("100"), "USDC")
    store.record_spend(Decimal("50"), "USDC")
    store.record_spend(Decimal("200"), "ETH")  # different currency — should not be counted

    assert store.get_spend("USDC", since) == Decimal("150")
    assert store.get_spend("ETH", since) == Decimal("200")
    assert store.get_spend("USDT", since) == Decimal("0")


def test_store_since_timestamp_filters_old_records() -> None:
    """Records before since_timestamp are excluded from get_spend."""
    store = InMemorySpendStore()

    store.record_spend(Decimal("1000"), "USDC")
    future_since = time.time() + 1  # everything is "before" this

    assert store.get_spend("USDC", future_since) == Decimal("0")


def test_store_end_timestamp_filters_future_records() -> None:
    """Records after end timestamp are excluded from get_spend."""
    store = InMemorySpendStore()
    past_end = time.time() - 1  # record is after this

    store.record_spend(Decimal("100"), "USDC")

    # With end in the past, the just-recorded spend should be excluded
    assert store.get_spend("USDC", time.time() - 10, end=past_end) == Decimal("0")


def test_store_end_none_includes_all_current_records() -> None:
    """end=None means no upper bound — current records are included."""
    store = InMemorySpendStore()
    store.record_spend(Decimal("100"), "USDC")

    # end=None is the default — should include the record
    assert store.get_spend("USDC", time.time() - 5) == Decimal("100")


def test_store_record_count() -> None:
    """record_count reflects the number of recorded spends."""
    store = InMemorySpendStore()
    assert store.record_count() == 0
    store.record_spend(Decimal("1"), "USDC")
    store.record_spend(Decimal("2"), "USDC")
    assert store.record_count() == 2


def test_store_rejects_non_positive_amount() -> None:
    """Zero and negative amounts are rejected with ValueError."""
    store = InMemorySpendStore()
    with pytest.raises(ValueError, match="amount must be positive"):
        store.record_spend(Decimal("0"), "USDC")
    with pytest.raises(ValueError, match="amount must be positive"):
        store.record_spend(Decimal("-5"), "USDC")


def test_store_metadata_accepted() -> None:
    """Metadata kwarg is stored without error."""
    store = InMemorySpendStore()
    store.record_spend(Decimal("10"), "USDC", metadata={"agent_id": "agent-1", "session_id": "s-99"})
    assert store.record_count() == 1


# ---------------------------------------------------------------------------
# SpendLimitConfig validation tests
# ---------------------------------------------------------------------------


def test_config_currency_normalized_to_upper() -> None:
    cfg = SpendLimitConfig(currency="usdc", max_per_transaction=Decimal("100"))
    assert cfg.currency == "USDC"


def test_config_defaults() -> None:
    cfg = SpendLimitConfig(currency="USDC")
    assert cfg.max_per_transaction == Decimal("0")
    assert cfg.max_per_period == Decimal("0")
    assert cfg.period_seconds == 86_400


def test_config_rejects_negative_max_per_transaction() -> None:
    with pytest.raises(Exception):
        SpendLimitConfig(currency="USDC", max_per_transaction=Decimal("-1"))


def test_config_rejects_zero_period_seconds() -> None:
    with pytest.raises(Exception):
        SpendLimitConfig(currency="USDC", period_seconds=0)


def test_config_accepts_decimal_from_string() -> None:
    """Pydantic should coerce string values to Decimal for money fields."""
    cfg = SpendLimitConfig(currency="USDC", max_per_transaction="100.50", max_per_period="999.99")
    assert cfg.max_per_transaction == Decimal("100.50")
    assert cfg.max_per_period == Decimal("999.99")


# ---------------------------------------------------------------------------
# SpendLimitEvaluator tests
# ---------------------------------------------------------------------------


def _make_evaluator(
    max_per_transaction: Decimal | float | str = Decimal("0"),
    max_per_period: Decimal | float | str = Decimal("0"),
    period_seconds: int = 86400,
    currency: str = "USDC",
    store: InMemorySpendStore | None = None,
) -> SpendLimitEvaluator:
    """Build a SpendLimitEvaluator from keyword limits (0 disables a limit)."""
    cfg = SpendLimitConfig(
        max_per_transaction=max_per_transaction,
        max_per_period=max_per_period,
        period_seconds=period_seconds,
        currency=currency,
    )
    return SpendLimitEvaluator(cfg, store=store)


def _tx(
    amount: Any = "10.00",
    currency: str = "USDC",
    recipient: str = "0xABC",
    **extra: Any,
) -> dict[str, Any]:
    """Build a transaction dict; *extra* carries context/override keys."""
    return {"amount": amount, "currency": currency, "recipient": recipient, **extra}


@pytest.mark.asyncio
async def test_none_data_is_allowed() -> None:
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    result = await ev.evaluate(None)
    assert result.matched is False
    assert result.error is None


@pytest.mark.asyncio
async def test_non_dict_data_is_allowed() -> None:
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    result = await ev.evaluate("not a dict")
    assert result.matched is False
    assert result.error is None


@pytest.mark.asyncio
async def test_missing_amount_not_matched() -> None:
    """Missing amount is a non-match, NOT an evaluator error."""
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    result = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "amount" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_missing_currency_not_matched() -> None:
    """Missing currency is a non-match, NOT an evaluator error."""
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    result = await ev.evaluate({"amount": "10.00", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "currency" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_wrong_currency_is_skipped() -> None:
    """Transaction in a different currency should be allowed (not matched)."""
    ev = _make_evaluator(max_per_transaction=Decimal("1"), currency="USDC")
    # Amount 99999 but in ETH — policy only governs USDC
    result = await ev.evaluate(_tx(amount="99999.00", currency="ETH"))
    assert result.matched is False
    assert result.metadata and result.metadata.get("tx_currency") == "ETH"


@pytest.mark.asyncio
async def test_per_transaction_cap_violation() -> None:
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    result = await ev.evaluate(_tx(amount="101.00"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "per_transaction_cap"
    assert result.error is None


@pytest.mark.asyncio
async def test_per_transaction_cap_exact_boundary_allowed() -> None:
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    result = await ev.evaluate(_tx(amount="100.00"))
    assert result.matched is False


@pytest.mark.asyncio
async def test_per_transaction_cap_disabled_at_zero() -> None:
    ev = _make_evaluator(max_per_transaction=Decimal("0"))
    result = await ev.evaluate(_tx(amount="9999999.00"))
    assert result.matched is False


@pytest.mark.asyncio
async def test_period_budget_violation() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("500"), store=store)

    # Pre-load 480 of spend
    store.record_spend(Decimal("480"), "USDC")

    # Next transaction of 25 would push us to 505 — over budget
    result = await ev.evaluate(_tx(amount="25.00"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "period_budget"
    assert result.metadata["current_period_spend"] == pytest.approx(480.0)
    assert result.metadata["projected_period_spend"] == pytest.approx(505.0)


@pytest.mark.asyncio
async def test_period_budget_exact_boundary_allowed() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("500"), store=store)

    store.record_spend(Decimal("490"), "USDC")

    # Exactly 10 remaining — should be allowed and recorded
    result = await ev.evaluate(_tx(amount="10.00"))
    assert result.matched is False
    # The spend should now be recorded
    assert store.get_spend("USDC", time.time() - 1) == Decimal("500")


@pytest.mark.asyncio
async def test_period_budget_disabled_at_zero() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("0"), store=store)

    store.record_spend(Decimal("1000000"), "USDC")
    result = await ev.evaluate(_tx(amount="1000000.00"))
    assert result.matched is False


@pytest.mark.asyncio
async def test_successful_transaction_is_recorded() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_transaction=Decimal("100"), max_per_period=Decimal("1000"), store=store)

    assert store.record_count() == 0
    result = await ev.evaluate(_tx(amount="50.00"))
    assert result.matched is False
    assert store.record_count() == 1
    since = time.time() - 5
    assert store.get_spend("USDC", since) == Decimal("50")


@pytest.mark.asyncio
async def test_context_override_channel_max_per_transaction() -> None:
    """channel_max_per_transaction in data overrides config."""
    # Base config allows up to 1000 per tx, but channel caps at 50
    ev = _make_evaluator(max_per_transaction=Decimal("1000"))
    result = await ev.evaluate(_tx(amount="75.00", channel_max_per_transaction="50.00"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "per_transaction_cap"
    assert result.metadata["max_per_transaction"] == pytest.approx(50.0)


@pytest.mark.asyncio
async def test_context_override_channel_max_per_period() -> None:
    """channel_max_per_period in data overrides config."""
    store = InMemorySpendStore()
    store.record_spend(Decimal("90"), "USDC")

    # Base config has 1000 budget, but channel caps at 100
    ev = _make_evaluator(max_per_period=Decimal("1000"), store=store)
    result = await ev.evaluate(_tx(amount="20.00", channel_max_per_period="100.00"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "period_budget"


@pytest.mark.asyncio
async def test_multiple_sequential_transactions_accumulate() -> None:
    """Verify spend accumulates correctly across multiple calls."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_transaction=Decimal("100"), max_per_period=Decimal("250"), store=store)

    # Three transactions of 80 bring the period total to 80, 160 and 240.
    # Each is under the per-transaction cap of 100 and the running total
    # stays ≤ 250, so all three must be allowed.
    for _ in range(3):
        allowed = await ev.evaluate(_tx(amount="80.00"))
        assert allowed.matched is False

    # A fourth of 80 → 320 which is > 250 → blocked
    result_4 = await ev.evaluate(_tx(amount="80.00"))
    assert result_4.matched is True
    assert result_4.metadata and result_4.metadata["violation"] == "period_budget"


@pytest.mark.asyncio
async def test_currency_case_insensitive_in_data() -> None:
    """Currency in transaction data is normalized to upper-case before comparison."""
    ev = _make_evaluator(max_per_transaction=Decimal("100"), currency="USDC")
    result = await ev.evaluate(_tx(amount="10.00", currency="usdc"))
    assert result.matched is False  # lower-case usdc should match USDC policy


# ---------------------------------------------------------------------------
# Context-scoped budget isolation tests (requested by lan17)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_scoped_budget_channel_isolation() -> None:
    """Spend in channel A should NOT count against channel B's budget.

    Scenario: 90 USDC in channel A, then 20 USDC in channel B with
    channel_max_per_period=100.  Channel B should be allowed because
    its scoped spend is 0, not 90.
    """
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("1000"), store=store)

    # Record 90 USDC in channel A
    r1 = await ev.evaluate(_tx(amount="90.00", channel="channel-A"))
    assert r1.matched is False

    # 20 USDC in channel B with a per-channel budget of 100
    # Should be allowed: channel B has 0 spend, not 90.
    r2 = await ev.evaluate(_tx(amount="20.00", channel="channel-B", channel_max_per_period="100.00"))
    assert r2.matched is False


@pytest.mark.asyncio
async def test_scoped_budget_same_channel_accumulates() -> None:
    """Spend within the same channel accumulates correctly."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("1000"), store=store)

    # 60 USDC in channel A
    r1 = await ev.evaluate(_tx(amount="60.00", channel="channel-A"))
    assert r1.matched is False

    # Another 50 USDC in channel A with channel cap of 100
    # 60 + 50 = 110 > 100 → should be denied
    r2 = await ev.evaluate(_tx(amount="50.00", channel="channel-A", channel_max_per_period="100.00"))
    assert r2.matched is True
    assert r2.metadata and r2.metadata["violation"] == "period_budget"


@pytest.mark.asyncio
async def test_scoped_budget_agent_id_isolation() -> None:
    """Spend by agent-1 should NOT count against agent-2's budget."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("1000"), store=store)

    r1 = await ev.evaluate(_tx(amount="90.00", agent_id="agent-1"))
    assert r1.matched is False

    # agent-2 with tight budget — should be allowed (agent-2 has 0 spend)
    r2 = await ev.evaluate(_tx(amount="20.00", agent_id="agent-2", channel_max_per_period="100.00"))
    assert r2.matched is False


@pytest.mark.asyncio
async def test_global_budget_without_scope() -> None:
    """When no channel/agent/session context, budget is global."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("100"), store=store)

    # No context fields → global spend
    r1 = await ev.evaluate(_tx(amount="90.00"))
    assert r1.matched is False

    # Still no context → global spend of 90 + 20 = 110 > 100
    r2 = await ev.evaluate(_tx(amount="20.00"))
    assert r2.matched is True


@pytest.mark.asyncio
async def test_malformed_input_is_not_evaluator_error() -> None:
    """Malformed input should be matched=False with error=None, not an evaluator error.

    This is the engine-level test lan17 requested to ensure we don't
    accidentally lock in result.error as a policy outcome.
    """
    ev = _make_evaluator(max_per_transaction=Decimal("100"))

    # Missing amount
    r1 = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"})
    assert r1.matched is False
    assert r1.error is None

    # Missing currency
    r2 = await ev.evaluate({"amount": "10.00", "recipient": "0xABC"})
    assert r2.matched is False
    assert r2.error is None

    # Negative amount
    r3 = await ev.evaluate({"amount": "-5.00", "currency": "USDC", "recipient": "0xABC"})
    assert r3.matched is False
    assert r3.error is None

    # Non-dict input
    r4 = await ev.evaluate("not a dict")
    assert r4.matched is False
    assert r4.error is None

    # None input
    r5 = await ev.evaluate(None)
    assert r5.matched is False
    assert r5.error is None


# ---------------------------------------------------------------------------
# Step normalization tests (selector.path: "*" vs "input")
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_step_object_input_extraction() -> None:
    """When selector.path is '*', data is a full Step dict.

    Evaluator should extract transaction from 'input' key.
    """
    ev = _make_evaluator(max_per_transaction=Decimal("100"))
    step_data = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": "50.00", "currency": "USDC", "recipient": "0xABC"},
        "context": None,
    }
    result = await ev.evaluate(step_data)
    assert result.matched is False


@pytest.mark.asyncio
async def test_step_context_merged_into_transaction() -> None:
    """Context fields from step.context should be available for scoped budgets."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("1000"), store=store)

    # First: 90 USDC in channel-A via step context
    step1 = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": "90.00", "currency": "USDC", "recipient": "0xABC"},
        "context": {"channel": "channel-A"},
    }
    r1 = await ev.evaluate(step1)
    assert r1.matched is False

    # Second: 20 USDC in channel-B with tight cap via step context
    step2 = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": "20.00", "currency": "USDC", "recipient": "0xABC"},
        "context": {"channel": "channel-B", "channel_max_per_period": 100.0},
    }
    r2 = await ev.evaluate(step2)
    # Channel B has 0 scoped spend → should be allowed
    assert r2.matched is False


@pytest.mark.asyncio
async def test_step_context_overrides_not_clobbered_by_input() -> None:
    """If input already has channel, step.context should not overwrite it.

    When input contains "channel": "from-input" and step.context has
    "channel": "from-context", the input value wins.  We verify this by
    checking that the store recorded the spend under channel="from-input",
    not "from-context".
    """
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_transaction=Decimal("100"), store=store)
    step_data = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": "10.00", "currency": "USDC", "recipient": "0xABC", "channel": "from-input"},
        "context": {"channel": "from-context"},
    }
    result = await ev.evaluate(step_data)
    assert result.matched is False

    # The spend must have been recorded under channel="from-input", not "from-context".
    # Query with scope={"channel": "from-input"} should return 10; "from-context" should return 0.
    since = time.time() - 5
    assert store.get_spend("USDC", since, scope={"channel": "from-input"}) == Decimal("10")
    assert store.get_spend("USDC", since, scope={"channel": "from-context"}) == Decimal("0")


# ---------------------------------------------------------------------------
# lan17's scoped budget test (Fix #7)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_lan17_channel_scope_independence() -> None:
    """lan17's specific test: 90 USDC in channel A, then 20 USDC in channel B.

    With channel_max_per_period=100, the second transaction should be ALLOWED
    because it is in a different channel scope — channel B has 0 spend.
    """
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=Decimal("1000"), store=store)

    # Step 1: 90 USDC in channel A — should be allowed
    r1 = await ev.evaluate(_tx(amount="90.00", channel="channel-A"))
    assert r1.matched is False, f"Channel A 90 USDC should be allowed, got: {r1.message}"

    # Verify channel A spend is recorded
    since = time.time() - 5
    assert store.get_spend("USDC", since, scope={"channel": "channel-A"}) == Decimal("90")

    # Step 2: 20 USDC in channel B with channel_max_per_period=100
    # Channel B has 0 spend, so 0 + 20 = 20 ≤ 100 → ALLOWED
    r2 = await ev.evaluate(_tx(amount="20.00", channel="channel-B", channel_max_per_period="100.00"))
    assert r2.matched is False, (
        f"Channel B 20 USDC should be allowed (channel B has 0 spend), "
        f"but got matched=True: {r2.message}"
    )

    # Verify channel B spend is also recorded correctly
    assert store.get_spend("USDC", since, scope={"channel": "channel-B"}) == Decimal("20")

    # Verify the scopes are truly independent — channel A's spend is unchanged
    assert store.get_spend("USDC", since, scope={"channel": "channel-A"}) == Decimal("90")
# ---------------------------------------------------------------------------


def test_config_currencies_normalized() -> None:
    """Configured currency codes are stored upper-cased."""
    cfg = TransactionPolicyConfig(allowed_currencies=["usdc", "Usdt"])
    assert cfg.allowed_currencies == ["USDC", "USDT"]


def test_config_defaults_are_permissive() -> None:
    """A bare config has empty recipient/currency lists and zero amount bounds."""
    cfg = TransactionPolicyConfig()
    assert cfg.allowed_recipients == []
    assert cfg.blocked_recipients == []
    assert cfg.min_amount == Decimal("0")
    assert cfg.max_amount == Decimal("0")
    assert cfg.allowed_currencies == []


def test_config_max_amount_lt_min_raises() -> None:
    """A max_amount below min_amount fails validation, naming max_amount."""
    with pytest.raises(ValidationError, match="max_amount"):
        TransactionPolicyConfig(min_amount=Decimal("100"), max_amount=Decimal("10"))


def test_config_max_equals_min_is_valid() -> None:
    """Equal min_amount and max_amount are accepted and stored unchanged."""
    cfg = TransactionPolicyConfig(min_amount=Decimal("50"), max_amount=Decimal("50"))
    assert cfg.min_amount == Decimal("50")
    assert cfg.max_amount == Decimal("50")


# ---------------------------------------------------------------------------
# Helper factory
# ---------------------------------------------------------------------------


def _make_evaluator(**kwargs: Any) -> TransactionPolicyEvaluator:
    """Build an evaluator from TransactionPolicyConfig keyword arguments."""
    cfg = TransactionPolicyConfig(**kwargs)
    return TransactionPolicyEvaluator(cfg)


def _tx(
    amount: float = 100.0,
    currency: str = "USDC",
    recipient: str = "0xABC",
    **extra: Any,
) -> dict[str, Any]:
    """Return a transaction dict; ``extra`` keywords become additional keys."""
    return {"amount": amount, "currency": currency, "recipient": recipient, **extra}


# ---------------------------------------------------------------------------
# Edge cases: None / non-dict inputs
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_none_data_passes() -> None:
    """None input is a non-match, NOT an evaluator error."""
    ev = _make_evaluator(allowed_currencies=["USDC"])
    result = await ev.evaluate(None)
    assert result.matched is False
    assert result.error is None


@pytest.mark.asyncio
async def test_non_dict_data_passes() -> None:
    """Non-dict input is a non-match, NOT an evaluator error."""
    ev = _make_evaluator(allowed_currencies=["USDC"])
    result = await ev.evaluate(["not", "a", "dict"])
    assert result.matched is False
    # Same contract as the None case: malformed input must not surface as an error.
    assert result.error is None


# ---------------------------------------------------------------------------
# Missing required fields
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_missing_currency_not_matched() -> None:
    """Missing currency is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"amount": 10.0, "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "currency" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_missing_recipient_not_matched() -> None:
    """Missing recipient is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"amount": 10.0, "currency": "USDC"})
    assert result.matched is False
    assert result.error is None
    assert "recipient" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_missing_amount_not_matched() -> None:
    """Missing amount is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "amount" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_non_numeric_amount_not_matched() -> None:
    """Non-numeric amount is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"amount": "lots", "currency": "USDC", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None


# ---------------------------------------------------------------------------
# No restrictions configured → everything passes
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_empty_config_allows_everything() -> None:
    """With no restrictions configured, an arbitrary transaction is a non-match."""
    ev = _make_evaluator()
    result = await ev.evaluate(_tx(amount=999_999.0, currency="XYZ", recipient="0xANY"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Currency allowlist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_currency_not_in_allowlist_is_blocked() -> None:
    """A currency outside the allowlist matches with 'currency_not_allowed'."""
    ev = _make_evaluator(allowed_currencies=["USDC", "USDT"])
    result = await ev.evaluate(_tx(currency="DAI"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "currency_not_allowed"


@pytest.mark.asyncio
async def test_currency_in_allowlist_passes() -> None:
    """A currency on the allowlist is a non-match."""
    ev = _make_evaluator(allowed_currencies=["USDC", "USDT"])
    result = await ev.evaluate(_tx(currency="USDT"))
    assert result.matched is False


@pytest.mark.asyncio
async def test_currency_allowlist_case_insensitive_in_data() -> None:
    """Currency from incoming data is uppercased before comparison."""
    ev = _make_evaluator(allowed_currencies=["USDC"])
    result = await ev.evaluate(_tx(currency="usdc"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Recipient blocklist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_blocked_recipient_is_denied() -> None:
    """A recipient on the blocklist matches with 'recipient_blocked'."""
    ev = _make_evaluator(blocked_recipients=["0xDEAD", "0xBAD"])
    result = await ev.evaluate(_tx(recipient="0xDEAD"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "recipient_blocked"


@pytest.mark.asyncio
async def test_non_blocked_recipient_passes() -> None:
    """A recipient absent from the blocklist is a non-match."""
    ev = _make_evaluator(blocked_recipients=["0xDEAD"])
    result = await ev.evaluate(_tx(recipient="0xGOOD"))
    assert result.matched is False


# 
# ---------------------------------------------------------------------------
# Recipient allowlist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_recipient_not_in_allowlist_is_blocked() -> None:
    """A recipient outside the allowlist matches with 'recipient_not_allowed'."""
    ev = _make_evaluator(allowed_recipients=["0xALICE", "0xBOB"])
    result = await ev.evaluate(_tx(recipient="0xEVE"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "recipient_not_allowed"


@pytest.mark.asyncio
async def test_recipient_in_allowlist_passes() -> None:
    """A recipient on the allowlist is a non-match."""
    ev = _make_evaluator(allowed_recipients=["0xALICE", "0xBOB"])
    result = await ev.evaluate(_tx(recipient="0xBOB"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Blocklist takes priority over allowlist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_blocked_beats_allowlist() -> None:
    """A recipient on the blocklist should be denied even if also allowlisted."""
    ev = _make_evaluator(
        allowed_recipients=["0xALICE"],
        blocked_recipients=["0xALICE"],  # deliberately in both
    )
    result = await ev.evaluate(_tx(recipient="0xALICE"))
    assert result.matched is True
    # Violation should be blocklist (checked first)
    assert result.metadata and result.metadata["violation"] == "recipient_blocked"


# ---------------------------------------------------------------------------
# Amount bounds
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_amount_below_minimum_is_blocked() -> None:
    """An amount under min_amount matches with 'amount_below_minimum'."""
    ev = _make_evaluator(min_amount=10.0)
    result = await ev.evaluate(_tx(amount=9.99))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "amount_below_minimum"


@pytest.mark.asyncio
async def test_amount_at_minimum_passes() -> None:
    """An amount exactly equal to min_amount is a non-match (bound is inclusive)."""
    ev = _make_evaluator(min_amount=10.0)
    result = await ev.evaluate(_tx(amount=10.0))
    assert result.matched is False


@pytest.mark.asyncio
async def test_amount_above_maximum_is_blocked() -> None:
    """An amount over max_amount matches with 'amount_exceeds_maximum'."""
    ev = _make_evaluator(max_amount=1000.0)
    result = await ev.evaluate(_tx(amount=1000.01))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "amount_exceeds_maximum"


@pytest.mark.asyncio
async def test_amount_at_maximum_passes() -> None:
    """An amount exactly equal to max_amount is a non-match (bound is inclusive)."""
    ev = _make_evaluator(max_amount=1000.0)
    result = await ev.evaluate(_tx(amount=1000.0))
    assert result.matched is False


@pytest.mark.asyncio
async def test_amount_bounds_disabled_at_zero() -> None:
    """Zero min/max bounds impose no limit: tiny and huge amounts both pass."""
    ev = _make_evaluator(min_amount=0.0, max_amount=0.0)
    result = await ev.evaluate(_tx(amount=0.001))
    assert result.matched is False
    result2 = await ev.evaluate(_tx(amount=1_000_000_000.0))
    assert result2.matched is False


# ---------------------------------------------------------------------------
# Full policy (all fields configured)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_full_policy_passes_compliant_transaction() -> None:
    """A transaction satisfying every configured rule is a non-match."""
    ev = _make_evaluator(
        allowed_currencies=["USDC", "USDT"],
        blocked_recipients=["0xDEAD"],
        allowed_recipients=["0xALICE", "0xBOB"],
        min_amount=1.0,
        max_amount=5000.0,
    )
    result = await ev.evaluate(_tx(amount=250.0, currency="USDC", recipient="0xALICE"))
    assert result.matched is False


@pytest.mark.asyncio
async def test_context_fields_appear_in_metadata() -> None:
    """Optional context fields should surface in result metadata.

    Covers channel, agent_id and session_id.
    """
    ev = _make_evaluator()
    result = await ev.evaluate(_tx(channel="discord", agent_id="agent-42", session_id="sess-1"))
    assert result.metadata
    assert result.metadata.get("channel") == "discord"
    assert result.metadata.get("agent_id") == "agent-42"
    assert result.metadata.get("session_id") == "sess-1"


# ---------------------------------------------------------------------------
# Check ordering: currency first, then blocklist, then allowlist, then bounds
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_currency_check_before_recipient_check() -> None:
    """Currency violation should be reported even if recipient is also blocked."""
    ev = _make_evaluator(
        allowed_currencies=["USDC"],
        blocked_recipients=["0xDEAD"],
    )
    result = await ev.evaluate(_tx(currency="DAI", recipient="0xDEAD"))
    # Currency checked first
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "currency_not_allowed"


@pytest.mark.asyncio
async def test_blocklist_before_allowlist() -> None:
    """Blocklist violation should be reported even if recipient not in allowlist."""
    ev = _make_evaluator(
        allowed_recipients=["0xGOOD"],
        blocked_recipients=["0xBAD"],
    )
    result = await ev.evaluate(_tx(recipient="0xBAD"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "recipient_blocked"


# ---------------------------------------------------------------------------
# Step normalization tests
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_step_object_input_extraction() -> None:
    """When data is a full Step dict, extract transaction from 'input'.

    A compliant step alone cannot prove extraction happened, because a
    transaction with missing fields is also a non-match.  The second step
    carries a disallowed currency, which only matches if the nested
    currency was actually read.
    """
    ev = _make_evaluator(allowed_currencies=["USDC"])
    step_data = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": 100.0, "currency": "USDC", "recipient": "0xABC"},
        "context": {"channel": "slack"},
    }
    result = await ev.evaluate(step_data)
    assert result.matched is False

    # Same step shape, but with a currency outside the allowlist.
    wrong_currency_step = {
        **step_data,
        "input": {"amount": 100.0, "currency": "DAI", "recipient": "0xABC"},
    }
    denied = await ev.evaluate(wrong_currency_step)
    assert denied.matched is True
    assert denied.metadata and denied.metadata["violation"] == "currency_not_allowed"


@pytest.mark.asyncio
async def test_step_blocked_recipient_via_step() -> None:
    """Blocklist check should work when data comes as a Step dict."""
    ev = _make_evaluator(blocked_recipients=["0xDEAD"])
    step_data = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": 10.0, "currency": "USDC", "recipient": "0xDEAD"},
        "context": None,
    }
    result = await ev.evaluate(step_data)
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "recipient_blocked"