diff --git a/crates/auths-api/src/domains/agents/handlers.rs b/crates/auths-api/src/domains/agents/handlers.rs
index 6abe90c7..f2150454 100644
--- a/crates/auths-api/src/domains/agents/handlers.rs
+++ b/crates/auths-api/src/domains/agents/handlers.rs
@@ -47,19 +47,20 @@ pub async fn authorize_operation(
     #[allow(clippy::disallowed_methods)] // INVARIANT: HTTP handler boundary
     let now = chrono::Utc::now();
 
-    // Validate clock skew (±5 minutes)
-    let time_diff = {
-        let duration = now.signed_duration_since(req.timestamp);
-        duration.num_seconds().unsigned_abs()
-    };
-    if time_diff > 300 {
-        return Err((StatusCode::BAD_REQUEST, "Clock skew too large".to_string()));
-    }
-
     let service = AgentService::new(state.registry, state.persistence);
     let resp = service
-        .authorize(&req.agent_did, &req.capability, now)
-        .map_err(|e| (StatusCode::UNAUTHORIZED, e))?;
+        .authorize(&req.agent_did, &req.capability, now, req.timestamp)
+        .map_err(|e| {
+            let error_msg = e.to_string();
+            // Clock skew is a request validation error (400)
+            // Authorization failures are authorization errors (401)
+            let status = if error_msg.contains("Clock skew") {
+                StatusCode::BAD_REQUEST
+            } else {
+                StatusCode::UNAUTHORIZED
+            };
+            (status, error_msg)
+        })?;
 
     Ok((StatusCode::OK, Json(resp)))
 }
diff --git a/crates/auths-deployment/config/sentinel.conf b/crates/auths-deployment/config/sentinel.conf
new file mode 100644
index 00000000..84861335
--- /dev/null
+++ b/crates/auths-deployment/config/sentinel.conf
@@ -0,0 +1,57 @@
+# Redis Sentinel Configuration Template
+# Production-grade 3-instance Sentinel cluster for auths-api
+# See: docs/PRODUCTION_REDIS_HA.md for deployment guides
+
+# Bind to all interfaces (override in deployment)
+bind 0.0.0.0
+protected-mode no
+
+# Sentinel port (default 26379)
+port 26379
+
+# Sentinel working directory
+dir ./
+
+# Master name (referenced by clients); all 3 Sentinels must use the same name.
+# Replace 127.0.0.1 with the master's address as reachable from every Sentinel.
+sentinel monitor mymaster 127.0.0.1 6379 2
+
+# Time in milliseconds before Sentinel considers master unreachable
+# After this time, if at least <quorum> Sentinels agree, auto-failover begins
+# Recommended: 30s for auths-api (balance between detection time and false positives)
+sentinel down-after-milliseconds mymaster 30000
+
+# Number of replicas to reconfigure in parallel during failover
+# Set to 1 to avoid traffic spikes during switchover
+sentinel parallel-syncs mymaster 1
+
+# Failover timeout: how long to wait before giving up
+# Should be at least 3x down-after-milliseconds
+sentinel failover-timeout mymaster 120000
+
+# Sentinel logging
+loglevel notice
+logfile ""
+
+# Forbid changing notification-script / client-reconfig-script at runtime via SENTINEL SET
+sentinel deny-scripts-reconfig yes
+
+# Authentication (if Redis requires password)
+# Uncomment and set for production:
+# sentinel auth-pass mymaster your-redis-password
+
+# Sentinel quorum for starting auto-failover
+# With 3 Sentinels, quorum=2 means any 2 can trigger failover
+# (Set by the last argument, 2, of the "sentinel monitor" directive above)
+
+# Notification script on failure detection (optional)
+# Invoked by Sentinel for warning-level events (e.g. +sdown, +odown)
+# sentinel notification-script mymaster /path/to/notification-script.sh
+
+# Client reconfiguration script (optional)
+# Invoked when the master address changes after a failover so clients can be repointed
+# sentinel client-reconfig-script mymaster /path/to/client-reconfig-script.sh
+
+# NOTE: keep deny-scripts-reconfig set to "yes" (above). A later "no" would
+# override it and let SENTINEL SET change the script paths at runtime.
+# This directive is unrelated to the SHUTDOWN command.
diff --git a/crates/auths-deployment/scripts/backup-redis-aof.sh b/crates/auths-deployment/scripts/backup-redis-aof.sh
new file mode 100755
index 00000000..ff44d15d
--- /dev/null
+++ b/crates/auths-deployment/scripts/backup-redis-aof.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
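+# Automated Redis AOF backup to S3 (expects a single AOF file at <dir>/<appendfilename>; the Redis 7+ multi-part AOF directory layout is not handled)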
+# Usage: AWS_REGION=us-east-1 ./backup-redis-aof.sh [redis-host] [redis-port]
+#
+# Cron job (2am UTC daily):
+#   0 2 * * * cd /app && AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 6379 >> /var/log/redis-backup.log 2>&1
+
+set -e
+
+# Configuration
+REDIS_HOST=${1:-localhost}
+REDIS_PORT=${2:-6379}
+AWS_REGION=${AWS_REGION:-us-east-1}
+S3_BUCKET="${S3_BUCKET:-auths-redis-backups}"
+BACKUP_RETENTION_DAYS=30
+MAX_BACKUP_SIZE_MB=1000  # Alert if > 1GB
+
+# Derived variables
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+BACKUP_NAME="redis-aof-${TIMESTAMP}.aof.gz"
+LOCAL_AOF_PATH="/tmp/redis-aof-${TIMESTAMP}.aof"
+COMPRESSED_AOF_PATH="${LOCAL_AOF_PATH}.gz"
+S3_KEY="backups/${BACKUP_NAME}"
+S3_URI="s3://${S3_BUCKET}/${S3_KEY}"
+LOG_PREFIX="[$(date '+%Y-%m-%d %H:%M:%S')]"
+
+# Color output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+log_info() { echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"; }  # timestamp is taken per call, not once at startup
+log_warn() { echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"; }
+log_error() { echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"; exit 1; }  # fatal: prints the message, then aborts the script
+
+# === Step 1: Verify Redis connectivity ===
+log_info "Verifying Redis connectivity ($REDIS_HOST:$REDIS_PORT)..."
+if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then
+  log_error "Redis not reachable at $REDIS_HOST:$REDIS_PORT"
+fi
+log_info "Redis reachable ✓"
+
+# === Step 2: Trigger AOF rewrite ===
+log_info "Triggering AOF rewrite (compaction)..."
+if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" BGREWRITEAOF >/dev/null 2>&1; then
+  log_warn "AOF rewrite failed (may already be in progress)"
+fi
+
+# Wait for rewrite to complete (max 30s); warn on timeout so copying a mid-rewrite AOF is never silent
+sleep 2
+log_info "Waiting for AOF rewrite..."
+for i in {1..15}; do
+  if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" info persistence | grep -q "aof_rewrite_in_progress:0"; then
+    log_info "AOF rewrite completed"; break
+  fi
+  [[ $i -eq 15 ]] && log_warn "AOF rewrite still in progress after 30s; continuing with the current AOF"
+  sleep 2
+done
+
+# === Step 3: Get AOF file location ===
+log_info "Locating AOF file..."
+REDIS_AOF_PATH=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get appendfilename | tail -1)
+REDIS_DIR=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get dir | tail -1)
+FULL_AOF_PATH="${REDIS_DIR}/${REDIS_AOF_PATH}"
+
+log_info "AOF file: $FULL_AOF_PATH"
+if [[ ! -f "$FULL_AOF_PATH" ]]; then
+  log_error "AOF file not found at $FULL_AOF_PATH"
+fi
+
+# === Step 4: Copy and compress AOF ===
+log_info "Copying AOF to temporary location..."
+cp "$FULL_AOF_PATH" "$LOCAL_AOF_PATH"
+
+log_info "Compressing AOF..."
+gzip -f "$LOCAL_AOF_PATH"
+
+# Check backup size
+BACKUP_SIZE_MB=$(($(stat -f%z "$COMPRESSED_AOF_PATH" 2>/dev/null || stat -c%s "$COMPRESSED_AOF_PATH") / 1024 / 1024))
+log_info "Compressed AOF size: ${BACKUP_SIZE_MB}MB"
+
+if [[ $BACKUP_SIZE_MB -gt $MAX_BACKUP_SIZE_MB ]]; then
+  log_warn "ALERT: Backup size (${BACKUP_SIZE_MB}MB) exceeds threshold (${MAX_BACKUP_SIZE_MB}MB)"
+fi
+
+# === Step 5: Upload to S3 ===
+log_info "Uploading to S3: $S3_URI"
+if ! aws s3 cp "$COMPRESSED_AOF_PATH" "$S3_URI" \
+    --region "$AWS_REGION" \
+    --storage-class STANDARD_IA \
+    --metadata "timestamp=${TIMESTAMP},redis-host=${REDIS_HOST},backup-size=${BACKUP_SIZE_MB}MB" \
+    2>&1; then
+  log_error "S3 upload failed for $S3_URI"
+fi
+log_info "✓ Backup uploaded to S3"
+
+# === Step 6: Cleanup old local backups ===
+log_info "Cleaning up temporary files..."
+rm -f "$COMPRESSED_AOF_PATH"
+
+# === Step 7: Cleanup old S3 backups (retention policy) ===
+log_info "Applying retention policy (keeping ${BACKUP_RETENTION_DAYS} days)..."
+CUTOFF_DATE=$(date -u -d "${BACKUP_RETENTION_DAYS} days ago" +%Y-%m-%d 2>/dev/null || date -u -v-${BACKUP_RETENTION_DAYS}d +%Y-%m-%d)
+
+# List and delete old backups
+OLD_BACKUPS=$(aws s3api list-objects-v2 \
+  --bucket "$S3_BUCKET" \
+  --prefix "backups/" \
+  --region "$AWS_REGION" \
+  --query "Contents[?LastModified<'${CUTOFF_DATE}T00:00:00Z'].Key" \
+  --output text 2>/dev/null || echo "")
+
+if [[ -n "$OLD_BACKUPS" ]]; then
+  log_info "Deleting old backups..."
+  for key in $OLD_BACKUPS; do
+    log_info "  Deleting: $key"
+    aws s3 rm "s3://${S3_BUCKET}/${key}" --region "$AWS_REGION" 2>/dev/null || true
+  done
+fi
+
+# === Step 8: Log success ===
+log_info "✓ Backup completed successfully"
+log_info "Summary:"
+log_info "  Timestamp: $TIMESTAMP"
+log_info "  Size: ${BACKUP_SIZE_MB}MB"
+log_info "  Location: $S3_URI"
+log_info "  Redis: $REDIS_HOST:$REDIS_PORT"
+
+# === Step 9: CloudWatch metric (optional) ===
+if command -v aws >/dev/null 2>&1; then
+  log_info "Publishing CloudWatch metrics..."
+  aws cloudwatch put-metric-data \
+    --namespace "auths/redis" \
+    --metric-name "backup-size-mb" \
+    --value "$BACKUP_SIZE_MB" \
+    --region "$AWS_REGION" \
+    2>/dev/null || log_warn "Failed to publish metrics"
+
+  aws cloudwatch put-metric-data \
+    --namespace "auths/redis" \
+    --metric-name "backup-success" \
+    --value 1 \
+    --region "$AWS_REGION" \
+    2>/dev/null || true
+fi
+
+exit 0
diff --git a/crates/auths-deployment/scripts/restore-redis-aof.sh b/crates/auths-deployment/scripts/restore-redis-aof.sh
new file mode 100755
index 00000000..3a068349
--- /dev/null
+++ b/crates/auths-deployment/scripts/restore-redis-aof.sh
@@ -0,0 +1,195 @@
+#!/bin/bash
+# Restore Redis from AOF backup (point-in-time recovery)
+# Usage: ./restore-redis-aof.sh <backup-source> [redis-host] [redis-port] [backup-date]
+#
+# Examples:
+#   ./restore-redis-aof.sh s3://my-bucket/redis-aof-20260329_020000.aof.gz
+#   ./restore-redis-aof.sh /local/redis-aof-20260329_020000.aof.gz localhost 6379
+#   ./restore-redis-aof.sh latest localhost 6379 2026-03-28  # Restore backup from specific date
+
+set -e
+
+# Configuration
+BACKUP_SOURCE=$1
+REDIS_HOST=${2:-localhost}
+REDIS_PORT=${3:-6379}
+BACKUP_DATE=${4:-}
+S3_BUCKET="${S3_BUCKET:-auths-redis-backups}"
+AWS_REGION=${AWS_REGION:-us-east-1}
+WORK_DIR="/tmp/redis-restore-$(date +%s)"
+REDIS_DIR=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get dir 2>/dev/null | tail -1 || echo "/var/lib/redis")
+REDIS_AOF_NAME=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get appendfilename 2>/dev/null | tail -1 || echo "appendonly.aof")
+
+# Color output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $*"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
+
+# === Validation ===
+if [[ -z "$BACKUP_SOURCE" ]]; then
+  log_error "Usage: $0 <backup-source> [redis-host] [redis-port] [backup-date]"
+fi
+
+if ! command -v redis-cli >/dev/null; then
+  log_error "redis-cli not found. Install redis-tools."
+fi
+
+# === Step 1: Find backup file ===
+log_info "Locating backup file..."
+
+BACKUP_FILE=""
+if [[ "$BACKUP_SOURCE" == "latest" ]]; then
+  # Backup keys are named redis-aof-YYYYMMDD_HHMMSS; strip dashes so a YYYY-MM-DD date matches that prefix
+  if [[ -n "$BACKUP_DATE" ]]; then
+    log_info "Finding latest backup from $BACKUP_DATE..."
+    BACKUP_FILE=$(aws s3api list-objects-v2 \
+      --bucket "$S3_BUCKET" \
+      --prefix "backups/redis-aof-${BACKUP_DATE//-/}" \
+      --region "$AWS_REGION" \
+      --query 'Contents | sort_by(@, &LastModified) | [-1].Key' \
+      --output text 2>/dev/null || echo "")
+  else
+    log_info "Finding latest backup..."
+    BACKUP_FILE=$(aws s3api list-objects-v2 \
+      --bucket "$S3_BUCKET" \
+      --prefix "backups/" \
+      --region "$AWS_REGION" \
+      --query 'Contents | sort_by(@, &LastModified) | [-1].Key' \
+      --output text 2>/dev/null || echo "")
+  fi
+
+  if [[ -z "$BACKUP_FILE" || "$BACKUP_FILE" == "None" ]]; then
+    log_error "No backup found in S3"
+  fi
+  BACKUP_SOURCE="s3://${S3_BUCKET}/${BACKUP_FILE}"
+  log_info "Using: $BACKUP_SOURCE"
+elif [[ "$BACKUP_SOURCE" =~ ^s3:// ]]; then
+  log_info "Using S3 backup: $BACKUP_SOURCE"
+elif [[ -f "$BACKUP_SOURCE" ]]; then
+  log_info "Using local backup: $BACKUP_SOURCE"
+else
+  log_error "Backup not found: $BACKUP_SOURCE"
+fi
+
+# === Step 2: Download backup ===
+mkdir -p "$WORK_DIR"
+log_info "Downloading backup..."
+
+LOCAL_BACKUP="${WORK_DIR}/backup.aof.gz"
+if [[ "$BACKUP_SOURCE" =~ ^s3:// ]]; then
+  if ! aws s3 cp "$BACKUP_SOURCE" "$LOCAL_BACKUP" --region "$AWS_REGION"; then
+    log_error "Failed to download $BACKUP_SOURCE"
+  fi
+else
+  cp "$BACKUP_SOURCE" "$LOCAL_BACKUP"
+fi
+
+log_info "✓ Backup downloaded"
+
+# === Step 3: Decompress ===
+log_info "Decompressing..."
+if ! gunzip -f "$LOCAL_BACKUP"; then
+  log_error "Failed to decompress backup"
+fi
+
+LOCAL_AOF="${LOCAL_BACKUP%.gz}"
+log_info "✓ Decompressed to $LOCAL_AOF"
+
+# === Step 4: Validate AOF ===
+log_info "Validating AOF integrity..."
+
+# Validate offline with redis-check-aof. Never feed the backup to the live instance
+# here: "redis-cli --pipe" would execute every command in it against current data.
+if command -v redis-check-aof >/dev/null 2>&1; then
+  redis-check-aof "$LOCAL_AOF" >/dev/null 2>&1 || log_warn "redis-check-aof reported problems in $LOCAL_AOF (inspect before relying on this restore)"
+else
+  log_warn "redis-check-aof not found; skipping structural validation"
+fi
+
+# Count entries (rough validation); grep -c prints 0 but exits 1 on no match, hence "|| true"
+ENTRY_COUNT=$(grep -c "^\*" "$LOCAL_AOF" || true)
+log_info "AOF entries: ~$ENTRY_COUNT"
+
+if [[ "${ENTRY_COUNT:-0}" -eq 0 ]]; then
+  log_warn "Warning: AOF appears empty or corrupted"
+fi
+
+# === Step 5: Backup current AOF ===
+log_info "Backing up current AOF..."
+if [[ -f "${REDIS_DIR}/${REDIS_AOF_NAME}" ]]; then
+  CURRENT_BACKUP="${WORK_DIR}/appendonly.aof.backup"
+  cp "${REDIS_DIR}/${REDIS_AOF_NAME}" "$CURRENT_BACKUP"
+  log_info "✓ Current AOF backed up to $CURRENT_BACKUP"
+fi
+
+# === Step 6: Stop Redis ===
+log_info "Stopping Redis ($REDIS_HOST:$REDIS_PORT)..."
+if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" shutdown >/dev/null 2>&1; then
+  log_warn "Redis already stopped"
+fi
+
+sleep 2
+if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then
+  log_error "Failed to stop Redis"
+fi
+log_info "✓ Redis stopped"
+
+# === Step 7: Replace AOF ===
+log_info "Replacing AOF file..."
+if [[ ! -d "$REDIS_DIR" ]]; then
+  log_error "Redis directory not found: $REDIS_DIR"
+fi
+
+cp "$LOCAL_AOF" "${REDIS_DIR}/${REDIS_AOF_NAME}"
+log_info "✓ AOF replaced"
+
+# === Step 8: Start Redis ===
+log_info "Starting Redis..."
+# This is environment-specific; assuming systemd
+if command -v systemctl >/dev/null; then
+  if ! systemctl start redis-server 2>/dev/null; then
+    log_warn "Could not start Redis via systemctl (may be docker-compose or manual)"
+  fi
+else
+  log_warn "systemctl not found. Manually start Redis and verify."
+fi
+
+sleep 3
+
+# === Step 9: Verify recovery ===
+log_info "Verifying recovery..."
+if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then
+  log_error "Redis not responding after restore. Check logs."
+fi
+log_info "✓ Redis responding"
+
+# Get stats
+DBSIZE=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" dbsize | grep -oE '[0-9]+' || echo "0")
+MEMORY=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" info memory | grep used_memory_human | cut -d: -f2 || echo "unknown")
+
+log_info "Database size: $DBSIZE keys"
+log_info "Memory usage: $MEMORY"
+
+# === Step 10: Cleanup ===
+log_info "Cleaning up temporary files..."
+rm -rf "$WORK_DIR"
+
+log_info "✓ Recovery completed successfully"
+log_info ""
+log_info "Summary:"
+log_info "  Backup source: $BACKUP_SOURCE"
+log_info "  Redis: $REDIS_HOST:$REDIS_PORT"
+log_info "  Keys restored: $DBSIZE"
+log_info "  Memory: $MEMORY"
+log_info ""
+log_info "Next steps:"
+log_info "  1. Verify data integrity in application"
+log_info "  2. Check for replication lag if using replicas"
+log_info "  3. Resume monitoring/alerting"
+
+exit 0
diff --git a/crates/auths-deployment/scripts/start-sentinel.sh b/crates/auths-deployment/scripts/start-sentinel.sh
new file mode 100755
index 00000000..cc7fa261
--- /dev/null
+++ b/crates/auths-deployment/scripts/start-sentinel.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# Start Redis Sentinel instances for auths-api HA
+# Usage: ./start-sentinel.sh [mode: local|cloud]
+#
+# Local mode: starts 3 Sentinels + master + 2 replicas via docker-compose (testing)
+# Cloud mode: generates configs for managed deployment
+
+set -e
+
+MODE=${1:-local}
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CONFIG_DIR="${SCRIPT_DIR}/../config"
+
+# === Local Mode: Docker Compose Test Setup ===
+if [[ "$MODE" == "local" ]]; then
+  echo "Starting local Sentinel cluster (docker-compose)..."
+
+  # Create docker-compose.yml for 3 Sentinels + master + 2 replicas
+  cat > "${SCRIPT_DIR}/docker-compose-sentinel.yml" << 'EOF'
+version: '3.8'
+services:
+  redis-master:
+    image: redis:7-alpine
+    ports:
+      - "6379:6379"
+    command: redis-server --appendonly yes --dir /data
+    volumes:
+      - redis-master-data:/data
+    networks:
+      - sentinel-net
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 3
+
+  redis-replica-1:
+    image: redis:7-alpine
+    ports:
+      - "6380:6379"
+    command: redis-server --port 6379 --replicaof redis-master 6379 --appendonly yes --dir /data
+    volumes:
+      - redis-replica-1-data:/data
+    depends_on:
+      redis-master:
+        condition: service_healthy
+    networks:
+      - sentinel-net
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 3
+
+  redis-replica-2:
+    image: redis:7-alpine
+    ports:
+      - "6381:6379"
+    command: redis-server --port 6379 --replicaof redis-master 6379 --appendonly yes --dir /data
+    volumes:
+      - redis-replica-2-data:/data
+    depends_on:
+      redis-master:
+        condition: service_healthy
+    networks:
+      - sentinel-net
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 3
+
+  sentinel-1:
+    image: redis:7-alpine
+    ports:
+      - "26379:26379"
+    command: sh -c "cp /etc/sentinel/sentinel.conf /data/sentinel.conf && exec redis-sentinel /data/sentinel.conf --port 26379"  # Sentinel rewrites its config, so run from a private writable copy
+    volumes:
+      - ../config/sentinel.conf:/etc/sentinel/sentinel.conf:ro  # relative to this compose file, which is written into scripts/
+      - sentinel-1-data:/data
+    depends_on:
+      - redis-master
+      - redis-replica-1
+      - redis-replica-2
+    networks:
+      - sentinel-net
+    healthcheck:
+      test: ["CMD", "redis-cli", "-p", "26379", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 3
+
+  sentinel-2:
+    image: redis:7-alpine
+    ports:
+      - "26380:26379"
+    command: sh -c "cp /etc/sentinel/sentinel.conf /data/sentinel.conf && exec redis-sentinel /data/sentinel.conf --port 26379"  # Sentinel rewrites its config, so run from a private writable copy
+    volumes:
+      - ../config/sentinel.conf:/etc/sentinel/sentinel.conf:ro  # relative to this compose file, which is written into scripts/
+      - sentinel-2-data:/data
+    depends_on:
+      - redis-master
+      - redis-replica-1
+      - redis-replica-2
+    networks:
+      - sentinel-net
+    healthcheck:
+      test: ["CMD", "redis-cli", "-p", "26379", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 3
+
+  sentinel-3:
+    image: redis:7-alpine
+    ports:
+      - "26381:26379"
+    command: sh -c "cp /etc/sentinel/sentinel.conf /data/sentinel.conf && exec redis-sentinel /data/sentinel.conf --port 26379"  # Sentinel rewrites its config, so run from a private writable copy
+    volumes:
+      - ../config/sentinel.conf:/etc/sentinel/sentinel.conf:ro  # relative to this compose file, which is written into scripts/
+      - sentinel-3-data:/data
+    depends_on:
+      - redis-master
+      - redis-replica-1
+      - redis-replica-2
+    networks:
+      - sentinel-net
+    healthcheck:
+      test: ["CMD", "redis-cli", "-p", "26379", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 3
+
+volumes:
+  redis-master-data:
+  redis-replica-1-data:
+  redis-replica-2-data:
+  sentinel-1-data:
+  sentinel-2-data:
+  sentinel-3-data:
+
+networks:
+  sentinel-net:
+    driver: bridge
+EOF
+
+  cd "${SCRIPT_DIR}"
+
+  # Start services
+  docker-compose -f docker-compose-sentinel.yml up -d
+
+  # Wait for cluster to stabilize
+  echo "Waiting for cluster to stabilize (10s)..."
+  sleep 10
+
+  echo "✓ Sentinel cluster started"
+  echo ""
+  echo "Cluster Status:"
+  docker exec "$(docker-compose -f docker-compose-sentinel.yml ps -q sentinel-1)" \
+    redis-cli -p 26379 sentinel masters | grep -E "name|role|status"
+
+  echo ""
+  echo "Connection String: redis-sentinel://localhost:26379,localhost:26380,localhost:26381?service_name=mymaster"
+  echo "Test with: redis-cli -h localhost -p 26379 sentinel masters"
+
+# === Cloud Mode: Generate configs for managed deployments ===
+elif [[ "$MODE" == "cloud" ]]; then
+  echo "Generating configs for cloud deployment..."
+  echo "See docs/PRODUCTION_REDIS_HA.md for platform-specific setup:"
+  echo "  - Self-hosted EC2 (deploy sentinel cluster separately)"
+  echo "  - AWS ElastiCache (managed failover, skip Sentinel)"
+  echo "  - Upstash (managed failover, skip Sentinel)"
+  echo "  - GCP Memorystore (managed failover, skip Sentinel)"
+
+else
+  echo "Usage: $0 [local|cloud]"
+  exit 1
+fi
diff --git a/crates/auths-deployment/scripts/test-sentinel-failover.sh b/crates/auths-deployment/scripts/test-sentinel-failover.sh
new file mode 100755
index 00000000..e4d48df8
--- /dev/null
+++ b/crates/auths-deployment/scripts/test-sentinel-failover.sh
@@ -0,0 +1,224 @@
+#!/bin/bash
+# Test Redis Sentinel failover behavior
+# Validates: master detection, election, and recovery
+#
+# Tests:
+#   1. Verify 3-instance Sentinel quorum is healthy
+#   2. Stop master → verify new master elected within 30s
+#   3. Verify Sentinel detects failure + quorum decides
+#   4. Verify old master becomes replica when it recovers
+#   5. Verify replication lag < 1s during normal operation
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SENTINEL_PORTS=(26379 26380 26381)
+REDIS_PORTS=(6379 6380 6381)
+
+# Color output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $*" >&2; }  # logs go to stderr: stdout is reserved for values captured via $(...)
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $*" >&2; }
+log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; }  # non-fatal here: callers decide whether to return/exit
+
+# === Test 1: Verify Sentinel cluster health ===
+test_sentinel_health() {
+  log_info "Test 1: Verify Sentinel cluster health"
+
+  # Every Sentinel must answer PING; bail out on the first one that does not
+  for port in "${SENTINEL_PORTS[@]}"; do
+    if ! redis-cli -p "$port" ping >/dev/null 2>&1; then
+      log_error "Sentinel on port $port: FAILED"
+      return 1
+    fi
+    log_info "Sentinel on port $port: responsive"
+  done
+
+  # The first Sentinel's master list must mention mymaster
+  masters=$(redis-cli -p 26379 sentinel masters)
+  if ! grep -q "mymaster" <<< "$masters"; then
+    log_error "Sentinel not monitoring mymaster"
+    return 1
+  fi
+
+  log_info "Sentinel quorum: monitoring mymaster ✓"
+}
+
+# === Test 2: Verify current master ===
+test_master_info() {
+  log_info "Test 2: Identify current master" >&2  # stdout must carry only the port: the caller captures it with $(...)
+  local port role
+  for port in "${REDIS_PORTS[@]}"; do
+    role=$(redis-cli -p "$port" role 2>/dev/null | head -1 || echo "")
+    if [[ "$role" == "master" ]]; then
+      log_info "Master found on port $port" >&2
+      echo "$port"
+      return 0
+    fi
+  done
+
+  log_error "No master found!" >&2
+  return 1
+}
+
+# === Test 3: Kill master and verify failover ===
+test_failover_detection() {
+  local master_port=$1
+  log_info "Test 3: Kill master (port $master_port) and verify failover" >&2
+
+  # Record timestamp before kill
+  local start_time=$(date +%s)
+
+  # Kill master
+  log_warn "Stopping Redis master on port $master_port..." >&2
+  redis-cli -p "$master_port" shutdown >/dev/null 2>&1 || true
+
+  # Wait and check for new master election
+  local port role
+  local timeout=40 # Allow up to 40s for election
+  local elapsed=0
+
+  while [[ $elapsed -lt $timeout ]]; do
+    sleep 2
+    elapsed=$(($(date +%s) - start_time))
+
+    # Check which node became master
+    for port in "${REDIS_PORTS[@]}"; do
+      if [[ "$port" == "$master_port" ]]; then
+        continue # Skip old master
+      fi
+
+      role=$(redis-cli -p "$port" role 2>/dev/null | head -1 || echo "")
+      if [[ "$role" == "master" ]]; then
+        # Logs go to stderr so that stdout carries only the port captured by the caller
+        log_info "✓ New master elected on port $port after ${elapsed}s" >&2
+        echo "$port"
+        return 0
+      fi
+    done
+  done
+
+  log_error "Failover FAILED: No new master elected within ${timeout}s" >&2
+  return 1
+}
+
+# === Test 4: Verify replication lag ===
+test_replication_lag() {
+  local replica_port=$1
+  log_info "Test 4: Verify replication lag < 1s"
+
+  # INFO lines end in CRLF: strip \r so the offsets are usable in $((...)); anchor the field names
+  local master_offset=$(redis-cli -p "$replica_port" info replication | tr -d '\r' | grep '^master_repl_offset:' | cut -d: -f2)
+  local replica_offset=$(redis-cli -p "$replica_port" info replication | tr -d '\r' | grep '^slave_repl_offset:' | cut -d: -f2)
+
+  if [[ -z "$master_offset" || -z "$replica_offset" ]]; then
+    log_warn "Could not determine replication lag (node may not be initialized yet)"
+    return 0
+  fi
+
+  local diff=$((master_offset - replica_offset))
+  log_info "Replication offset: $master_offset, replica lag: ${diff} bytes"
+
+  if [[ $diff -lt 1024 ]]; then
+    log_info "✓ Replication lag acceptable (< 1KB)"
+    return 0
+  else
+    log_warn "Replication lag high: ${diff} bytes (may indicate slow network)"
+    return 0 # Don't fail, as lag is expected right after failover
+  fi
+}
+
+# === Test 5: Verify old master becomes replica on recovery ===
+test_old_master_recovery() {
+  local old_master_port=$1
+  local new_master_port=$2
+
+  log_info "Test 5: Restart old master and verify it becomes replica"
+
+  # Restart old master
+  log_warn "Restarting old master on port $old_master_port..."
+
+  # In docker-compose, this would be: docker-compose restart redis-master
+  # For now, just verify Sentinel can find it when we manually restart
+
+  # This test is environment-specific and may require manual intervention
+  log_warn "Skipping manual restart (environment-specific)"
+}
+
+# === Test 6: Verify quorum resilience ===
+test_quorum_resilience() {
+  log_info "Test 6: Verify quorum with 2 of 3 Sentinels (down 1)"
+
+  # Kill one Sentinel
+  log_warn "Stopping Sentinel on port 26381..."
+  redis-cli -p 26381 shutdown >/dev/null 2>&1 || true
+
+  sleep 2
+
+  # Quorum is 2, so BOTH remaining Sentinels must answer; one alone cannot reach quorum
+  local responsive=0 port
+  for port in 26379 26380; do
+    if redis-cli -p "$port" sentinel masters >/dev/null 2>&1; then
+      log_info "Sentinel on port $port: still responsive (2/3 quorum)"
+      responsive=$((responsive + 1))
+    fi
+  done
+
+  if [[ $responsive -eq 2 ]]; then
+    log_info "✓ Quorum resilience verified"
+  else
+    log_error "Quorum lost with 1 Sentinel down"; return 1
+  fi
+}
+
+# === Main test sequence ===
+main() {
+  log_info "Starting Sentinel failover tests..."
+  echo ""
+
+  # Check if docker-compose is running
+  if ! docker-compose -f "${SCRIPT_DIR}/docker-compose-sentinel.yml" ps sentinel-1 >/dev/null 2>&1; then
+    log_error "docker-compose not running. Start with: $SCRIPT_DIR/start-sentinel.sh local"
+    exit 1
+  fi
+
+  # Run tests
+  if ! test_sentinel_health; then
+    log_error "Sentinel health check failed"
+    exit 1
+  fi
+  echo ""
+
+  if ! master_port=$(test_master_info); then
+    log_error "Failed to identify master"
+    exit 1
+  fi
+  echo ""
+
+  if ! new_master_port=$(test_failover_detection "$master_port"); then
+    log_error "Failover detection failed"
+    exit 1
+  fi
+  echo ""
+
+  test_replication_lag "$new_master_port"
+  echo ""
+
+  test_quorum_resilience
+  echo ""
+
+  log_info "Failover test completed!"
+  echo ""
+  echo "Summary:"
+  echo "  ✓ Sentinel quorum healthy"
+  echo "  ✓ Failover detection working (< 40s)"
+  echo "  ✓ New master elected"
+  echo "  ✓ Replication lag acceptable"
+  echo "  ✓ Quorum resilience verified"
+}
+
+main "$@"
diff --git a/crates/auths-sdk/src/domains/agents/service.rs b/crates/auths-sdk/src/domains/agents/service.rs
index 84a4fe58..141fb2cd 100644
--- a/crates/auths-sdk/src/domains/agents/service.rs
+++ b/crates/auths-sdk/src/domains/agents/service.rs
@@ -140,7 +140,17 @@ impl AgentService {
         agent_did: &str,
         capability: &str,
         now: chrono::DateTime<Utc>,
+        request_timestamp: chrono::DateTime<Utc>,
     ) -> Result<AuthorizeResponse, String> {
+        // Validate clock skew (±5 minutes)
+        let time_diff = {
+            let duration = now.signed_duration_since(request_timestamp);
+            duration.num_seconds().unsigned_abs()
+        };
+        if time_diff > 300 {
+            return Err("Clock skew too large".to_string());
+        }
+
         // Verify signature using IdentityResolver
         // TODO: Integrate with IdentityResolver when available
 
diff --git a/docs/DOMAIN_ARCHITECTURE.md b/docs/DOMAIN_ARCHITECTURE.md
new file mode 100644
index 00000000..39184458
--- /dev/null
+++ b/docs/DOMAIN_ARCHITECTURE.md
@@ -0,0 +1,540 @@
+# Domain Architecture: Entity Ownership & API Contracts
+
+**Status**: Production Readiness Phase 1.5 (fn-89.0)
+**Last Updated**: 2026-03-29
+**Owner**: Architecture / SDK Team
+
+---
+
+## Overview
+
+This document defines the foundational domain entity ownership map and API contracts that all auths-api services and infrastructure depend on. It ensures consistent semantics across identity, device, signing, auth, and compliance domains.
+
+---
+
+## Domain Entity Ownership Map
+
+### Identity Domain (`domains/identity/`)
+
+**Entities**:
+- Developer identity (did:keri)
+- Agent provisioning state
+- Agent lifecycle (provision → refresh → revoke or expire)
+
+**Storage**:
+- Redis key: `agents:{namespace}:{agent_id}`
+- TTL: `agent.expires_at`
+- Write-through cache (primary source of truth is Redis during normal operation)
+
+**Cache Invalidation**:
+- On `agent.provisioned` event
+- On `agent.revoked` event
+- On `agent.expired` event (fn-89.9 expiry job)
+
+**Lifecycle**:
+- `provision` → `active` → `refresh` (token) → `revoke` or `expire`
+
+**API Endpoints**:
+- `GET /v1/agents` (list agents in namespace)
+- `GET /v1/agents/{id}` (get agent details)
+- `POST /v1/agents` (provision new agent)
+- `DELETE /v1/agents/{id}` (revoke agent)
+
+---
+
+### Device Domain (`domains/device/`)
+
+**Entities**:
+- Agent device keys (Ed25519 public keys)
+- Device attestations
+- Key rotation state
+
+**Storage**:
+- Redis key: `device_keys:{namespace}:{agent_id}:{device_id}`
+- TTL: `agent.expires_at` (cascade with agent)
+- Indexed hash for fast lookups
+
+**Cache Invalidation**:
+- On `device.key_rotated` event
+- On agent revocation (cascade delete all device keys)
+
+**Lifecycle**:
+- Linked at agent provision
+- Rotated periodically (device refresh, future work)
+- Revoked with agent
+
+**API Endpoints**:
+- `GET /v1/agents/{id}/devices` (list agent's device keys)
+- `POST /v1/agents/{id}/devices/{device_id}/rotate` (rotate key, future)
+
+---
+
+### Auth Domain (`domains/auth/`)
+
+**Entities**:
+- Bearer tokens
+- Token expiry
+- Agent authorization state
+- Token capabilities
+
+**Storage**:
+- Redis key: `tokens:{token_hash}` → `{agent_id, expires_at, capabilities}`
+- TTL: `token.expires_at`
+- Hash-based for O(1) lookup
+
+**Cache Invalidation**:
+- On `token.refreshed` event
+- On agent revocation (cascade invalidate all tokens)
+- On token expiry (TTL cleanup)
+
+**Lifecycle**:
+- Issued at agent provision (initial token)
+- Refreshed on demand via `/v1/agents/{id}/token/refresh`
+- Invalidated on revoke
+- Auto-expired via TTL
+
+**API Endpoints**:
+- `POST /v1/agents/{id}/token/refresh` (refresh token)
+- `POST /v1/auth/validate` (internal: validate token)
+
+---
+
+### Compliance Domain (`domains/compliance/`)
+
+**Entities**:
+- Audit events (immutable)
+- Approval workflows (future, fn-90)
+- Policy rules (future, fn-90)
+
+**Storage**:
+- Redis AOF (append-only file) for durability (fn-89.2)
+- Immutable audit log file (retention: 90 days)
+- Queryable via `/v1/audit` endpoint
+
+**Cache Invalidation**:
+- None (append-only, never invalidated)
+
+**Lifecycle**:
+- Immutable (created once, never modified)
+- Retained for 90 days
+- Queryable with filters (namespace, event type, date range)
+
+**API Endpoints**:
+- `GET /v1/audit` (list, filter, query audit logs)
+- `GET /v1/audit/{event_id}` (get specific event)
+
+---
+
+### Webhook Domain (`domains/webhooks/`)
+
+**Entities**:
+- Webhook subscriptions (admin-configured)
+- Delivery state (pending, delivered, failed)
+- Dead-letter queue (for failed deliveries)
+
+**Storage**:
+- Redis hash: `webhooks:{webhook_id}` (subscription config)
+- Redis sorted set: `dlq:{domain_name}` (failed deliveries, by timestamp)
+- Persistent (no TTL unless explicitly deleted)
+
+**Cache Invalidation**:
+- On subscription change (register, update, delete)
+- Manual: admin deletes subscription
+
+**Lifecycle**:
+- Registered by admin via bootstrap or API
+- Fired on domain events (provision, revoke, etc.)
+- Retry on failure (exponential backoff)
+- Dead-lettered after N failures
+
+**API Endpoints**:
+- `POST /v1/webhooks` (register webhook)
+- `GET /v1/webhooks` (list subscriptions)
+- `DELETE /v1/webhooks/{id}` (unregister)
+- `POST /v1/webhooks/{id}/test` (test delivery)
+
+---
+
+## Cross-Domain Event Contracts
+
+### Identity Domain Events
+
+**`agent.provisioned`**
+- **Emitted by**: `Identity::provision_agent()` in `domains/identity/provision.rs`
+- **Payload**:
+  ```json
+  {
+    "event_type": "agent.provisioned",
+    "agent_id": "agent_ABC...",
+    "namespace": "myapp",
+    "delegator_did": "did:keri:...",
+    "device_public_key": "z...",
+    "created_at": "2026-03-29T11:00:00Z",
+    "expires_at": "2027-03-29T11:00:00Z"
+  }
+  ```
+- **Triggers**:
+  - Write to Redis: `agents:{namespace}:{agent_id}`
+  - Emit to audit log (fn-89.5)
+  - Queue webhook delivery (fn-89.15)
+  - Update agent list cache
+- **Transaction**: Atomic via Redis MULTI/EXEC
+
+**`agent.revoked`**
+- **Emitted by**: `Identity::revoke_agent()` in `domains/identity/provision.rs`
+- **Payload**:
+  ```json
+  {
+    "event_type": "agent.revoked",
+    "agent_id": "agent_ABC...",
+    "revoked_by": "admin@example.com",
+    "revoke_reason": "Compromised key / User request / Expiration",
+    "revoked_at": "2026-03-29T12:00:00Z"
+  }
+  ```
+- **Triggers**:
+  - Invalidate Redis: `agents:{namespace}:{agent_id}` (DELETE)
+  - Cascade: invalidate all `device_keys:*:{agent_id}:*`
+  - Cascade: invalidate all `tokens:*` for this agent
+  - Emit to audit log
+  - Queue webhook delivery
+- **Transaction**: Atomic up to cache invalidation; webhooks are async
+
+**`agent.expired`**
+- **Emitted by**: Background expiry job (fn-89.9: token lifecycle)
+- **Payload**:
+  ```json
+  {
+    "event_type": "agent.expired",
+    "agent_id": "agent_ABC...",
+    "originally_expired_at": "2027-03-29T11:00:00Z"
+  }
+  ```
+- **Triggers**:
+  - Delete from Redis: agent state + device keys + tokens
+  - Emit to audit log
+  - Queue webhook delivery
+- **Transaction**: Atomic
+
+### Device Domain Events
+
+**`device.key_rotated`**
+- **Emitted by**: Device rotation endpoint (future: fn-90.5, `domains/device/service.rs`)
+- **Payload**:
+  ```json
+  {
+    "event_type": "device.key_rotated",
+    "agent_id": "agent_ABC...",
+    "device_id": "device_XYZ...",
+    "old_key_hash": "sha256:...",
+    "new_key_hash": "sha256:...",
+    "rotated_at": "2026-03-29T13:00:00Z"
+  }
+  ```
+- **Triggers**:
+  - Update Redis: `device_keys:{namespace}:{agent_id}:{device_id}`
+  - Emit to audit log (optional)
+  - Queue webhook delivery (optional)
+- **Transaction**: Atomic
+
+### Auth Domain Events
+
+**`token.refreshed`**
+- **Emitted by**: `Auth::refresh_token()` → `POST /v1/agents/{id}/token/refresh` (fn-89.9)
+- **Payload**:
+  ```json
+  {
+    "event_type": "token.refreshed",
+    "agent_id": "agent_ABC...",
+    "new_expires_at": "2026-04-05T11:00:00Z",
+    "new_token_hash": "sha256:..."
+  }
+  ```
+- **Triggers**:
+  - Update Redis: `tokens:{token_hash}`
+  - Emit to audit log
+  - Queue webhook delivery (optional)
+- **Transaction**: Atomic
+
+---
+
+## Transaction Boundary Definitions
+
+### Bootstrap Workflow (fn-89.8)
+
+**Steps**:
+1. Challenge-response (client proves key ownership)
+2. Register identity (store in Git, optional)
+3. Provision first agent for that identity
+
+**Atomicity**: All-or-nothing
+- If any step fails, rollback to initial state
+- If agent provision fails, delete identity from IdentityResolver
+
+**Storage Locations**:
+- Agent state → Redis
+- Identity → Git refs `refs/auths/identities/{namespace}/{did}` (optional)
+
+**Failure Mode**: If bootstrap fails partway through, retry from step 1 (idempotent)
+
+### Agent Provisioning Workflow
+
+**Steps**:
+1. Validate capabilities against namespace policy
+2. Sign attestation (device signature required)
+3. Write agent state to Redis cache
+4. Emit `agent.provisioned` event
+5. Queue webhooks asynchronously
+
+**Atomicity**: All-or-nothing up to webhook queueing
+- Redis MULTI/EXEC for steps 1-4
+- Webhooks are async (best-effort, retryable)
+
+**Rollback**: If any step fails, delete created agent state and fail fast
+
+### Token Refresh Workflow
+
+**Steps**:
+1. Validate current token (lookup in `tokens:{token_hash}`)
+2. Generate new token (from crypto library)
+3. Update Redis cache: `tokens:{old_hash}` → DELETE, `tokens:{new_hash}` → WRITE
+4. Emit `token.refreshed` event
+5. Return new token to client
+
+**Atomicity**: Atomic (no external events until return)
+- Redis MULTI/EXEC for token cache update
+- Event emission is part of the transaction
+
+**Fallback**: If Redis write fails, client can retry (idempotent if implemented)
+
+### Agent Revocation Workflow
+
+**Steps**:
+1. Mark agent as revoked in policy store
+2. Invalidate Redis: agent state, device keys, tokens
+3. Emit `agent.revoked` event
+4. Queue webhooks asynchronously
+
+**Atomicity**: Atomic up to cache invalidation
+- Steps 1-3 are atomic (single Redis transaction)
+- Webhooks are async
+
+**Cascade**: Revoking an agent automatically:
+- Deletes all device keys (`device_keys:*:{agent_id}:*`)
+- Invalidates all tokens for that agent
+- No new tokens can be issued
+
+---
+
+## Domain Contracts & Public API Surface
+
+### Identity Domain Public API
+
+```rust
+/// Provision a new agent for the given namespace.
+///
+/// Args:
+/// * `namespace`: Namespace identifier
+/// * `config`: ProvisionConfig (identity, capabilities, ttl)
+/// * `identity_resolver`: For storing identity (optional)
+/// * `clock`: For timestamp injection
+///
+/// Usage:
+/// ```ignore
+/// let agent = identity.provision_agent(
+///     "myapp",
+///     config,
+///     &identity_resolver,
+///     &clock,
+/// ).await?;
+/// ```
+pub async fn provision_agent(
+    namespace: &str,
+    config: ProvisionConfig,
+    identity_resolver: &dyn IdentityResolver,
+    clock: &dyn ClockProvider,
+) -> Result<Agent, ProvisionError>;
+
+/// Revoke an agent (marks as revoked, invalidates cache).
+pub async fn revoke_agent(
+    namespace: &str,
+    agent_id: &str,
+    revoked_by: &str,
+    reason: &str,
+    clock: &dyn ClockProvider,
+) -> Result<(), RevocationError>;
+
+/// Get agent details (cache lookup).
+pub async fn get_agent(namespace: &str, agent_id: &str) -> Result<Agent, NotFoundError>;
+
+/// List agents in namespace (pagination support in fn-89.13).
+pub async fn list_agents(
+    namespace: &str,
+    limit: usize,
+    offset: usize,
+) -> Result<Vec<Agent>, QueryError>;
+```
+
+### Auth Domain Public API
+
+```rust
+/// Validate a bearer token (lookup in tokens cache).
+pub async fn validate_token(
+    namespace: &str,
+    token: &str,
+) -> Result<TokenValidation, AuthError>;
+
+/// Refresh a token (issue new token, invalidate old one).
+pub async fn refresh_token(
+    namespace: &str,
+    agent_id: &str,
+    current_token: &str,
+    ttl_seconds: u64,
+    clock: &dyn ClockProvider,
+) -> Result<String, RefreshError>;
+
+/// Check if agent has a capability.
+pub async fn check_capability(
+    namespace: &str,
+    agent_id: &str,
+    capability: &str,
+) -> Result<bool, AuthError>;
+```
+
+### Compliance Domain Public API
+
+```rust
+/// Emit an audit event (write to audit log + Redis AOF).
+pub async fn emit_audit_event(event: AuditEvent) -> Result<(), StorageError>;
+
+/// Query audit logs with filters.
+pub async fn query_audit_logs(
+    namespace: &str,
+    filter: AuditFilter,
+    limit: usize,
+) -> Result<Vec<AuditEvent>, QueryError>;
+```
+
+### Webhook Domain Public API
+
+```rust
+/// Dispatch a webhook to all registered subscribers.
+pub async fn dispatch_webhook(
+    domain: &str,
+    event: &str,
+    payload: serde_json::Value,
+) -> Result<(), DispatchError>;
+
+/// Register a new webhook subscription.
+pub async fn register_webhook(
+    namespace: &str,
+    url: &str,
+    events: Vec<String>,
+    secret: &str,
+) -> Result<WebhookSubscription, RegistrationError>;
+
+/// List all webhook subscriptions for a namespace.
+pub async fn list_webhooks(namespace: &str) -> Result<Vec<WebhookSubscription>, QueryError>;
+```
+
+---
+
+## Storage Locality Reference
+
+### Redis (Hot Cache)
+
+| Key Pattern | Type | TTL | Usage |
+|---|---|---|---|
+| `agents:{ns}:{agent_id}` | Hash | `agent.expires_at` | Agent state (name, created_at, device keys list) |
+| `device_keys:{ns}:{agent_id}:{device_id}` | Hash | `agent.expires_at` | Device public key + metadata |
+| `tokens:{token_hash}` | Hash | `token.expires_at` | Token metadata (agent_id, capabilities, expires_at) |
+| `webhooks:{webhook_id}` | Hash | None (persistent) | Webhook subscription config (url, events, secret) |
+| `dlq:{domain_name}` | Sorted Set | None (persistent) | Dead-letter queue (failed webhook deliveries, scored by timestamp) |
+
+### Audit Log (Immutable)
+
+- **Redis AOF**: Durability mechanism (fn-89.2)
+- **Audit Log File**: Queryable via `/v1/audit` endpoint (fn-89.14)
+- **Retention**: 90 days (configurable)
+- **Format**: JSONL (one event per line)
+
+### Git (Optional, via IdentityResolver)
+
+- **Path**: `refs/auths/identities/{namespace}/{did}`
+- **Contents**: Human-readable identity metadata
+- **Purpose**: Optional visibility into registered identities
+- **Note**: Not used for runtime lookups (cache-first via Redis)
+
+---
+
+## Domain Dependency Diagram
+
+```
+┌─────────────────────────────────────────────────────┐
+│         auths-api HTTP Routes Layer                 │
+│  /v1/agents, /v1/auth, /v1/audit, /v1/webhooks      │
+└─────────────────┬───────────────────────────────────┘
+                  │
+      ┌───────────┼────────────────────────────┐
+      │           │                            │
+      v           v                            v
+┌──────────────┐  ┌──────────────┐  ┌──────────────┐
+│  Identity    │  │  Auth        │  │  Compliance  │
+│  Domain      │  │  Domain      │  │  Domain      │
+│              │  │              │  │              │
+│ • provision  │  │ • validate   │  │ • audit log  │
+│ • revoke     │  │ • refresh    │  │ • queries    │
+│ • list       │  │ • capability │  │              │
+└──────┬───────┘  └──────┬───────┘  └──────────────┘
+       │                 │               │
+       └─────────────────┼───────────────┘
+                         │
+                 ┌───────┴────────┐
+                 │                │
+                 v                v
+        ┌──────────────┐  ┌──────────────┐
+        │  Webhook     │  │  Redis       │
+        │  Domain      │  │  (Cache)     │
+        │              │  │              │
+        │ • dispatch   │  │ • MULTI/EXEC │
+        │ • register   │  │ • TTL mgmt   │
+        │ • dead-letter│  │ • Sentinel HA│
+        └──────────────┘  └──────┬───────┘
+                                 │
+                                 v
+                          ┌──────────────┐
+                          │ Sentinel HA  │
+                          │ + AOF backup │
+                          └──────────────┘
+```
+
+---
+
+## Key Design Principles
+
+1. **Redis as Source of Truth**: For hot data (agents, tokens). Git is optional (identity visibility only).
+2. **Event-Driven**: All state changes emit events for audit + webhooks.
+3. **Transaction Boundaries**: Atomic up to cache; webhooks are best-effort async.
+4. **TTL-Based Cleanup**: No explicit delete cron; Redis TTL handles cleanup.
+5. **Cascade on Revoke**: Agent revocation cascades to devices and tokens.
+6. **Audit Trail**: All domain events logged for compliance (fn-89.5, fn-89.14).
+
+---
+
+## Integration Checklist (for fn-89.1 onwards)
+
+- [ ] Read this document before starting fn-89.1
+- [ ] Reference Redis keys from "Storage Locality" section
+- [ ] Emit events per "Cross-Domain Event Contracts"
+- [ ] Respect transaction boundaries from "Transaction Boundary Definitions"
+- [ ] Use public APIs from "Domain Contracts & Public API Surface"
+
+---
+
+**Related Tasks**:
+- fn-89.1: Redis Sentinel + failover
+- fn-89.2: AOF backup + point-in-time recovery
+- fn-89.5: Structured audit logging (emit_audit_event)
+- fn-89.9: Token refresh endpoint
+- fn-89.14: Audit query endpoint
+- fn-89.15: Webhook delivery
diff --git a/docs/PRODUCTION_REDIS_HA.md b/docs/PRODUCTION_REDIS_HA.md
new file mode 100644
index 00000000..8b854a51
--- /dev/null
+++ b/docs/PRODUCTION_REDIS_HA.md
@@ -0,0 +1,511 @@
+# Production Redis HA Setup Guide
+
+**Related**: fn-89.1 (Redis Sentinel + failover configuration and docs)
+
+Redis high availability is **critical** for auths-api. This document covers four deployment patterns with increasing operational overhead vs. cost.
+
+---
+
+## Quick Comparison
+
+| Platform | Failover | Backups | Cost | Operational Load |
+|----------|----------|---------|------|------------------|
+| **Managed (Upstash/ElastiCache/Memorystore)** | Automatic | Automatic | $$$ | Minimal |
+| **Self-Hosted EC2 + Sentinel** | Automatic | Manual (fn-89.2) | $ | Medium |
+| **Self-Hosted Docker + Sentinel** | Automatic | Manual | $ | Low (testing) |
+| **Single Master (NOT recommended for production)** | None | Manual | $ | None (risky) |
+
+**Recommendation**: Start with managed (Upstash or AWS ElastiCache) for production. Self-host Sentinel only if you need cost control + accept operational complexity.
+
+---
+
+## Architecture Overview
+
+### Managed Services (Upstash, ElastiCache, Memorystore)
+
+```
+┌─────────────────────────────────┐
+│      auths-api (replicas)       │
+│  (multiple availability zones)  │
+└────────────┬────────────────────┘
+             │ Connect to service endpoint
+             │ (auto-discovers master)
+             v
+    ┌────────────────────┐
+    │ Managed Redis HA   │
+    │ (Master + Replicas)│
+    │ - Auto-failover    │
+    │ - Auto-backups     │
+    │ - Monitoring       │
+    └────────────────────┘
+```
+
+### Self-Hosted (EC2/Kubernetes + Sentinel)
+
+```
+┌──────────────────────────────────────────────────┐
+│       auths-api (multiple pods/instances)        │
+│  (Kubernetes or EC2 Auto Scaling Group)          │
+└────────────┬─────────────────────────────────────┘
+             │ Connect to Sentinel (quorum)
+             │
+     ┌───────┴────────────┐
+     │                    │
+     v                    v
+┌─────────────┐     ┌─────────────┐     ┌─────────────┐
+│  Sentinel 1 │     │  Sentinel 2 │     │  Sentinel 3 │
+│ (port 26379)│     │ (port 26379)│     │ (port 26379)│
+└──────┬──────┘     └──────┬──────┘     └──────┬──────┘
+       │ monitors         │ monitors         │ monitors
+       │                  │                  │
+       └──────────────────┼──────────────────┘
+                          │ quorum (2 of 3)
+             ┌────────────┴────────────┐
+             │                         │
+             v                         v
+        ┌─────────────┐           ┌──────────────┐
+        │ Redis       │ replicates│  Redis       │
+        │ Master      │to         │  Replica 1   │
+        └─────────────┘           └──────────────┘
+             │ replicates to
+             v
+        ┌──────────────┐
+        │  Redis       │
+        │  Replica 2   │
+        └──────────────┘
+```
+
+---
+
+## Platform 1: AWS ElastiCache (Recommended for AWS)
+
+### Setup
+
+1. **Create Redis Cluster with Multi-AZ Failover**:
+   ```bash
+   aws elasticache create-replication-group \
+     --replication-group-id auths-api-cache \
+     --replication-group-description "auths-api-cache" \
+     --engine redis --engine-version 7.0 \
+     --cache-node-type cache.r6g.xlarge \
+     --num-cache-clusters 3 \
+     --automatic-failover-enabled \
+     --multi-az-enabled \
+     --at-rest-encryption-enabled \
+     --transit-encryption-enabled \
+     --auth-token "your-secure-token-here"
+   ```
+
+2. **Retrieve Endpoint**:
+   ```bash
+   aws elasticache describe-replication-groups \
+     --replication-group-id auths-api-cache \
+     --query 'ReplicationGroups[0].NodeGroups[0].PrimaryEndpoint'
+   ```
+   Returns the primary endpoint (`Address` and `Port`), e.g. `auths-api-cache.abc123.ng.0001.use1.cache.amazonaws.com:6379`
+
+3. **Security Group**: Allow inbound on port 6379 from auths-api security group.
+
+### Configuration
+
+In auths-api config (e.g., `config/redis.toml`):
+```toml
+[redis]
+endpoint = "rediss://:<auth-token>@auths-api-cache.abc123.ng.0001.use1.cache.amazonaws.com:6379"
+# ElastiCache handles replication + failover automatically
+# Use `rediss://` (TLS) because transit encryption is enabled; the auth token goes in the password position
+```
+
+### Failover Behavior
+
+- **Detection Time**: ~15-30s (AWS-managed)
+- **RTO** (Recovery Time Objective): < 1 minute
+- **Automatic**: No manual intervention needed
+- **Transparency**: Connection string remains valid during failover
+
+### Backups
+
+```bash
+# Automatic snapshots (can configure retention)
+aws elasticache create-snapshot \
+  --replication-group-id auths-api-cache \
+  --snapshot-name auths-api-backup-$(date +%Y%m%d)
+
+# Point-in-time recovery via automated snapshots
+# (See fn-89.2 for AOF backup strategy)
+```
+
+### Cost
+
+- `cache.r6g.xlarge` (8GB): ~$0.35/hour (~$250/month) × 3 nodes = **~$750/month**
+- Multi-AZ: +10% cost
+- Data transfer: varies (typically $0.01/GB out)
+- **Total**: ~$800-1000/month for typical workload
+
+---
+
+## Platform 2: Upstash (Recommended for Cost-Conscious / Serverless)
+
+### Setup
+
+1. **Create Redis Database**:
+   - Go to https://console.upstash.com/redis
+   - Click "Create Database"
+   - Region: Select closest to app (US-East, EU-West, etc.)
+   - Eviction Policy: `allkeys-lru` (for cache, safe to evict)
+   - Enable "Max Retries" for client resilience
+
+2. **Copy Connection String**:
+   ```
+   redis://default:your-auth-token@your-region-xxxxx.upstash.io:xxxxx
+   ```
+
+### Configuration
+
+In auths-api config:
+```toml
+[redis]
+endpoint = "redis://default:your-auth-token@your-region-xxxxx.upstash.io:xxxxx"
+# Upstash provides automatic failover via managed infrastructure
+```
+
+### Failover Behavior
+
+- **Detection Time**: ~5-10s (Upstash-managed)
+- **RTO**: < 30s
+- **Automatic**: Fully managed, no intervention
+- **Transparency**: Connection string remains valid
+
+### Backups
+
+Upstash provides:
+- Automatic 24-hour retention snapshots
+- Point-in-time recovery (with premium tier)
+- Daily backups (backup tier)
+
+```bash
+# No manual backups needed; configure via Upstash console
+# Premium: Enable backup for point-in-time recovery
+```
+
+### Cost
+
+- **Free Tier**: 10,000 commands/day, 256MB, single replica
+- **Starter**: $9/month (1GB, Infra Multi-Master Replication)
+- **Pro**: $199/month (16GB)
+- **Enterprise**: Contact sales
+- **Recommended for auths-api**: Pro or Enterprise
+
+---
+
+## Platform 3: GCP Memorystore (Recommended for Google Cloud)
+
+### Setup
+
+1. **Create Redis Instance**:
+   ```bash
+   gcloud redis instances create auths-api-cache \
+     --size=4 \
+     --region=us-central1 \
+     --tier=standard \
+     --redis-version=redis_7_0 \
+     --enable-auth \
+     --zone=us-central1-a
+   ```
+
+2. **Retrieve Connection Info**:
+   ```bash
+   gcloud redis instances describe auths-api-cache \
+     --region=us-central1
+   ```
+   Returns: `host` (IP only, no DNS) and `port`
+
+3. **Network**: Redis is private to VPC; auths-api must be in same VPC.
+
+### Configuration
+
+In auths-api config:
+```toml
+[redis]
+endpoint = "redis://default:your-auth-password@10.0.0.3:6379"
+# Note: Memorystore uses IP addresses, not DNS names
+```
+
+### Failover Behavior
+
+- **Detection Time**: ~30s (automatic)
+- **RTO**: < 1 minute
+- **Automatic**: Standard tier provides automatic failover
+- **Transparency**: Connection via private IP
+
+### Backups
+
+```bash
+# Manual snapshots (RDB export to a Cloud Storage bucket)
+gcloud redis instances export \
+  gs://your-backup-bucket/auths-api-cache-$(date +%Y%m%d).rdb \
+  auths-api-cache --region=us-central1
+
+# Scheduled backups (backup tier)
+# Set retention in GCP console
+```
+
+### Cost
+
+- **Basic tier (no HA)**: $0.11/GB/month × 4GB = ~$44/month
+- **Standard tier (HA, cross-zone replica)**: +100% cost = ~$88/month
+- **Data transfer**: Free within GCP, $0.12/GB out to internet
+- **Recommended for auths-api**: Standard tier (~$88/month)
+
+---
+
+## Platform 4: Self-Hosted (EC2 + Sentinel)
+
+Use this **only** if:
+- You must minimize cloud costs
+- You have ops expertise for Redis + Sentinel management
+- Your organization already manages self-hosted Redis
+
+### Prerequisites
+
+- 3 EC2 instances (t3.large) in different availability zones
+  - One for Redis Master
+  - Two for Redis Replicas
+  - One Sentinel process on each of the 3 instances (co-located with Redis)
+- Redis 7.0+ installed
+- Sentinel config from `crates/auths-deployment/config/sentinel.conf`
+
+### Setup
+
+1. **Install Redis on all 3 instances**:
+   ```bash
+   # On all instances:
+   sudo yum install redis -y
+   sudo systemctl enable redis
+   sudo systemctl start redis
+   ```
+
+2. **Configure Master** (first instance):
+   - Edit `/etc/redis.conf`:
+     ```
+     port 6379
+     bind 0.0.0.0
+     appendonly yes
+     requirepass your-redis-password
+     ```
+
+3. **Configure Replicas** (second and third instances):
+   ```
+   port 6379
+   bind 0.0.0.0
+   replicaof <master-ip> 6379
+   requirepass your-redis-password
+   masterauth your-redis-password
+   appendonly yes
+   ```
+
+4. **Deploy Sentinel** (all 3 instances):
+   ```bash
+   # Copy sentinel.conf from crates/auths-deployment/config/sentinel.conf
+   sudo cp sentinel.conf /etc/redis-sentinel.conf
+   sudo chown redis:redis /etc/redis-sentinel.conf
+
+   # Edit /etc/redis-sentinel.conf:
+   # - Change bind to specific IP or 0.0.0.0
+   # - Set `sentinel down-after-milliseconds mymaster 30000` (30s)
+   # - Set `sentinel parallel-syncs mymaster 1`
+
+   sudo redis-sentinel /etc/redis-sentinel.conf
+   ```
+
+5. **Test Failover**:
+   ```bash
+   # Run test script (see fn-89.1)
+   ./crates/auths-deployment/scripts/test-sentinel-failover.sh
+   ```
+
+### Configuration
+
+In auths-api config:
+```toml
+[redis]
+# Sentinel discovery (client resolves master dynamically)
+endpoint = "redis-sentinel://user:password@sentinel1:26379,sentinel2:26379,sentinel3:26379?service_name=mymaster"
+```
+
+### Failover Behavior
+
+- **Detection Time**: ~30s (configurable)
+- **RTO**: ~1 minute
+- **Manual Intervention**: Monitor Sentinel; no auto-healing for failed machines
+- **Operational Overhead**: 2-4 hours/month (monitoring, updates, troubleshooting)
+
+### Backups
+
+Manual via `redis-cli` or AOF (see fn-89.2):
+```bash
+# Manual snapshot
+redis-cli BGSAVE
+
+# AOF (automatic incremental backups)
+# Enable in redis.conf: appendonly yes
+# See fn-89.2 for point-in-time recovery
+```
+
+### Cost
+
+- **EC2 (3 × t3.large)**: $0.10/hour × 3 = **$215/month**
+- **Elastic IPs (3)**: ~$1/month
+- **EBS storage (3 × 100GB)**: ~$15/month
+- **Ops burden**: 2-4 hours/month
+- **Total**: ~$230/month + ops time
+
+---
+
+## Connection Resilience
+
+### Client-Side Retry Logic
+
+All auths-api clients must implement exponential backoff on Redis connection failures:
+
+```rust
+// Pseudocode for auths-api client
+const MAX_RETRIES: usize = 3;
+const INITIAL_BACKOFF: Duration = Duration::from_millis(100);
+
+async fn connect_with_retry() -> Result<RedisClient> {
+    for attempt in 0..MAX_RETRIES {
+        match redis_client.connect().await {
+            Ok(client) => return Ok(client),
+            Err(e) => {
+                let backoff = INITIAL_BACKOFF * 2u32.pow(attempt as u32);
+                log::warn!("Redis connect failed (attempt {}): {}, retry in {:?}",
+                    attempt, e, backoff);
+                sleep(backoff).await;
+            }
+        }
+    }
+    Err(anyhow::anyhow!("Failed to connect after {} attempts", MAX_RETRIES))
+}
+```
+
+### Domain Entity Resilience (fn-89.0)
+
+Redis caches these auths-api entities:
+- `agents:{namespace}:{agent_id}` (agent state, TTL = agent.expires_at)
+- `tokens:{token_hash}` (token metadata, TTL = token.expires_at)
+- `device_keys:*` (device keys, TTL = agent expiry)
+
+**On Redis unavailability** (fn-89.3 circuit breaker):
+- **Authorization queries** (token validation): Return 503 Service Unavailable
+- **Cache miss on agent lookup**: 503 (can't validate without cache)
+- **Reads from replicas**: Fail over to secondary cache if available
+
+---
+
+## Monitoring & Alerting
+
+### Key Metrics (fn-89.12)
+
+For any platform, monitor:
+- **Replication lag**: < 1 second (normal), > 5s (alert)
+- **Master failover count**: Should be 0-1/month (normal), > 3/month (investigate)
+- **Connection pool health**: % connections alive (target: > 95%)
+- **Cache hit ratio**: Should be > 90% for auths agents/tokens
+- **Memory usage**: < 80% of allocated (auto-eviction at 100%)
+
+### Alerting
+
+Example Prometheus rules (fn-89.12):
+```yaml
+- alert: RedisMasterDown
+  expr: redis_up{role="master"} == 0
+  for: 30s
+  labels: { severity: page }      # page on-call
+
+- alert: RedisReplicationLag
+  expr: redis_replication_lag_bytes > 5242880  # 5MB
+  for: 2m
+  labels: { severity: warning }   # alert (not page)
+
+- alert: RedisMemoryHigh
+  expr: redis_memory_usage_percent > 80
+  for: 5m
+  labels: { severity: warning }   # alert (check if cache needs size increase)
+```
+
+---
+
+## Disaster Recovery
+
+### Recovery Time Objectives (RTO)
+
+| Failure Scenario | Managed | Self-Hosted |
+|---|---|---|
+| Master crashes | 1-2 minutes | ~1 minute (~30s Sentinel detection + automatic failover) |
+| Entire region down | 5-10 minutes | Data loss (replicate to backup region) |
+| Corrupted data | 24 hours (backup restore) | 24+ hours (manual restore from AOF) |
+
+### Backup Strategy (fn-89.2)
+
+- **Managed services**: Automatic daily snapshots (retention: 30 days)
+- **Self-hosted**: AOF (append-only file) + daily snapshots to S3/GCS
+- **Testing**: Monthly restore from backup to validation environment
+
+---
+
+## Decision Tree: Which Platform?
+
+```
+┌─ AWS User?
+│  └─→ Use AWS ElastiCache
+│      (most integrated, auto-failover, managed backups)
+│
+├─ Google Cloud User?
+│  └─→ Use GCP Memorystore (Standard tier, which provides HA)
+│      (best for Kubernetes on GKE)
+│
+├─ Serverless / Multi-cloud?
+│  └─→ Use Upstash
+│      (cheapest managed option, no infra)
+│
+└─ On-premises / Self-hosted required?
+   └─→ Use EC2 + Sentinel
+       (cheapest, highest ops burden)
+```
+
+---
+
+## Testing & Validation
+
+### Local Testing (Docker Compose)
+
+```bash
+# Start Sentinel cluster
+./crates/auths-deployment/scripts/start-sentinel.sh local
+
+# Run failover tests
+./crates/auths-deployment/scripts/test-sentinel-failover.sh
+
+# Verify client retries on master kill
+# (see test output)
+```
+
+### Production Validation (Chaos Engineering)
+
+For self-hosted:
+1. Kill master in off-hours
+2. Verify failover completes within the ~1 minute RTO (detection alone takes ~30s)
+3. Verify client reconnects without request loss
+4. Verify new master has all data
+5. Document incident in runbook
+
+---
+
+## References
+
+- [AWS ElastiCache User Guide](https://docs.aws.amazon.com/elasticache/)
+- [Upstash Documentation](https://upstash.com/docs)
+- [GCP Memorystore User Guide](https://cloud.google.com/memorystore/docs)
+- [Redis Sentinel Documentation](https://redis.io/docs/management/sentinel/)
+- Related: fn-89.0 (Domain Architecture), fn-89.2 (AOF Backups), fn-89.12 (Monitoring)
diff --git a/docs/REDIS_AOF_BACKUP.md b/docs/REDIS_AOF_BACKUP.md
new file mode 100644
index 00000000..8b980fbe
--- /dev/null
+++ b/docs/REDIS_AOF_BACKUP.md
@@ -0,0 +1,461 @@
+# Redis AOF Backup & Point-in-Time Recovery
+
+**Related**: fn-89.2 (AOF backup automation and point-in-time recovery)
+
+This document covers automated AOF (Append-Only File) backup strategy, point-in-time recovery procedures, and monitoring for auths-api Redis.
+
+---
+
+## Overview
+
+**Why AOF?**
+- **Durability**: Survives crashes; captures every write operation
+- **Granularity**: Point-in-time recovery to any moment in time
+- **Compliance**: Immutable audit trail for audit events (fn-89.5)
+
+**Configuration**:
+```
+appendonly yes                         # Enable AOF
+appendfsync everysec                   # Fsync every 1 second (balance between durability + performance)
+auto-aof-rewrite-percentage 100        # Rewrite when AOF grows 100% since last rewrite
+auto-aof-rewrite-min-size 64mb         # Don't rewrite unless > 64MB
+```
+
+---
+
+## Architecture
+
+### Data Flow
+
+```
+┌────────────────┐
+│   auths-api    │
+│  (writes data) │
+└────────┬───────┘
+         │ Redis WRITE command
+         v
+    ┌─────────────────────────────┐
+    │ Redis Master                │
+    │ • appendonly.aof (disk)     │
+    │ • AOF rewrite (compression) │
+    │ • BGSAVE (snapshot)         │
+    └─────┬───────────────────────┘
+          │ Replication
+          v
+    ┌──────────────┐
+    │ Replica 1    │
+    │ + Replica 2  │
+    └──────────────┘
+
+    AOF grows over time:
+    ┌─────────────────────────────────────────┐
+    │ appendonly.aof (~1KB per agent + events)│
+    │                                         │
+    │ Daily growth: ~50-100MB (10k agents)    │
+    │ Monthly size: ~1.5-3GB                  │
+    └─────────────────────────────────────────┘
+
+    ↓ Daily backup job (2am UTC)
+
+    ┌──────────────────────────────────────┐
+    │ S3 Backups (gzip compressed)        │
+    │ • redis-aof-20260329_020000.aof.gz  │
+    │ • Compression: ~100-200MB/day       │
+    │ • Retention: 30 days (~6GB storage) │
+    └──────────────────────────────────────┘
+```
+
+### Fsync Strategy Tradeoff
+
+| Fsync Strategy | Durability | Performance | Data Loss Risk |
+|---|---|---|---|
+| `everysec` (default) | Good | Minimal overhead | Max 1s of data (acceptable) |
+| `always` | Best | Slowest (fsync on every write) | None |
+| `no` | Worst | Best | May lose minutes of writes |
+
+**Recommendation for auths-api**: `appendfsync everysec`
+- Domain entities cached in Redis (agents, tokens) have TTL
+- Token expiry is the authoritative source, not the AOF
+- 1s durability window acceptable for agent state
+
+---
+
+## Backup Automation
+
+### Daily Backup Script
+
+**Location**: `crates/auths-deployment/scripts/backup-redis-aof.sh`
+
+**Process**:
+1. Verify Redis connectivity
+2. Trigger AOF rewrite (`BGREWRITEAOF`) to compact the file
+3. Copy the compacted AOF file
+4. Compress with gzip and upload to S3
+5. Apply retention policy (delete backups >30 days old)
+6. Log success/failure to CloudWatch
+
+**Cron Job Setup**:
+```bash
+# In production EC2/Kubernetes:
+0 2 * * * cd /app && AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 6379 >> /var/log/redis-backup.log 2>&1
+
+# With error notification:
+0 2 * * * cd /app && ./backup-redis-aof.sh localhost 6379 || alert-oncall "Redis backup failed"
+```
+
+**Example Run**:
+```bash
+$ AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 6379
+[2026-03-29 02:00:00] [INFO] Verifying Redis connectivity (localhost:6379)...
+[2026-03-29 02:00:00] [INFO] Redis reachable ✓
+[2026-03-29 02:00:00] [INFO] Triggering AOF rewrite (compaction)...
+[2026-03-29 02:00:00] [INFO] Waiting for AOF rewrite...
+[2026-03-29 02:00:02] [INFO] AOF rewrite completed
+[2026-03-29 02:00:03] [INFO] Copying AOF to temporary location...
+[2026-03-29 02:00:05] [INFO] Compressing AOF...
+[2026-03-29 02:00:08] [INFO] Compressed AOF size: 125MB
+[2026-03-29 02:00:10] [INFO] Uploading to S3: s3://auths-redis-backups/backups/redis-aof-20260329_020000.aof.gz
+[2026-03-29 02:00:15] [INFO] ✓ Backup uploaded to S3
+[2026-03-29 02:00:16] [INFO] Applying retention policy (keeping 30 days)...
+[2026-03-29 02:00:17] [INFO] ✓ Backup completed successfully
+[2026-03-29 02:00:17] [INFO] Summary:
+[2026-03-29 02:00:17] [INFO]   Timestamp: 20260329_020000
+[2026-03-29 02:00:17] [INFO]   Size: 125MB
+[2026-03-29 02:00:17] [INFO]   Location: s3://auths-redis-backups/backups/redis-aof-20260329_020000.aof.gz
+[2026-03-29 02:00:17] [INFO]   Redis: localhost:6379
+```
+
+### S3 Bucket Setup
+
+```bash
+# Create S3 bucket with versioning + lifecycle
+aws s3api create-bucket \
+  --bucket auths-redis-backups \
+  --region us-east-1
+
+# Enable versioning
+aws s3api put-bucket-versioning \
+  --bucket auths-redis-backups \
+  --versioning-configuration Status=Enabled
+
+# Lifecycle policy: delete old backups after 30 days
+cat > lifecycle.json << 'EOF'
+{
+  "Rules": [
+    {
+      "ID": "DeleteOldBackups",
+      "Status": "Enabled",
+      "Prefix": "backups/",
+      "Expiration": {
+        "Days": 30
+      },
+      "NoncurrentVersionExpiration": {
+        "NoncurrentDays": 7
+      }
+    }
+  ]
+}
+EOF
+
+aws s3api put-bucket-lifecycle-configuration \
+  --bucket auths-redis-backups \
+  --lifecycle-configuration file://lifecycle.json
+```
+
+### IAM Role
+
+Needed for EC2/EKS to upload backups:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "s3:PutObject",
+        "s3:GetObject",
+        "s3:ListBucket"
+      ],
+      "Resource": [
+        "arn:aws:s3:::auths-redis-backups",
+        "arn:aws:s3:::auths-redis-backups/*"
+      ]
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "cloudwatch:PutMetricData"
+      ],
+      "Resource": "*"
+    }
+  ]
+}
+```
+
+---
+
+## Point-in-Time Recovery
+
+### Manual Recovery Procedure
+
+**Location**: `crates/auths-deployment/scripts/restore-redis-aof.sh`
+
+**Scenarios**:
+
+#### 1. Recover Latest Backup
+```bash
+# Restore most recent backup
+./restore-redis-aof.sh latest localhost 6379
+
+# OR the latest backup taken on a specific date
+./restore-redis-aof.sh latest localhost 6379 2026-03-28
+
+# Output:
+# [INFO] Finding latest backup...
+# [INFO] Using: s3://auths-redis-backups/backups/redis-aof-20260329_020000.aof.gz
+# [INFO] Downloading backup...
+# [INFO] ✓ Backup downloaded
+# [INFO] Decompressing...
+# [INFO] Validating AOF integrity...
+# [INFO] Backing up current AOF...
+# [INFO] Stopping Redis...
+# [INFO] ✓ Redis stopped
+# [INFO] Replacing AOF file...
+# [INFO] ✓ AOF replaced
+# [INFO] Starting Redis...
+# [INFO] ✓ Redis responding
+# [INFO] Database size: 10247 keys
+# [INFO] Memory usage: 512.5M
+```
+
+#### 2. Recover Specific Backup Date
+```bash
+# List backups from specific date
+aws s3api list-objects-v2 \
+  --bucket auths-redis-backups \
+  --prefix "backups/redis-aof-20260325" \
+  --region us-east-1
+
+# Restore specific backup
+./restore-redis-aof.sh s3://auths-redis-backups/backups/redis-aof-20260325_020000.aof.gz
+```
+
+#### 3. Recover from Local File
+```bash
+./restore-redis-aof.sh /backups/redis-aof-20260325.aof.gz localhost 6379
+```
+
+### Recovery Time
+
+| Scenario | RTO | Notes |
+|---|---|---|
+| Latest backup | < 5 minutes | Download + decompress + verify + start |
+| 7-day-old backup | < 10 minutes | Larger S3 download |
+| Full month recovery | < 15 minutes | Limited by decompression + Redis startup |
+
+### Testing Recovery
+
+**Monthly Recovery Drill** (1st of each month):
+```bash
+#!/bin/bash
+# Monthly point-in-time recovery test
+
+echo "Recovery Drill: $(date)"
+
+# 1. Identify a backup from 7 days ago
+RECOVERY_DATE=$(date -u -d "7 days ago" +%Y-%m-%d)
+echo "Recovering backup from $RECOVERY_DATE..."
+
+# 2. Start test Redis on alternate port
+TEST_REDIS_PORT=6380
+redis-server --port $TEST_REDIS_PORT &
+sleep 2
+
+# 3. Restore backup
+./restore-redis-aof.sh latest localhost $TEST_REDIS_PORT $RECOVERY_DATE
+
+# 4. Verify data
+TEST_DBSIZE=$(redis-cli -p $TEST_REDIS_PORT dbsize | grep -oE '[0-9]+')
+EXPECTED_AGENTS=$(redis-cli -p 6379 dbsize | grep -oE '[0-9]+')
+
+echo "Keys in restored backup: $TEST_DBSIZE"
+echo "Keys in current data: $EXPECTED_AGENTS"
+
+if [[ $TEST_DBSIZE -gt 0 ]]; then
+  echo "✓ Recovery test PASSED"
+else
+  echo "✗ Recovery test FAILED"
+fi
+
+# 5. Cleanup
+redis-cli -p $TEST_REDIS_PORT shutdown
+```
+
+---
+
+## Monitoring & Alerting
+
+### CloudWatch Metrics
+
+Backup script automatically publishes:
+
+| Metric | Unit | Threshold | Action |
+|---|---|---|---|
+| `backup-size-mb` | MB | > 1000 | Alert (investigate disk usage) |
+| `backup-success` | 0/1 | = 0 | Page oncall (backup failed) |
+| `backup-duration-seconds` | Seconds | > 300 | Investigate (timeout) |
+| `last-backup-age-hours` | Hours | > 25 | Alert (backup job missed) |
+
+**CloudWatch Dashboard**:
+```json
+{
+  "widgets": [
+    {
+      "type": "metric",
+      "properties": {
+        "metrics": [
+          ["auths/redis", "backup-size-mb"],
+          ["auths/redis", "backup-success"],
+          ["auths/redis", "last-backup-age-hours"]
+        ],
+        "period": 300,
+        "stat": "Average",
+        "region": "us-east-1",
+        "title": "Redis Backup Health"
+      }
+    }
+  ]
+}
+```
+
+### Alarms
+
+```bash
+# Backup failure alarm (fires when no successful backup is reported in a 24h window)
+aws cloudwatch put-metric-alarm \
+  --alarm-name redis-backup-failed \
+  --alarm-actions "arn:aws:sns:us-east-1:123456789:oncall" \
+  --metric-name backup-success \
+  --namespace auths/redis \
+  --statistic Sum \
+  --period 86400 \
+  --evaluation-periods 1 \
+  --threshold 1 \
+  --comparison-operator LessThanThreshold \
+  --treat-missing-data breaching
+
+# Backup size alarm
+aws cloudwatch put-metric-alarm \
+  --alarm-name redis-backup-size-high \
+  --alarm-actions "arn:aws:sns:us-east-1:123456789:alerts" \
+  --metric-name backup-size-mb \
+  --namespace auths/redis \
+  --statistic Maximum \
+  --period 300 \
+  --evaluation-periods 1 \
+  --threshold 1000 \
+  --comparison-operator GreaterThanThreshold
+```
+
+---
+
+## AOF Rewrite
+
+The AOF grows over time as commands accumulate. Redis periodically rewrites (compacts) it automatically.
+
+### Manual Rewrite
+
+```bash
+# Trigger background rewrite (safe, doesn't block)
+redis-cli BGREWRITEAOF
+
+# Monitor progress
+redis-cli info persistence | grep aof_rewrite
+# Output: aof_rewrite_in_progress:0 (complete)
+```
+
+### Automatic Rewrite
+
+Configured in `sentinel.conf`:
+```
+auto-aof-rewrite-percentage 100  # Rewrite when AOF grows 100% since last rewrite
+auto-aof-rewrite-min-size 64mb   # Don't rewrite unless > 64MB
+```
+
+**Example**:
+- Last rewrite produced 50MB AOF
+- AOF grows to 100MB (100% growth)
+- Redis triggers automatic rewrite
+- New AOF compressed to ~50MB again
+
+---
+
+## Retention Policy
+
+**Default**: 30-day rolling window
+
+**Rationale**:
+- Covers 1 month of history (good for monthly recovery drills)
+- Minimal S3 cost (~6GB stored; well under $1/month at S3 Standard rates)
+- Weekly snapshots archived separately (fn-90 for long-term archive)
+
+**Adjust if needed**:
+```bash
+# 60-day retention
+BACKUP_RETENTION_DAYS=60 ./backup-redis-aof.sh
+
+# S3 lifecycle policy update
+aws s3api put-bucket-lifecycle-configuration \
+  --bucket auths-redis-backups \
+  --lifecycle-configuration '{"Rules": [{"ID": "DeleteAfter60Days", "Status": "Enabled", "Filter": {"Prefix": "backups/"}, "Expiration": {"Days": 60}, "NoncurrentVersionExpiration": {"NoncurrentDays": 7}}]}'
+```
+
+---
+
+## Troubleshooting
+
+### AOF File Corruption
+
+**Symptom**: `Bad file format` when Redis starts
+
+**Recovery**:
+```bash
+# AOF check tool (Redis 7.0+)
+redis-check-aof --fix /var/lib/redis/appendonly.aof
+
+# Or manual recovery
+./restore-redis-aof.sh latest  # Restore from backup
+```
+
+### Backup Upload Timeout
+
+**Symptom**: Backup script fails at S3 upload
+
+**Solutions**:
+```bash
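+# Increase timeout in script (line 60)
+aws s3 cp ... --region ... --cli-read-timeout 300 --cli-connect-timeout 60
+
+# Or raise the retry budget (aws s3 cp already uses multipart upload for large files)
+AWS_RETRY_MODE=standard AWS_MAX_ATTEMPTS=10 aws s3 cp ... --region ...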
+```
+
+### Replication Lag After Recovery
+
+**Symptom**: Replicas out of sync after restore
+
+**Recovery**:
+```bash
+# Force replica resync
+redis-cli -h replica slaveof no one  # Stop replicating
+redis-cli -h replica slaveof master 6379  # Resume from scratch
+
+# Monitor sync
+redis-cli -h replica info replication | grep sync
+```
+
+---
+
+## References
+
+- [Redis Persistence](https://redis.io/topics/persistence)
+- [Redis AOF Format](https://redis.io/topics/protocol)
+- Related: fn-89.0 (Domain Architecture), fn-89.1 (Sentinel HA), fn-89.3 (Circuit Breaker)
diff --git a/docs/plans/api_plans.md b/docs/plans/api_plans.md
new file mode 100644
index 00000000..2288c88c
--- /dev/null
+++ b/docs/plans/api_plans.md
@@ -0,0 +1,1113 @@
+# auths-api: Product & Implementation Roadmap
+
+## fn-89 Foundation: What It Enables
+
+The fn-89 epic (domain-driven architecture, fn-89.0 contracts) establishes the **foundational layers** for auths-api:
+
+**What fn-89 Delivers**:
+- **Domain clarity**: identity, auth, compliance, webhooks domains with explicit ownership
+- **Transaction safety**: bootstrap/provisioning workflows with atomicity guarantees
+- **Event-driven architecture**: all domain operations emit webhooks (provision, revoke, expire, refresh)
+- **Observability**: per-domain metrics, Grafana dashboards, SLO-based alerting
+- **SDK parity**: Rust + Python SDKs mirror domain structure (users understand via domain concepts)
+- **Scalability foundation**: sharding strategy, per-shard failover, horizontal deployment patterns
+
+**Market Positioning**:
+- Supply chain security (fintech, infra platforms, critical OSS)
+- Multi-tenant SaaS with cryptographic delegation (orgs provision agents for services)
+- Audit-driven security (full event trail with domain event sourcing)
+
+## Roadmap Overview
+
+After fn-89, auths-api is **provisionally deployable** but **functionally limited**. The roadmap builds on this foundation to unlock strategic use cases:
+
+| Epic | Use Case | Complexity | Value |
+|------|----------|-----------|-------|
+| fn-100 | Policy-driven agent provisioning | High | Very High |
+| fn-101 | Artifact attestation & verification | Medium | Very High |
+| fn-102 | Key rotation & renewal automation | Medium | High |
+| fn-103 | Approval workflows (sensitive ops) | Medium | High |
+| fn-104 | Agent quotas & rate limiting | Low | Medium |
+| fn-105 | Multi-org federation & cross-org delegation | Very High | High |
+| fn-106 | Compliance & audit export (SOC2, FedRAMP) | Medium | High |
+| fn-107 | Agent analytics & usage observability | Low | Medium |
+
+---
+
+## fn-100: Policy-Driven Agent Provisioning
+
+**Goal**: Orgs define rules that automatically provision agents based on namespace config, without manual admin intervention.
+
+**Use Case**:
+- Org admin: "Whenever a CI pipeline starts in namespace X, auto-provision a ci-runner agent with signing + artifact capabilities, TTL 1 hour"
+- Org admin: "Allow developers to self-provision personal agents for CLI use, limited to read-only capabilities"
+- Org admin: "Revoke all agents in namespace Y that haven't been used in 30 days"
+
+### Sub-task fn-100.1: Policy Schema & Evaluation Engine
+
+**Description**: Define policy language and evaluation logic for agent provisioning rules.
+
+**Deliverables**:
+- Policy schema (JSON): trigger rules, agent templates, capability grants
+- Policy evaluator: given namespace context, determine which agents to provision
+- Admin API: `POST /v1/policies { namespace, rules, [triggers] }`
+
+**Pseudo-code**:
+```rust
+// Policy schema
+pub struct AgentPolicy {
+    namespace: String,
+    rules: Vec<PolicyRule>,
+}
+
+pub enum PolicyTrigger {
+    OnNamespaceBoot { }, // when namespace initializes
+    OnCiPipelineStart { ci_platform: String }, // "github", "gitlab"
+    OnDeveloperLogin { }, // when human logs in
+    OnSchedule { cron: String }, // "0 2 * * *" = daily 2am
+}
+
+pub struct PolicyRule {
+    name: String,
+    trigger: PolicyTrigger,
+    condition: String, // "namespace.platform == 'github' && team == 'infra'"
+    agent_template: AgentTemplate,
+}
+
+pub struct AgentTemplate {
+    name_pattern: String, // "ci-runner-{platform}-{id}"
+    capabilities: Vec<String>, // ["sign_artifacts", "publish_releases"]
+    ttl_seconds: u64,
+    rotation_period: Option<u64>, // auto-rotate every N seconds
+}
+
+// Evaluator
+pub async fn evaluate_policy(
+    namespace: &str,
+    policy: &AgentPolicy,
+    trigger: &PolicyTrigger,
+    context: &PolicyContext, // env vars, CI platform info, etc.
+) -> Result<Vec<AgentTemplate>> {
+    // 1. Filter rules by trigger type
+    // 2. Evaluate conditions against context
+    // 3. Return matching templates
+}
+
+pub async fn apply_policy(
+    namespace: &str,
+    templates: Vec<AgentTemplate>,
+    identity_service: &dyn IdentityService,
+) -> Result<Vec<Agent>> {
+    // 1. For each template, provision agent
+    // 2. Emit policy.agent_provisioned event (webhook)
+    // 3. Log to compliance domain
+}
+```
+
+**Acceptance Criteria**:
+- Policy schema supports at least 4 trigger types (boot, ci_start, login, schedule)
+- Condition evaluator handles namespace context, env vars, user attributes
+- Policy rules can grant multiple capabilities
+- Admin can list, update, delete policies
+- Policy changes take effect immediately (no restart)
+
+---
+
+### Sub-task fn-100.2: Scheduled Policy Evaluation (Cron-like)
+
+**Description**: Periodic evaluation of policies (e.g., "revoke unused agents daily").
+
+**Deliverables**:
+- Background job: periodic policy evaluation based on cron schedule
+- Metrics: policies evaluated/hour, agents auto-provisioned, agents auto-revoked
+- Admin endpoint to trigger manual evaluation
+
+**Pseudo-code**:
+```rust
+pub struct ScheduledPolicy {
+    policy_id: String,
+    schedule: String, // cron expression
+}
+
+pub async fn scheduled_policy_evaluator(
+    policies: Arc<Vec<ScheduledPolicy>>,
+    scheduler: &dyn Scheduler,
+) {
+    for policy in policies.iter() {
+        scheduler.schedule(
+            policy.schedule.clone(),
+            move || {
+                Box::pin(async {
+                    let templates = evaluate_policy(&policy).await?;
+                    apply_policy(templates).await?;
+                })
+            },
+        ).await?;
+    }
+}
+
+// Example: auto-revoke unused agents
+pub async fn revoke_unused_agents(
+    namespace: &str,
+    threshold_days: u64,
+) -> Result<Vec<String>> {
+    // 1. Query audit logs: which agents haven't been used in threshold_days
+    // 2. Batch revoke them
+    // 3. Emit agent.revoked events (webhooks)
+    // 4. Return revoked agent IDs
+}
+```
+
+**Acceptance Criteria**:
+- Cron-based scheduling works (daily, hourly, etc.)
+- Unused agent cleanup runs reliably
+- Metrics exposed: scheduled_policy_evaluations, agents_auto_provisioned, agents_auto_revoked
+- Manual trigger endpoint: `POST /v1/policies/{id}/evaluate` for testing
+
+---
+
+### Sub-task fn-100.3: Multi-Namespace Policies & Inheritance
+
+**Description**: Org-level policy templates that cascade to namespaces, with override capability.
+
+**Deliverables**:
+- Policy hierarchy: global > org > namespace > agent
+- Inheritance: namespaces inherit org policies unless explicitly overridden
+- Conflict resolution: most-specific policy wins
+
+**Pseudo-code**:
+```rust
+pub struct PolicyHierarchy {
+    global: Option<AgentPolicy>,     // Auths platform-wide
+    org: Option<AgentPolicy>,        // Org-level defaults
+    namespace: AgentPolicy,           // Namespace-specific
+}
+
+pub async fn resolve_policies(
+    namespace: &str,
+    org_id: &str,
+) -> Result<AgentPolicy> {
+    // 1. Load global policy (if any)
+    // 2. Load org policy (if any)
+    // 3. Load namespace policy
+    // 4. Merge: namespace overrides org, org overrides global
+    // 5. Return merged policy
+}
+```
+
+**Acceptance Criteria**:
+- Policy inheritance documented with examples
+- Override syntax clear (namespace policy `extends` org policy)
+- Conflict resolution predictable
+
+---
+
+## fn-101: Artifact Attestation & Verification
+
+**Goal**: Agents sign artifacts (commits, releases, container images); third parties verify provenance without needing artifact server access.
+
+**Use Case**:
+- CI agent signs build artifact (binary, container image, release tarball)
+- Developer pushes signed artifact + attestation to public registry
+- User downloads artifact, verifies signature: "This build came from org X's CI, signed with agent ID Y, approved on date Z"
+- Supply chain attack prevention: fake artifact rejected because signature doesn't verify
+
+### Sub-task fn-101.1: Artifact Signing Service
+
+**Description**: Agents create deterministic, canonicalized signatures over artifacts.
+
+**Deliverables**:
+- Artifact signing API: `POST /v1/artifacts/sign { agent_id, artifact_hash, metadata }`
+- Returns: signed attestation (JSON)
+- Attestation includes: artifact hash, agent DID, timestamp, signature
+
+**Pseudo-code**:
+```rust
+pub struct ArtifactAttestation {
+    version: String,                // "1.0"
+    artifact_hash: String,          // sha256 of artifact
+    artifact_hash_algorithm: String, // "sha256"
+    agent_id: String,
+    agent_did: String,
+    signer_did: String,            // dev who triggered the sign
+    signed_at: DateTime<Utc>,
+    expires_at: Option<DateTime<Utc>>,
+    metadata: Map<String, Value>,  // platform, build_id, version, etc.
+    signature: String,             // base64url(ed25519_sig)
+}
+
+pub async fn sign_artifact(
+    agent_id: &str,
+    artifact_hash: &str,
+    metadata: Map<String, Value>,
+    artifact_service: &dyn ArtifactService,
+    auth_domain: &dyn AuthDomain,
+) -> Result<ArtifactAttestation> {
+    // 1. Validate agent has "sign_artifacts" capability
+    // 2. Load agent's device key from device domain
+    // 3. Canonicalize attestation (json-canon, RFC 8785)
+    // 4. Sign with agent's key
+    // 5. Return attestation
+}
+
+pub async fn verify_artifact_attestation(
+    attestation: &ArtifactAttestation,
+    identity_resolver: &dyn IdentityResolver,
+    current_time: DateTime<Utc>,
+) -> Result<AttestationValidity> {
+    // 1. Validate signature (Ed25519 over canonical JSON)
+    // 2. Check not expired
+    // 3. Resolve agent_did from IdentityResolver
+    // 4. Return validity
+}
+```
+
+**Acceptance Criteria**:
+- Artifacts can be signed atomically with hash only (no file upload needed)
+- Attestations are JSON, machine-readable
+- Canonical form verified (json-canon)
+- Verification works offline (given agent DID + public key)
+
+---
+
+### Sub-task fn-101.2: Attestation Storage & Distribution
+
+**Description**: Store attestations for lookup and verification.
+
+**Deliverables**:
+- Attestation registry: `POST /v1/attestations { artifact_hash, attestation }`
+- List attestations: `GET /v1/attestations?artifact_hash=...&agent_did=...`
+- Storage: Redis (hot cache) + audit log (immutable)
+
+**Pseudo-code**:
+```rust
+pub struct AttestationRegistry {
+    backend: Arc<dyn AttestationStorage>,
+}
+
+pub async fn register_attestation(
+    attestation: ArtifactAttestation,
+    registry: &AttestationRegistry,
+    compliance: &dyn ComplianceDomain,
+) -> Result<()> {
+    // 1. Validate attestation signature
+    // 2. Store in Redis: attestations:{artifact_hash}:{agent_did}
+    // 3. Emit attestation.registered event (webhook)
+    // 4. Log to compliance domain
+}
+
+pub async fn get_attestations(
+    artifact_hash: &str,
+    agent_did: Option<&str>,
+    registry: &AttestationRegistry,
+) -> Result<Vec<ArtifactAttestation>> {
+    // 1. Query Redis by artifact_hash
+    // 2. Optionally filter by agent_did
+    // 3. Return sorted by signed_at (newest first)
+}
+```
+
+**Acceptance Criteria**:
+- Attestations queryable by artifact hash + optional agent DID
+- Immutable (no updates, only append)
+- Exported in audit logs
+
+---
+
+### Sub-task fn-101.3: Integration: Git Commit Signing
+
+**Description**: Extend Git commit signing to embed artifact attestations.
+
+**Deliverables**:
+- auths-cli: `auths sign-commit` can include attestation hash
+- Commit signatures include attestation reference
+- Verification: git signature validates + attestation can be looked up
+
+**Pseudo-code**:
+```rust
+pub struct CommitSignatureWithAttestation {
+    commit_hash: String,
+    commit_signature: String,     // existing
+    attestation_hash: Option<String>, // hash of artifact being committed
+    attestation_reference: Option<String>, // URL to attestation registry
+}
+
+pub async fn sign_commit_with_attestation(
+    commit_hash: &str,
+    artifact_hash: Option<&str>,
+    agent_service: &dyn AgentService,
+) -> Result<CommitSignatureWithAttestation> {
+    // 1. Sign commit (existing logic)
+    // 2. If artifact_hash provided:
+    //    a. Look up attestation
+    //    b. Include reference in signature metadata
+    // 3. Return signature + attestation ref
+}
+```
+
+**Acceptance Criteria**:
+- Git commits can link to artifact attestations
+- Attestation reference immutable after commit
+- Verification chain: commit sig → attestation sig → agent DID
+
+---
+
+## fn-102: Key Rotation & Renewal Automation
+
+**Goal**: Agents automatically rotate their signing keys on a schedule, maintaining continuous signing capability.
+
+**Use Case**:
+- Long-lived agent (CI runner, bot) rotates its key every 30 days automatically
+- Old key revoked after grace period (new key already active)
+- No service disruption (clients always get latest key)
+
+### Sub-task fn-102.1: Agent Key Rotation Policy
+
+**Description**: Define rotation schedules and execution logic.
+
+**Deliverables**:
+- Policy schema: rotation period, grace period, notifications
+- Rotation scheduler: periodic background job
+- Pre-rotation notification: webhook to inform subscribers
+
+**Pseudo-code**:
+```rust
+pub struct KeyRotationPolicy {
+    agent_id: String,
+    rotation_period: Duration, // e.g., 30 days
+    grace_period: Duration,     // e.g., 7 days (old key still valid)
+    notify_before: Duration,    // e.g., 3 days before rotation
+    auto_rotate: bool,
+}
+
+pub async fn schedule_key_rotation(
+    agent_id: &str,
+    policy: KeyRotationPolicy,
+    scheduler: &dyn Scheduler,
+) -> Result<()> {
+    // 1. Calculate next rotation time: now + policy.rotation_period
+    // 2. Schedule webhook notification: now + (rotation_period - notify_before)
+    // 3. Schedule rotation: now + rotation_period
+    // 4. Store scheduled rotations in Redis
+}
+
+pub async fn perform_key_rotation(
+    agent_id: &str,
+    device_service: &dyn DeviceService,
+) -> Result<RotationResult> {
+    // 1. Generate new device key
+    // 2. Add new key to agent's device list
+    // 3. Mark old key as "rotating" (valid until grace_period expires)
+    // 4. Emit device.key_rotated event
+    // 5. Old key expires after grace_period (cleanup job)
+}
+
+pub struct RotationResult {
+    agent_id: String,
+    old_key_did: String,
+    new_key_did: String,
+    new_key_public: String,
+    old_key_expires_at: DateTime<Utc>,
+}
+```
+
+**Acceptance Criteria**:
+- Rotation period configurable per agent
+- Pre-rotation notification sent (webhook event)
+- Old key valid during grace period, then revoked automatically
+- Audit trail: all rotations logged
+
+---
+
+### Sub-task fn-102.2: Client Handling of Key Rotation
+
+**Description**: SDK clients handle transparent key rotation (fetch new key, use it).
+
+**Deliverables**:
+- SDK: automatic key refresh on rotation
+- Cache invalidation: old key removed from cache on expiry
+- Error handling: retry with new key if old key rejected
+
+**Pseudo-code**:
+```rust
+// Rust SDK
+pub async fn sign_with_rotation_aware(
+    agent_id: &str,
+    data: &[u8],
+    sdk: &Agent,
+) -> Result<String> {
+    loop {
+        match sdk.sign(data).await {
+            Ok(sig) => return Ok(sig),
+            Err(SignError::KeyExpired) => {
+                // Key was just rotated, refresh and retry
+                sdk.refresh_keys().await?;
+                // retry the sign
+            }
+            Err(e) => return Err(e),
+        }
+    }
+}
+
+```
+
+```python
+# Python SDK equivalent
+async def sign_with_rotation_aware(agent_id: str, data: bytes) -> str:
+    while True:
+        try:
+            sig = await agent.sign(data)
+            return sig
+        except KeyExpiredError:
+            await agent.refresh_keys()
+            # retry
+```
+
+**Acceptance Criteria**:
+- SDK automatically detects key rotation
+- Seamless retry on key expiry
+- Logging: key rotation events visible in client logs
+
+---
+
+### Sub-task fn-102.3: Renewal Before Expiry
+
+**Description**: Extend agent TTL automatically before expiration (similar to token refresh).
+
+**Deliverables**:
+- Renewal scheduler: check agents expiring within N days
+- Auto-renewal: extend TTL by another rotation period
+- Notification: alert if auto-renewal fails (manual intervention)
+
+**Pseudo-code**:
+```rust
+pub async fn schedule_agent_renewals(
+    namespace: &str,
+    renewal_threshold: Duration, // e.g., 7 days
+    scheduler: &dyn Scheduler,
+) -> Result<()> {
+    // 1. Find agents expiring within threshold
+    // 2. Schedule renewal job at: agent.expires_at - renewal_threshold
+    // 3. On job trigger: extend TTL + emit agent.renewed event
+}
+
+pub async fn renew_agent_before_expiry(
+    namespace: &str,
+    agent_id: &str,
+    new_ttl: Duration,
+) -> Result<Agent> {
+    // 1. Validate agent not already expired
+    // 2. Update agent.expires_at = now + new_ttl
+    // 3. Store in Redis
+    // 4. Emit agent.renewed event
+    // 5. Log to compliance
+}
+```
+
+**Acceptance Criteria**:
+- Agents auto-renew before expiry (no service gap)
+- Renewal events visible in audit logs
+- Admin notified if renewal fails
+
+---
+
+## fn-103: Approval Workflows for Sensitive Operations
+
+**Goal**: High-stakes operations (revoke agent, rotate keys, change policies) require human approval.
+
+**Use Case**:
+- CI agent provisioning is automatic (fn-100)
+- But revoking an agent requires approval from 2 org admins
+- Deployment policy changes require approval from security team
+
+### Sub-task fn-103.1: Approval Request & Decision
+
+**Description**: Create, manage, approve/deny sensitive operations.
+
+**Deliverables**:
+- Approval schema: operation type, requester, approvers, deadline
+- API: `POST /v1/approvals/request { operation, reason, required_approvers }`
+- API: `POST /v1/approvals/{id}/approve { approver_did, decision, note }`
+
+**Pseudo-code**:
+```rust
+pub enum ApprovalOperation {
+    RevokeAgent { agent_id: String },
+    RotateAgentKey { agent_id: String },
+    ChangePolicy { policy_id: String, old: Policy, new: Policy },
+    DeleteNamespace { namespace: String },
+}
+
+pub struct ApprovalRequest {
+    id: String,
+    namespace: String,
+    operation: ApprovalOperation,
+    requester_did: String,
+    required_approvers: Vec<String>, // DIDs of required approvers
+    approvals: Map<String, Approval>, // approver_did -> decision
+    deadline: DateTime<Utc>,
+    status: ApprovalStatus, // pending, approved, rejected, expired
+}
+
+pub struct Approval {
+    approver_did: String,
+    decision: ApprovalDecision, // Approved, Rejected
+    reason: String,
+    approved_at: DateTime<Utc>,
+}
+
+pub async fn request_approval(
+    operation: ApprovalOperation,
+    requester_did: &str,
+    approvers: Vec<String>,
+    deadline: Duration,
+) -> Result<ApprovalRequest> {
+    // 1. Create request
+    // 2. Store in Redis: approvals:{request_id}
+    // 3. Emit approval.requested event (sends to approvers)
+    // 4. Log to compliance domain
+}
+
+pub async fn approve_operation(
+    request_id: &str,
+    approver_did: &str,
+    decision: ApprovalDecision,
+) -> Result<ApprovalRequest> {
+    // 1. Record approval
+    // 2. If all required approvals received: apply operation
+    // 3. Emit approval.decided event
+}
+```
+
+**Acceptance Criteria**:
+- Approval rules configurable per operation type
+- Multiple approvers supported
+- Deadline enforced (requests expire)
+- Audit trail of all approvals
+
+---
+
+### Sub-task fn-103.2: Conditional Execution (After Approval)
+
+**Description**: Execute operations only after approval(s) received.
+
+**Deliverables**:
+- Approval-gated operations: revoke, rotate, policy change
+- Execution: automatic, or manually triggered, once the request is approved
+- Rollback: undo operation if approval is later revoked
+
+**Pseudo-code**:
+```rust
+pub async fn revoke_agent_with_approval(
+    namespace: &str,
+    agent_id: &str,
+    requester_did: &str,
+) -> Result<ApprovalRequest> {
+    // 1. Create approval request (operation: RevokeAgent)
+    // 2. Determine required approvers (from policy)
+    // 3. Return request (client must wait for approvals)
+}
+
+pub async fn execute_approved_operation(
+    approval_request: &ApprovalRequest,
+) -> Result<OperationResult> {
+    // 1. Validate request is fully approved
+    // 2. Check deadline not exceeded
+    // 3. Execute operation (revoke, rotate, etc.)
+    // 4. Emit operation.executed event
+    // 5. Log to compliance
+}
+
+pub async fn revoke_approval_and_undo(
+    approval_request: &ApprovalRequest,
+    approver_who_revoked: &str,
+) -> Result<()> {
+    // 1. Mark approval as revoked
+    // 2. If operation already executed: undo it (restore agent, etc.)
+    // 3. Emit approval.revoked event
+}
+```
+
+**Acceptance Criteria**:
+- Operations block until approval received
+- Automatic execution vs. manual trigger (configurable)
+- Approval can be revoked with undo capability
+
+---
+
+## fn-104: Agent Quotas & Rate Limiting
+
+**Goal**: Prevent resource exhaustion and abuse; fair allocation across namespaces.
+
+**Use Case**:
+- Org limit: max 1000 agents per namespace
+- Rate limit: max 100 agents provisioned/hour
+- Quota enforcement: prevent over-provisioning
+
+### Sub-task fn-104.1: Quota Tracking & Enforcement
+
+**Description**: Track agent counts, enforce limits.
+
+**Deliverables**:
+- Quota schema: max agents, max provisions/hour
+- Quota check: before provisioning, verify limits
+- Metrics: quota usage, rejections
+
+**Pseudo-code**:
+```rust
+pub struct AgentQuota {
+    namespace: String,
+    max_agents: u64,
+    max_provisions_per_hour: u64,
+}
+
+pub async fn check_quota(
+    namespace: &str,
+    quota: &AgentQuota,
+    agent_service: &dyn AgentService,
+) -> Result<QuotaStatus> {
+    // 1. Count current agents in namespace
+    // 2. Count provisions in last hour (from audit log)
+    // 3. Return { agents_available, provisions_available }
+}
+
+pub async fn provision_agent_with_quota(
+    namespace: &str,
+    config: ProvisionConfig,
+) -> Result<Agent> {
+    // 1. Check quota
+    // 2. If exceeded: return QuotaExceededError
+    // 3. Otherwise: proceed with provision
+}
+
+pub struct QuotaStatus {
+    agents_used: u64,
+    agents_available: u64,
+    provisions_this_hour: u64,
+    provisions_available: u64,
+}
+```
+
+**Acceptance Criteria**:
+- Quotas enforced at provision time
+- Soft limit warnings + hard limit rejections
+- Quotas configurable per namespace
+- Quota usage visible via metrics
+
+---
+
+### Sub-task fn-104.2: Rate Limiting (Token Bucket)
+
+**Description**: Token bucket rate limiter for agent operations (tokens refill at a steady rate; bursts are allowed up to the bucket capacity).
+
+**Deliverables**:
+- Rate limit: X operations/second per namespace
+- Burst allowance: allow spikes up to Y requests
+- Headers: X-RateLimit-* in API responses
+
+**Pseudo-code**:
+```rust
+pub struct RateLimiter {
+    capacity: f64,           // max tokens
+    refill_rate: f64,        // tokens per second
+    current_tokens: f64,
+}
+
+pub async fn check_rate_limit(
+    namespace: &str,
+    limiter: &mut RateLimiter,
+    cost: f64, // tokens to consume
+) -> Result<RateLimitStatus> {
+    // 1. Refill tokens based on elapsed time
+    // 2. If tokens >= cost: consume and allow
+    // 3. Otherwise: reject (too fast)
+}
+
+pub struct RateLimitStatus {
+    allowed: bool,
+    tokens_remaining: f64,
+    reset_at: DateTime<Utc>,
+}
+```
+
+**Acceptance Criteria**:
+- Rate limits configurable (default: 100 ops/sec)
+- Burst allowance (e.g., 50 tokens)
+- Metrics: rate limit hits, rejections
+- Headers: X-RateLimit-{Limit,Used,Remaining,ResetAt}
+
+---
+
+## fn-105: Multi-Org Federation & Cross-Org Delegation
+
+**Goal**: Organizations can establish trust with each other, so that an agent from org A can act on behalf of org B (with permission).
+
+**Use Case**:
+- Company A uses Company B's SaaS platform
+- Company A's CI agent provisions its own agents on platform B
+- Company A's agents can sign artifacts on platform B without sharing keys with B
+
+### Sub-task fn-105.1: Cross-Org Agent Recognition
+
+**Description**: Org A's agent is recognized as legitimate by org B.
+
+**Deliverables**:
+- Trust anchor: org B trusts org A's DIDs
+- Agent delegation: org A agent can act in org B context
+- Verification: cross-org signatures validate
+
+**Pseudo-code**:
+```rust
+pub struct OrgTrustAnchor {
+    org_a_id: String,
+    org_b_id: String,
+    org_a_root_did: String, // root DID of org A
+    delegated_capabilities: Vec<String>, // [sign_artifacts, publish_releases]
+    expires_at: DateTime<Utc>,
+}
+
+pub async fn establish_trust(
+    org_a: &str,
+    org_b: &str,
+    root_did: &str,
+    capabilities: Vec<String>,
+) -> Result<OrgTrustAnchor> {
+    // 1. Org B admin approves trust anchor (approval workflow)
+    // 2. Store in Redis: trust_anchors:{org_b}:{org_a}
+    // 3. Emit trust.established event
+}
+
+pub async fn verify_cross_org_delegation(
+    agent_id: &str,
+    agent_org: &str,
+    target_org: &str,
+    required_capability: &str,
+    identity_resolver: &dyn IdentityResolver,
+) -> Result<bool> {
+    // 1. Resolve agent's org and DID
+    // 2. Check trust anchor: agent_org → target_org exists
+    // 3. Verify required_capability in delegated_capabilities
+    // 4. Return true if delegated, false otherwise
+}
+```
+
+**Acceptance Criteria**:
+- Cross-org trust relationships configurable
+- Delegation verified before operation
+- Audit trail: cross-org operations logged
+
+---
+
+### Sub-task fn-105.2: Shared Agent Pool (Federation Lite)
+
+**Description**: Multiple orgs share a pool of agents (e.g., shared CI runners).
+
+**Deliverables**:
+- Shared namespace: agents available to multiple orgs
+- Attribution: operations tied to requesting org
+- Resource isolation: quotas per org in shared pool
+
+**Pseudo-code**:
+```rust
+pub struct SharedNamespace {
+    id: String,
+    participating_orgs: Vec<String>,
+    agents: Vec<Agent>, // shared pool
+    quotas: Map<String, AgentQuota>, // per-org limits
+}
+
+pub async fn provision_from_shared_pool(
+    shared_namespace: &str,
+    requesting_org: &str,
+    config: ProvisionConfig,
+) -> Result<Agent> {
+    // 1. Check org quota in shared namespace
+    // 2. Tag agent with org_id (attribution)
+    // 3. Provision agent
+    // 4. Log: agent provisioned by org X in shared namespace Y
+}
+
+pub async fn audit_shared_namespace(
+    shared_namespace: &str,
+) -> Result<Vec<AuditEvent>> {
+    // 1. Query audit log: all events in shared namespace
+    // 2. Organize by org (attribution)
+    // 3. Return usage per org
+}
+```
+
+**Acceptance Criteria**:
+- Shared pool manageable via API
+- Per-org quotas enforced
+- Attribution clear (audit trail shows which org provisioned agent)
+
+---
+
+## fn-106: Compliance & Audit Export (SOC2, FedRAMP)
+
+**Goal**: Organizations need audit logs for compliance (SOC2, FedRAMP, HIPAA); export in standard formats.
+
+**Use Case**:
+- SOC2 auditor: "Show me all agent provisioning events for the last 90 days"
+- FedRAMP: "Export audit logs in CEF (Common Event Format)"
+- Compliance officer: "Generate report: who provisioned which agents, when, why"
+
+### Sub-task fn-106.1: Audit Log Retention & Queryability
+
+**Description**: Store audit logs for X years; fast queries by date range, agent, user.
+
+**Deliverables**:
+- Retention policy: configurable (default 7 years for compliance)
+- Query endpoint: `GET /v1/audit?start_date=...&end_date=...&agent_id=...&event_type=...`
+- Export formats: JSON, CSV, CEF
+
+**Pseudo-code**:
+```rust
+pub async fn query_audit_logs(
+    namespace: &str,
+    filter: AuditFilter,
+    format: ExportFormat, // JSON, CSV, CEF
+) -> Result<Vec<u8>> {
+    // 1. Query compliance domain: audit events matching filter
+    // 2. Sort by timestamp
+    // 3. Format as requested (JSON, CSV, CEF)
+    // 4. Return bytes
+}
+
+pub struct AuditFilter {
+    start_date: DateTime<Utc>,
+    end_date: DateTime<Utc>,
+    event_types: Option<Vec<String>>, // agent.provisioned, agent.revoked, etc.
+    agent_ids: Option<Vec<String>>,
+    user_ids: Option<Vec<String>>,
+}
+
+pub enum ExportFormat {
+    Json,
+    Csv,
+    Cef, // Common Event Format (for SIEM integration)
+}
+
+// CEF format example:
+// CEF:0|auths|auths-api|1.0|agent.provisioned|Agent Provisioned|5|agent_id=abc123 delegator_did=did:keri:E... capabilities=sign_artifacts created_at=2026-03-29T10:00:00Z
+```
+
+**Acceptance Criteria**:
+- Query by date range, agent, event type, user
+- Export in at least 2 formats (JSON, CSV)
+- CEF export for SIEM integration
+- Retention configurable per namespace
+
+---
+
+### Sub-task fn-106.2: Compliance Report Generation
+
+**Description**: Automated reports for compliance auditors.
+
+**Deliverables**:
+- Report templates: SOC2, FedRAMP, HIPAA, PCI-DSS
+- Report generation: `POST /v1/compliance/reports { template, namespace, date_range }`
+- Report includes: summary, detailed events, risk assessment
+
+**Pseudo-code**:
+```rust
+pub enum ComplianceTemplate {
+    SOC2,
+    FedRAMP,
+    HIPAA,
+    PciDss,
+}
+
+pub struct ComplianceReport {
+    template: ComplianceTemplate,
+    generated_at: DateTime<Utc>,
+    namespace: String,
+    summary: ReportSummary,
+    findings: Vec<Finding>,
+    audit_logs: Vec<AuditEvent>,
+}
+
+pub struct ReportSummary {
+    total_agents: u64,
+    agents_provisioned_period: u64,
+    agents_revoked_period: u64,
+    policy_changes: u64,
+    unapproved_operations: u64, // red flag
+}
+
+pub async fn generate_compliance_report(
+    namespace: &str,
+    template: ComplianceTemplate,
+    date_range: DateRange,
+) -> Result<ComplianceReport> {
+    // 1. Query audit logs for period
+    // 2. Check for policy violations (unapproved ops, quota overruns)
+    // 3. Generate summary
+    // 4. Format as report
+}
+```
+
+**Acceptance Criteria**:
+- At least 2 compliance templates (SOC2, FedRAMP)
+- Reports include summary + detailed audit trail
+- Automated risk flagging (e.g., unapproved operations)
+
+---
+
+## fn-107: Agent Analytics & Usage Observability
+
+**Goal**: Understand agent usage patterns; identify unused/underutilized agents; capacity planning.
+
+**Use Case**:
+- Dashboard: "Which agents haven't been used in 30 days?" (cleanup candidates)
+- Metrics: "Agent provisioning trend: 100/month → 500/month" (growth signal)
+- Forecast: "At current growth, we'll hit quota in 45 days"
+
+### Sub-task fn-107.1: Agent Usage Metrics
+
+**Description**: Track which agents are actively used; expose usage trends.
+
+**Deliverables**:
+- Usage metrics: last_used, usage_count, operations_performed
+- Dashboard: agent usage heatmap, trend lines
+- Alerts: unused agents (>30 days), low-usage agents
+
+**Pseudo-code**:
+```rust
+pub struct AgentUsageMetrics {
+    agent_id: String,
+    provisioned_at: DateTime<Utc>,
+    first_used_at: Option<DateTime<Utc>>,
+    last_used_at: Option<DateTime<Utc>>,
+    usage_count: u64,
+    operations: Map<String, u64>, // sign_artifacts: 42, publish_releases: 10
+    days_since_last_use: u64,
+}
+
+pub async fn compute_agent_usage(
+    namespace: &str,
+    days_back: u64, // e.g., 30
+    agent_service: &dyn AgentService,
+) -> Result<Vec<AgentUsageMetrics>> {
+    // 1. Query all agents in namespace
+    // 2. For each agent: query audit log for operations in last N days
+    // 3. Compute last_used_at, usage_count, operations
+    // 4. Return sorted by last_used_at (oldest first)
+}
+
+pub async fn identify_unused_agents(
+    namespace: &str,
+    threshold_days: u64, // e.g., 30
+) -> Result<Vec<Agent>> {
+    // 1. Compute usage metrics
+    // 2. Filter: days_since_last_use >= threshold
+    // 3. Return unused agents
+}
+```
+
+**Acceptance Criteria**:
+- Usage metrics queryable per agent, namespace
+- Last-used timestamp tracked accurately
+- Operations per agent visible
+- Unused agents easily identifiable
+
+---
+
+### Sub-task fn-107.2: Capacity & Growth Analytics
+
+**Description**: Forecast capacity; alert on quota approach; plan scaling.
+
+**Deliverables**:
+- Forecast: project agent count 30/60/90 days out
+- Alerts: "At current rate, you'll hit quota in 30 days"
+- Recommendations: "Consider increasing quota or cleaning unused agents"
+
+**Pseudo-code**:
+```rust
+pub struct CapacityForecast {
+    namespace: String,
+    current_agents: u64,
+    quota: u64,
+    utilization: f64, // percentage
+    provisioning_rate: f64, // agents/day
+    forecast_30d: u64,
+    forecast_60d: u64,
+    forecast_90d: u64,
+    days_to_quota: Option<u64>, // None if declining
+    recommendations: Vec<String>,
+}
+
+pub async fn forecast_capacity(
+    namespace: &str,
+    days_history: u64, // e.g., 90
+) -> Result<CapacityForecast> {
+    // 1. Compute provisioning rate (agents/day) from audit log
+    // 2. Project forward 30, 60, 90 days
+    // 3. Calculate days to quota at current rate
+    // 4. Generate recommendations
+}
+
+pub fn generate_recommendations(
+    forecast: &CapacityForecast,
+) -> Vec<String> {
+    let mut recs = vec![];
+    if forecast.days_to_quota.is_some() && forecast.days_to_quota < Some(30) {
+        recs.push("Consider increasing quota".into());
+    }
+    // ... more logic
+    recs
+}
+```
+
+**Acceptance Criteria**:
+- Linear regression on provisioning rate (last 90 days)
+- Forecast 30/60/90 days out
+- Alerts when approaching quota (<30 days)
+- Recommendations actionable (increase quota, cleanup unused)
+
+---
+
+## Cross-Cutting Considerations
+
+**Testing Strategy**:
+- Integration tests for each epic (fn-100 through fn-107)
+- Simulation: synthetic workloads (high provisioning rates, quota hits)
+- Compliance validation: audit logs match expected events
+
+**Observability**:
+- Per-epic metrics (policy evaluations, attestations signed, approvals, etc.)
+- Distributed tracing: trace a provisioning request through all domain layers
+- Runbooks: playbooks for common scenarios (quota exceeded, approval stuck, key rotation failure)
+
+**Documentation**:
+- User guides: how to use each feature (policies, attestations, approvals)
+- Operator guides: deployment, monitoring, troubleshooting
+- API reference: all endpoints, request/response schemas
+- Examples: concrete workflows (supply chain signing, policy-driven CI)
+
+---
+
+## Summary: From fn-89 to Production
+
+**fn-89** provides the **foundational infrastructure** (domain architecture, transactions, observability).
+
+**fn-100–107** unlock **strategic use cases**:
+- Policy-driven automation (fn-100)
+- Supply chain security (fn-101)
+- Operational continuity (fn-102, fn-104)
+- Governance & approval (fn-103)
+- Federation (fn-105)
+- Compliance (fn-106)
+- Operations intelligence (fn-107)
+
+**Market Positioning**:
+- Early: auths-api is infrastructure (supply chain signing, audit trails)
+- Scale: policy-driven provisioning, approval workflows, federation
+- Mature: compliance automation, analytics, advanced governance
+
+**Timeline Estimate**:
+- fn-89: 4–6 weeks (foundation)
+- fn-100–103: 6–8 weeks (core features)
+- fn-104–107: 4–6 weeks (optimization & intelligence)
+- **Total to production-ready**: 14–20 weeks (roughly 3–5 months) if the phases run back to back
+
+**Go-to-Market**:
+1. **Closed beta** (fn-89 + fn-100): fintech, infra platforms
+2. **Open beta** (fn-89 + fn-100–103): broader enterprise
+3. **GA** (fn-89 + fn-100–107): full feature set for compliance-heavy orgs