diff --git a/crates/auths-api/src/domains/agents/handlers.rs b/crates/auths-api/src/domains/agents/handlers.rs index 6abe90c7..f2150454 100644 --- a/crates/auths-api/src/domains/agents/handlers.rs +++ b/crates/auths-api/src/domains/agents/handlers.rs @@ -47,19 +47,20 @@ pub async fn authorize_operation( #[allow(clippy::disallowed_methods)] // INVARIANT: HTTP handler boundary let now = chrono::Utc::now(); - // Validate clock skew (±5 minutes) - let time_diff = { - let duration = now.signed_duration_since(req.timestamp); - duration.num_seconds().unsigned_abs() - }; - if time_diff > 300 { - return Err((StatusCode::BAD_REQUEST, "Clock skew too large".to_string())); - } - let service = AgentService::new(state.registry, state.persistence); let resp = service - .authorize(&req.agent_did, &req.capability, now) - .map_err(|e| (StatusCode::UNAUTHORIZED, e))?; + .authorize(&req.agent_did, &req.capability, now, req.timestamp) + .map_err(|e| { + let error_msg = e.to_string(); + // Clock skew is a request validation error (400) + // Authorization failures are authorization errors (401) + let status = if error_msg.contains("Clock skew") { + StatusCode::BAD_REQUEST + } else { + StatusCode::UNAUTHORIZED + }; + (status, error_msg) + })?; Ok((StatusCode::OK, Json(resp))) } diff --git a/crates/auths-deployment/config/sentinel.conf b/crates/auths-deployment/config/sentinel.conf new file mode 100644 index 00000000..84861335 --- /dev/null +++ b/crates/auths-deployment/config/sentinel.conf @@ -0,0 +1,57 @@ +# Redis Sentinel Configuration Template +# Production-grade 3-instance Sentinel cluster for auths-api +# See: docs/PRODUCTION_REDIS_HA.md for deployment guides + +# Bind to all interfaces (override in deployment) +bind 0.0.0.0 +protected-mode no + +# Sentinel port (default 26379) +port 26379 + +# Sentinel working directory +dir ./ + +# Master name (referenced by clients) +# All 3 Sentinels must use the same name +sentinel monitor mymaster 127.0.0.1 6379 2 + +# Time in milliseconds 
before Sentinel considers master unreachable +# After this time, if a majority of Sentinels agree, auto-failover begins +# Recommended: 30s for auths-api (balance between detection time and false positives) +sentinel down-after-milliseconds mymaster 30000 + +# Number of replicas to reconfigure in parallel during failover +# Set to 1 to avoid traffic spikes during switchover +sentinel parallel-syncs mymaster 1 + +# Failover timeout: how long to wait before giving up +# Should be at least 3x down-after-milliseconds +sentinel failover-timeout mymaster 120000 + +# Sentinel logging +loglevel notice +logfile "" + +# Block runtime changes to notification/client-reconfig script paths via SENTINEL SET +sentinel deny-scripts-reconfig yes + +# Authentication (if Redis requires password) +# Uncomment and set for production: +# sentinel auth-pass mymaster your-redis-password + +# Sentinel quorum for starting auto-failover +# With 3 Sentinels, quorum=2 means any 2 can trigger failover +# (This is implicitly 2 from the "sentinel monitor" command above) + +# Notification script on failure detection (optional) +# Called when failover starts: script will be called +# sentinel notification-script mymaster /path/to/notification-script.sh + +# Configuration propagation script (optional) +# Called after failover to reconfigure replicas +# sentinel client-reconfig-script mymaster /path/to/client-reconfig-script.sh + +# For testing only: uncomment to let SENTINEL SET change script paths at runtime +# Never enable in production (a later directive overrides the "yes" above) +# sentinel deny-scripts-reconfig no diff --git a/crates/auths-deployment/scripts/backup-redis-aof.sh b/crates/auths-deployment/scripts/backup-redis-aof.sh new file mode 100755 index 00000000..ff44d15d --- /dev/null +++ b/crates/auths-deployment/scripts/backup-redis-aof.sh @@ -0,0 +1,148 @@ +#!/bin/bash +# Automated Redis AOF backup to S3 +# Usage: AWS_REGION=us-east-1 ./backup-redis-aof.sh [redis-host] [redis-port] +# +# Cron job (2am UTC daily): +# 0 2 * * * cd /app && AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 
6379 >> /var/log/redis-backup.log 2>&1 + +set -e + +# Configuration +REDIS_HOST=${1:-localhost} +REDIS_PORT=${2:-6379} +AWS_REGION=${AWS_REGION:-us-east-1} +S3_BUCKET="${S3_BUCKET:-auths-redis-backups}" +BACKUP_RETENTION_DAYS=30 +MAX_BACKUP_SIZE_MB=1000 # Alert if > 1GB + +# Derived variables +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +BACKUP_NAME="redis-aof-${TIMESTAMP}.aof.gz" +LOCAL_AOF_PATH="/tmp/redis-aof-${TIMESTAMP}.aof" +COMPRESSED_AOF_PATH="${LOCAL_AOF_PATH}.gz" +S3_KEY="backups/${BACKUP_NAME}" +S3_URI="s3://${S3_BUCKET}/${S3_KEY}" +LOG_PREFIX="[$(date '+%Y-%m-%d %H:%M:%S')]" + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}${LOG_PREFIX}${NC} $*"; } +log_warn() { echo -e "${YELLOW}${LOG_PREFIX}${NC} $*"; } +log_error() { echo -e "${RED}${LOG_PREFIX}${NC} $*"; exit 1; } + +# === Step 1: Verify Redis connectivity === +log_info "Verifying Redis connectivity ($REDIS_HOST:$REDIS_PORT)..." +if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then + log_error "Redis not reachable at $REDIS_HOST:$REDIS_PORT" +fi +log_info "Redis reachable ✓" + +# === Step 2: Trigger AOF rewrite === +log_info "Triggering AOF rewrite (compaction)..." +if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" BGREWRITEAOF >/dev/null 2>&1; then + log_warn "AOF rewrite failed (may already be in progress)" +fi + +# Wait for rewrite to complete (max 30s) +sleep 2 +log_info "Waiting for AOF rewrite..." +for i in {1..15}; do + if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" info persistence | grep -q "aof_rewrite_in_progress:0"; then + log_info "AOF rewrite completed" + break + fi + sleep 2 +done + +# === Step 3: Get AOF file location === +log_info "Locating AOF file..." 
+REDIS_AOF_PATH=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get appendfilename | tail -1) +REDIS_DIR=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get dir | tail -1) +FULL_AOF_PATH="${REDIS_DIR}/${REDIS_AOF_PATH}" + +log_info "AOF file: $FULL_AOF_PATH" +if [[ ! -f "$FULL_AOF_PATH" ]]; then + log_error "AOF file not found at $FULL_AOF_PATH" +fi + +# === Step 4: Copy and compress AOF === +log_info "Copying AOF to temporary location..." +cp "$FULL_AOF_PATH" "$LOCAL_AOF_PATH" + +log_info "Compressing AOF..." +gzip -f "$LOCAL_AOF_PATH" + +# Check backup size +BACKUP_SIZE_MB=$(($(stat -f%z "$COMPRESSED_AOF_PATH" 2>/dev/null || stat -c%s "$COMPRESSED_AOF_PATH") / 1024 / 1024)) +log_info "Compressed AOF size: ${BACKUP_SIZE_MB}MB" + +if [[ $BACKUP_SIZE_MB -gt $MAX_BACKUP_SIZE_MB ]]; then + log_warn "ALERT: Backup size (${BACKUP_SIZE_MB}MB) exceeds threshold (${MAX_BACKUP_SIZE_MB}MB)" +fi + +# === Step 5: Upload to S3 === +log_info "Uploading to S3: $S3_URI" +if ! aws s3 cp "$COMPRESSED_AOF_PATH" "$S3_URI" \ + --region "$AWS_REGION" \ + --storage-class STANDARD_IA \ + --metadata "timestamp=${TIMESTAMP},redis-host=${REDIS_HOST},backup-size=${BACKUP_SIZE_MB}MB" \ + 2>&1; then + log_error "S3 upload failed for $S3_URI" +fi +log_info "✓ Backup uploaded to S3" + +# === Step 6: Cleanup old local backups === +log_info "Cleaning up temporary files..." +rm -f "$COMPRESSED_AOF_PATH" + +# === Step 7: Cleanup old S3 backups (retention policy) === +log_info "Applying retention policy (keeping ${BACKUP_RETENTION_DAYS} days)..." 
+CUTOFF_DATE=$(date -u -d "${BACKUP_RETENTION_DAYS} days ago" +%Y-%m-%d 2>/dev/null || date -u -v-${BACKUP_RETENTION_DAYS}d +%Y-%m-%d) + +# List and delete old backups +OLD_BACKUPS=$(aws s3api list-objects-v2 \ + --bucket "$S3_BUCKET" \ + --prefix "backups/" \ + --region "$AWS_REGION" \ + --query "Contents[?LastModified<'${CUTOFF_DATE}T00:00:00Z'].Key" \ + --output text 2>/dev/null || echo "") + +if [[ -n "$OLD_BACKUPS" ]]; then + log_info "Deleting old backups..." + for key in $OLD_BACKUPS; do + log_info " Deleting: $key" + aws s3 rm "s3://${S3_BUCKET}/${key}" --region "$AWS_REGION" 2>/dev/null || true + done +fi + +# === Step 8: Log success === +log_info "✓ Backup completed successfully" +log_info "Summary:" +log_info " Timestamp: $TIMESTAMP" +log_info " Size: ${BACKUP_SIZE_MB}MB" +log_info " Location: $S3_URI" +log_info " Redis: $REDIS_HOST:$REDIS_PORT" + +# === Step 9: CloudWatch metric (optional) === +if command -v aws >/dev/null 2>&1; then + log_info "Publishing CloudWatch metrics..." 
+ aws cloudwatch put-metric-data \ + --namespace "auths/redis" \ + --metric-name "backup-size-mb" \ + --value "$BACKUP_SIZE_MB" \ + --region "$AWS_REGION" \ + 2>/dev/null || log_warn "Failed to publish metrics" + + aws cloudwatch put-metric-data \ + --namespace "auths/redis" \ + --metric-name "backup-success" \ + --value 1 \ + --region "$AWS_REGION" \ + 2>/dev/null || true +fi + +exit 0 diff --git a/crates/auths-deployment/scripts/restore-redis-aof.sh b/crates/auths-deployment/scripts/restore-redis-aof.sh new file mode 100755 index 00000000..3a068349 --- /dev/null +++ b/crates/auths-deployment/scripts/restore-redis-aof.sh @@ -0,0 +1,200 @@ +#!/bin/bash +# Restore Redis from AOF backup (point-in-time recovery) +# Usage: ./restore-redis-aof.sh backup-source [redis-host] [redis-port] [backup-date] +# +# Examples: +# ./restore-redis-aof.sh s3://my-bucket/redis-aof-20260329_020000.aof.gz +# ./restore-redis-aof.sh /local/redis-aof-20260329_020000.aof.gz localhost 6379 +# ./restore-redis-aof.sh latest localhost 6379 2026-03-28 # Restore backup from specific date + +set -e + +# Configuration +BACKUP_SOURCE=$1 +REDIS_HOST=${2:-localhost} +REDIS_PORT=${3:-6379} +BACKUP_DATE=${4:-} +# Backup keys embed the date as YYYYMMDD; accept YYYY-MM-DD as well +BACKUP_DATE=${BACKUP_DATE//-/} +S3_BUCKET="${S3_BUCKET:-auths-redis-backups}" +AWS_REGION=${AWS_REGION:-us-east-1} +WORK_DIR="/tmp/redis-restore-$(date +%s)" +REDIS_DIR=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get dir 2>/dev/null | tail -1) +REDIS_AOF_NAME=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" config get appendfilename 2>/dev/null | tail -1) +# A pipeline reports tail's exit status, so default explicitly when Redis gave no answer +REDIS_DIR=${REDIS_DIR:-/var/lib/redis} +REDIS_AOF_NAME=${REDIS_AOF_NAME:-appendonly.aof} + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; } + +# === Validation === +if [[ -z "$BACKUP_SOURCE" ]]; then + log_error "Usage: $0 backup-source [redis-host] [redis-port] [backup-date]" +fi + +if ! 
command -v redis-cli >/dev/null; then + log_error "redis-cli not found. Install redis-tools." +fi + +# === Step 1: Find backup file === +log_info "Locating backup file..." + +BACKUP_FILE="" +if [[ "$BACKUP_SOURCE" == "latest" ]]; then + # Find latest backup from optional date + if [[ -n "$BACKUP_DATE" ]]; then + log_info "Finding latest backup from $BACKUP_DATE..." + BACKUP_FILE=$(aws s3api list-objects-v2 \ + --bucket "$S3_BUCKET" \ + --prefix "backups/redis-aof-${BACKUP_DATE}" \ + --region "$AWS_REGION" \ + --query 'Contents | sort_by(@, &LastModified) | [-1].Key' \ + --output text 2>/dev/null || echo "") + else + log_info "Finding latest backup..." + BACKUP_FILE=$(aws s3api list-objects-v2 \ + --bucket "$S3_BUCKET" \ + --prefix "backups/" \ + --region "$AWS_REGION" \ + --query 'Contents | sort_by(@, &LastModified) | [-1].Key' \ + --output text 2>/dev/null || echo "") + fi + + if [[ -z "$BACKUP_FILE" || "$BACKUP_FILE" == "None" ]]; then + log_error "No backup found in S3" + fi + BACKUP_SOURCE="s3://${S3_BUCKET}/${BACKUP_FILE}" + log_info "Using: $BACKUP_SOURCE" +elif [[ "$BACKUP_SOURCE" =~ ^s3:// ]]; then + log_info "Using S3 backup: $BACKUP_SOURCE" +elif [[ -f "$BACKUP_SOURCE" ]]; then + log_info "Using local backup: $BACKUP_SOURCE" +else + log_error "Backup not found: $BACKUP_SOURCE" +fi + +# === Step 2: Download backup === +mkdir -p "$WORK_DIR" +log_info "Downloading backup..." + +LOCAL_BACKUP="${WORK_DIR}/backup.aof.gz" +if [[ "$BACKUP_SOURCE" =~ ^s3:// ]]; then + if ! aws s3 cp "$BACKUP_SOURCE" "$LOCAL_BACKUP" --region "$AWS_REGION"; then + log_error "Failed to download $BACKUP_SOURCE" + fi +else + cp "$BACKUP_SOURCE" "$LOCAL_BACKUP" +fi + +log_info "✓ Backup downloaded" + +# === Step 3: Decompress === +log_info "Decompressing..." +if ! 
gunzip -f "$LOCAL_BACKUP"; then + log_error "Failed to decompress backup" +fi + +LOCAL_AOF="${LOCAL_BACKUP%.gz}" +log_info "✓ Decompressed to $LOCAL_AOF" + +# === Step 4: Validate AOF === +log_info "Validating AOF integrity..." + +# Validate offline: replaying the backup into the live server would mutate its data +if command -v redis-check-aof >/dev/null 2>&1 && ! redis-check-aof "$LOCAL_AOF" >/dev/null 2>&1; then + log_warn "redis-check-aof reported problems with $LOCAL_AOF" + if head -c 10 "$LOCAL_AOF" | grep -q "REDIS"; then + log_info "AOF header present (RDB format, may be snapshot)" + fi +fi + +# Count entries (rough validation); grep -c already prints 0 when nothing matches +ENTRY_COUNT=$(grep -c "^\*" "$LOCAL_AOF" || true) +log_info "AOF entries: ~$ENTRY_COUNT" + +if [[ $ENTRY_COUNT -eq 0 ]]; then + log_warn "Warning: AOF appears empty or corrupted" +fi + +# === Step 5: Backup current AOF === +log_info "Backing up current AOF..." +if [[ -f "${REDIS_DIR}/${REDIS_AOF_NAME}" ]]; then + CURRENT_BACKUP="${WORK_DIR}/appendonly.aof.backup" + cp "${REDIS_DIR}/${REDIS_AOF_NAME}" "$CURRENT_BACKUP" + log_info "✓ Current AOF backed up to $CURRENT_BACKUP" +fi + +# === Step 6: Stop Redis === +log_info "Stopping Redis ($REDIS_HOST:$REDIS_PORT)..." +if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" shutdown >/dev/null 2>&1; then + log_warn "Redis already stopped" +fi + +sleep 2 +if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then + log_error "Failed to stop Redis" +fi +log_info "✓ Redis stopped" + +# === Step 7: Replace AOF === +log_info "Replacing AOF file..." +if [[ ! -d "$REDIS_DIR" ]]; then + log_error "Redis directory not found: $REDIS_DIR" +fi + +cp "$LOCAL_AOF" "${REDIS_DIR}/${REDIS_AOF_NAME}" +log_info "✓ AOF replaced" + +# === Step 8: Start Redis === +log_info "Starting Redis..." +# This is environment-specific; assuming systemd +if command -v systemctl >/dev/null; then + if ! 
systemctl start redis-server 2>/dev/null; then + log_warn "Could not start Redis via systemctl (may be docker-compose or manual)" + fi +else + log_warn "systemctl not found. Manually start Redis and verify." +fi + +sleep 3 + +# === Step 9: Verify recovery === +log_info "Verifying recovery..." +if ! redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping >/dev/null 2>&1; then + log_error "Redis not responding after restore. Check logs." +fi +log_info "✓ Redis responding" + +# Get stats +DBSIZE=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" dbsize | grep -oE '[0-9]+' || echo "0") +MEMORY=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" info memory | grep used_memory_human | cut -d: -f2 || echo "unknown") + +log_info "Database size: $DBSIZE keys" +log_info "Memory usage: $MEMORY" + +# === Step 10: Cleanup === +log_info "Cleaning up temporary files..." +rm -rf "$WORK_DIR" + +log_info "✓ Recovery completed successfully" +log_info "" +log_info "Summary:" +log_info " Backup source: $BACKUP_SOURCE" +log_info " Redis: $REDIS_HOST:$REDIS_PORT" +log_info " Keys restored: $DBSIZE" +log_info " Memory: $MEMORY" +log_info "" +log_info "Next steps:" +log_info " 1. Verify data integrity in application" +log_info " 2. Check for replication lag if using replicas" +log_info " 3. 
Resume monitoring/alerting" + +exit 0 diff --git a/crates/auths-deployment/scripts/start-sentinel.sh b/crates/auths-deployment/scripts/start-sentinel.sh new file mode 100755 index 00000000..cc7fa261 --- /dev/null +++ b/crates/auths-deployment/scripts/start-sentinel.sh @@ -0,0 +1,177 @@ +#!/bin/bash +# Start Redis Sentinel instances for auths-api HA +# Usage: ./start-sentinel.sh [mode: local|cloud] +# +# Local mode: starts 3 Sentinels + master + 2 replicas via docker-compose (testing) +# Cloud mode: generates configs for managed deployment + +set -e + +MODE=${1:-local} +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CONFIG_DIR="${SCRIPT_DIR}/../config" + +# === Local Mode: Docker Compose Test Setup === +if [[ "$MODE" == "local" ]]; then + echo "Starting local Sentinel cluster (docker-compose)..." + + # Create docker-compose.yml for 3 Sentinels + master + 2 replicas + cat > "${SCRIPT_DIR}/docker-compose-sentinel.yml" << 'EOF' +version: '3.8' +services: + redis-master: + image: redis:7-alpine + ports: + - "6379:6379" + command: redis-server --appendonly yes --dir /data + volumes: + - redis-master-data:/data + networks: + - sentinel-net + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 3 + + redis-replica-1: + image: redis:7-alpine + ports: + - "6380:6379" + command: redis-server --port 6379 --replicaof redis-master 6379 --appendonly yes --dir /data + volumes: + - redis-replica-1-data:/data + depends_on: + redis-master: + condition: service_healthy + networks: + - sentinel-net + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 3 + + redis-replica-2: + image: redis:7-alpine + ports: + - "6381:6379" + command: redis-server --port 6379 --replicaof redis-master 6379 --appendonly yes --dir /data + volumes: + - redis-replica-2-data:/data + depends_on: + redis-master: + condition: service_healthy + networks: + - sentinel-net + healthcheck: + test: ["CMD", "redis-cli", "ping"] + 
interval: 5s + timeout: 3s + retries: 3 + + sentinel-1: + image: redis:7-alpine + ports: + - "26379:26379" + command: redis-sentinel /etc/sentinel/sentinel.conf --port 26379 + volumes: + - ./config/sentinel.conf:/etc/sentinel/sentinel.conf:ro + - sentinel-1-data:/data + depends_on: + - redis-master + - redis-replica-1 + - redis-replica-2 + networks: + - sentinel-net + healthcheck: + test: ["CMD", "redis-cli", "-p", "26379", "ping"] + interval: 5s + timeout: 3s + retries: 3 + + sentinel-2: + image: redis:7-alpine + ports: + - "26380:26379" + command: redis-sentinel /etc/sentinel/sentinel.conf --port 26379 + volumes: + - ./config/sentinel.conf:/etc/sentinel/sentinel.conf:ro + - sentinel-2-data:/data + depends_on: + - redis-master + - redis-replica-1 + - redis-replica-2 + networks: + - sentinel-net + healthcheck: + test: ["CMD", "redis-cli", "-p", "26379", "ping"] + interval: 5s + timeout: 3s + retries: 3 + + sentinel-3: + image: redis:7-alpine + ports: + - "26381:26379" + command: redis-sentinel /etc/sentinel/sentinel.conf --port 26379 + volumes: + - ./config/sentinel.conf:/etc/sentinel/sentinel.conf:ro + - sentinel-3-data:/data + depends_on: + - redis-master + - redis-replica-1 + - redis-replica-2 + networks: + - sentinel-net + healthcheck: + test: ["CMD", "redis-cli", "-p", "26379", "ping"] + interval: 5s + timeout: 3s + retries: 3 + +volumes: + redis-master-data: + redis-replica-1-data: + redis-replica-2-data: + sentinel-1-data: + sentinel-2-data: + sentinel-3-data: + +networks: + sentinel-net: + driver: bridge +EOF + + cd "${SCRIPT_DIR}" + + # Start services + docker-compose -f docker-compose-sentinel.yml up -d + + # Wait for cluster to stabilize + echo "Waiting for cluster to stabilize (10s)..." 
+ sleep 10 + + echo "✓ Sentinel cluster started" + echo "" + echo "Cluster Status:" + docker exec "$(docker-compose -f docker-compose-sentinel.yml ps -q sentinel-1)" \ + redis-cli -p 26379 sentinel masters | grep -E "name|role|status" + + echo "" + echo "Connection String: redis-sentinel://localhost:26379,localhost:26380,localhost:26381?service_name=mymaster" + echo "Test with: redis-cli -h localhost -p 26379 sentinel masters" + +# === Cloud Mode: Generate configs for managed deployments === +elif [[ "$MODE" == "cloud" ]]; then + echo "Generating configs for cloud deployment..." + echo "See docs/PRODUCTION_REDIS_HA.md for platform-specific setup:" + echo " - Self-hosted EC2 (deploy sentinel cluster separately)" + echo " - AWS ElastiCache (managed failover, skip Sentinel)" + echo " - Upstash (managed failover, skip Sentinel)" + echo " - GCP Memorystore (managed failover, skip Sentinel)" + +else + echo "Usage: $0 [local|cloud]" + exit 1 +fi diff --git a/crates/auths-deployment/scripts/test-sentinel-failover.sh b/crates/auths-deployment/scripts/test-sentinel-failover.sh new file mode 100755 index 00000000..e4d48df8 --- /dev/null +++ b/crates/auths-deployment/scripts/test-sentinel-failover.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# Test Redis Sentinel failover behavior +# Validates: master detection, election, and recovery +# +# Tests: +# 1. Verify 3-instance Sentinel quorum is healthy +# 2. Stop master → verify new master elected within 30s +# 3. Verify Sentinel detects failure + quorum decides +# 4. Verify old master becomes replica when it recovers +# 5. 
Verify replication lag < 1s during normal operation + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SENTINEL_PORTS=(26379 26380 26381) +REDIS_PORTS=(6379 6380 6381) + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color +# Log to stderr: test functions hand their result back on stdout via $(...) +log_info() { echo -e "${GREEN}[INFO]${NC} $*" >&2; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*" >&2; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } + +# === Test 1: Verify Sentinel cluster health === +test_sentinel_health() { + log_info "Test 1: Verify Sentinel cluster health" + + for port in "${SENTINEL_PORTS[@]}"; do + if redis-cli -p "$port" ping >/dev/null 2>&1; then + log_info "Sentinel on port $port: responsive" + else + log_error "Sentinel on port $port: FAILED" + return 1 + fi + done + + # Check quorum status + masters=$(redis-cli -p 26379 sentinel masters) + if echo "$masters" | grep -q "mymaster"; then + log_info "Sentinel quorum: monitoring mymaster ✓" + else + log_error "Sentinel not monitoring mymaster" + return 1 + fi +} + +# === Test 2: Verify current master === +test_master_info() { + log_info "Test 2: Identify current master" + + for port in "${REDIS_PORTS[@]}"; do + role=$(redis-cli -p "$port" role 2>/dev/null | head -1 || echo "") + if [[ "$role" == "master" ]]; then + log_info "Master found on port $port" + echo "$port" + return 0 + fi + done + + log_error "No master found!" + return 1 +} + +# === Test 3: Kill master and verify failover === +test_failover_detection() { + local master_port=$1 + log_info "Test 3: Kill master (port $master_port) and verify failover" + + # Record timestamp before kill + local start_time=$(date +%s) + + # Kill master + log_warn "Stopping Redis master on port $master_port..." 
+ redis-cli -p "$master_port" shutdown >/dev/null 2>&1 || true + + # Wait and check for new master election + local elected_time="" + local timeout=40 # Allow up to 40s for election + local elapsed=0 + + while [[ $elapsed -lt $timeout ]]; do + sleep 2 + elapsed=$(($(date +%s) - start_time)) + + # Check which node became master + for port in "${REDIS_PORTS[@]}"; do + if [[ "$port" == "$master_port" ]]; then + continue # Skip old master + fi + + role=$(redis-cli -p "$port" role 2>/dev/null | head -1 || echo "") + if [[ "$role" == "master" ]]; then + elected_time=$elapsed + log_info "✓ New master elected on port $port after ${elapsed}s" + echo "$port" + return 0 + fi + done + done + + log_error "Failover FAILED: No new master elected within ${timeout}s" + return 1 +} + +# === Test 4: Verify replication lag === +test_replication_lag() { + local replica_port=$1 + log_info "Test 4: Verify replication lag < 1s" + + # Get replication info + local offset=$(redis-cli -p "$replica_port" info replication | grep master_repl_offset | cut -d: -f2) + local lag=$(redis-cli -p "$replica_port" info replication | grep slave_repl_offset | cut -d: -f2) + + if [[ -z "$offset" || -z "$lag" ]]; then + log_warn "Could not determine replication lag (node may not be initialized yet)" + return 0 + fi + + local diff=$((offset - lag)) + log_info "Replication offset: $offset, replica lag: ${diff} bytes" + + if [[ $diff -lt 1024 ]]; then + log_info "✓ Replication lag acceptable (< 1KB)" + return 0 + else + log_warn "Replication lag high: ${diff} bytes (may indicate slow network)" + return 0 # Don't fail, as lag is expected right after failover + fi +} + +# === Test 5: Verify old master becomes replica on recovery === +test_old_master_recovery() { + local old_master_port=$1 + local new_master_port=$2 + + log_info "Test 5: Restart old master and verify it becomes replica" + + # Restart old master + log_warn "Restarting old master on port $old_master_port..." 
+ + # In docker-compose, this would be: docker-compose restart redis-master + # For now, just verify Sentinel can find it when we manually restart + + # This test is environment-specific and may require manual intervention + log_warn "Skipping manual restart (environment-specific)" +} + +# === Test 6: Verify quorum resilience === +test_quorum_resilience() { + log_info "Test 6: Verify quorum with 2 of 3 Sentinels (down 1)" + + # Kill one Sentinel + log_warn "Stopping Sentinel on port 26381..." + redis-cli -p 26381 shutdown >/dev/null 2>&1 || true + + sleep 2 + + # Verify remaining 2 Sentinels can still monitor + local quorum_healthy=0 + for port in 26379 26380; do + if redis-cli -p "$port" sentinel masters >/dev/null 2>&1; then + log_info "Sentinel on port $port: still responsive (2/3 quorum)" + quorum_healthy=1 + fi + done + + if [[ $quorum_healthy -eq 1 ]]; then + log_info "✓ Quorum resilience verified" + else + log_error "Quorum lost with 1 Sentinel down" + fi +} + +# === Main test sequence === +main() { + log_info "Starting Sentinel failover tests..." + echo "" + + # Check if docker-compose is running + if ! docker-compose -f "${SCRIPT_DIR}/docker-compose-sentinel.yml" ps sentinel-1 >/dev/null 2>&1; then + log_error "docker-compose not running. Start with: $SCRIPT_DIR/start-sentinel.sh local" + exit 1 + fi + + # Run tests + if ! test_sentinel_health; then + log_error "Sentinel health check failed" + exit 1 + fi + echo "" + + if ! master_port=$(test_master_info); then + log_error "Failed to identify master" + exit 1 + fi + echo "" + + if ! new_master_port=$(test_failover_detection "$master_port"); then + log_error "Failover detection failed" + exit 1 + fi + echo "" + + test_replication_lag "$new_master_port" + echo "" + + test_quorum_resilience + echo "" + + log_info "Failover test completed!" 
+ echo "" + echo "Summary:" + echo " ✓ Sentinel quorum healthy" + echo " ✓ Failover detection working (< 40s)" + echo " ✓ New master elected" + echo " ✓ Replication lag acceptable" + echo " ✓ Quorum resilience verified" +} + +main "$@" diff --git a/crates/auths-sdk/src/domains/agents/service.rs b/crates/auths-sdk/src/domains/agents/service.rs index 84a4fe58..141fb2cd 100644 --- a/crates/auths-sdk/src/domains/agents/service.rs +++ b/crates/auths-sdk/src/domains/agents/service.rs @@ -140,7 +140,17 @@ impl AgentService { agent_did: &str, capability: &str, now: chrono::DateTime, + request_timestamp: chrono::DateTime, ) -> Result { + // Validate clock skew (±5 minutes) + let time_diff = { + let duration = now.signed_duration_since(request_timestamp); + duration.num_seconds().unsigned_abs() + }; + if time_diff > 300 { + return Err("Clock skew too large".to_string()); + } + // Verify signature using IdentityResolver // TODO: Integrate with IdentityResolver when available diff --git a/docs/DOMAIN_ARCHITECTURE.md b/docs/DOMAIN_ARCHITECTURE.md new file mode 100644 index 00000000..39184458 --- /dev/null +++ b/docs/DOMAIN_ARCHITECTURE.md @@ -0,0 +1,540 @@ +# Domain Architecture: Entity Ownership & API Contracts + +**Status**: Production Readiness Phase 1.5 (fn-89.0) +**Last Updated**: 2026-03-29 +**Owner**: Architecture / SDK Team + +--- + +## Overview + +This document defines the foundational domain entity ownership map and API contracts that all auths-api services and infrastructure depend on. It ensures consistent semantics across identity, device, signing, auth, and compliance domains. 
+ +--- + +## Domain Entity Ownership Map + +### Identity Domain (`domains/identity/`) + +**Entities**: +- Developer identity (did:keri) +- Agent provisioning state +- Agent lifecycle (provision → refresh → revoke or expire) + +**Storage**: +- Redis key: `agents:{namespace}:{agent_id}` +- TTL: `agent.expires_at` +- Write-through cache (primary source of truth is Redis during normal operation) + +**Cache Invalidation**: +- On `agent.provisioned` event +- On `agent.revoked` event +- On `agent.expired` event (fn-89.9 expiry job) + +**Lifecycle**: +- `provision` → `active` → `refresh` (token) → `revoke` or `expire` + +**API Endpoints**: +- `GET /v1/agents` (list agents in namespace) +- `GET /v1/agents/{id}` (get agent details) +- `POST /v1/agents` (provision new agent) +- `DELETE /v1/agents/{id}` (revoke agent) + +--- + +### Device Domain (`domains/device/`) + +**Entities**: +- Agent device keys (Ed25519 public keys) +- Device attestations +- Key rotation state + +**Storage**: +- Redis key: `device_keys:{namespace}:{agent_id}:{device_id}` +- TTL: `agent.expires_at` (cascade with agent) +- Indexed hash for fast lookups + +**Cache Invalidation**: +- On `device.key_rotated` event +- On agent revocation (cascade delete all device keys) + +**Lifecycle**: +- Linked at agent provision +- Rotated periodically (device refresh, future work) +- Revoked with agent + +**API Endpoints**: +- `GET /v1/agents/{id}/devices` (list agent's device keys) +- `POST /v1/agents/{id}/devices/{device_id}/rotate` (rotate key, future) + +--- + +### Auth Domain (`domains/auth/`) + +**Entities**: +- Bearer tokens +- Token expiry +- Agent authorization state +- Token capabilities + +**Storage**: +- Redis key: `tokens:{token_hash}` → `{agent_id, expires_at, capabilities}` +- TTL: `token.expires_at` +- Hash-based for O(1) lookup + +**Cache Invalidation**: +- On `token.refreshed` event +- On agent revocation (cascade invalidate all tokens) +- On token expiry (TTL cleanup) + +**Lifecycle**: +- Issued at 
agent provision (initial token) +- Refreshed on demand via `/v1/agents/{id}/token/refresh` +- Invalidated on revoke +- Auto-expired via TTL + +**API Endpoints**: +- `POST /v1/agents/{id}/token/refresh` (refresh token) +- `POST /v1/auth/validate` (internal: validate token) + +--- + +### Compliance Domain (`domains/compliance/`) + +**Entities**: +- Audit events (immutable) +- Approval workflows (future, fn-90) +- Policy rules (future, fn-90) + +**Storage**: +- Redis AOF (append-only file) for durability (fn-89.2) +- Immutable audit log file (retention: 90 days) +- Queryable via `/v1/audit` endpoint + +**Cache Invalidation**: +- None (append-only, never invalidated) + +**Lifecycle**: +- Immutable (created once, never modified) +- Retained for 90 days +- Queryable with filters (namespace, event type, date range) + +**API Endpoints**: +- `GET /v1/audit` (list, filter, query audit logs) +- `GET /v1/audit/{event_id}` (get specific event) + +--- + +### Webhook Domain (`domains/webhooks/`) + +**Entities**: +- Webhook subscriptions (admin-configured) +- Delivery state (pending, delivered, failed) +- Dead-letter queue (for failed deliveries) + +**Storage**: +- Redis hash: `webhooks:{webhook_id}` (subscription config) +- Redis sorted set: `dlq:{domain_name}` (failed deliveries, by timestamp) +- Persistent (no TTL unless explicitly deleted) + +**Cache Invalidation**: +- On subscription change (register, update, delete) +- Manual: admin deletes subscription + +**Lifecycle**: +- Registered by admin via bootstrap or API +- Fired on domain events (provision, revoke, etc.) 
+- Retry on failure (exponential backoff) +- Dead-lettered after N failures + +**API Endpoints**: +- `POST /v1/webhooks` (register webhook) +- `GET /v1/webhooks` (list subscriptions) +- `DELETE /v1/webhooks/{id}` (unregister) +- `POST /v1/webhooks/{id}/test` (test delivery) + +--- + +## Cross-Domain Event Contracts + +### Identity Domain Events + +**`agent.provisioned`** +- **Emitted by**: `Identity::provision_agent()` in `domains/identity/provision.rs` +- **Payload**: + ```json + { + "event_type": "agent.provisioned", + "agent_id": "agent_ABC...", + "namespace": "myapp", + "delegator_did": "did:keri:...", + "device_public_key": "z...", + "created_at": "2026-03-29T11:00:00Z", + "expires_at": "2027-03-29T11:00:00Z" + } + ``` +- **Triggers**: + - Write to Redis: `agents:{namespace}:{agent_id}` + - Emit to audit log (fn-89.5) + - Queue webhook delivery (fn-89.15) + - Update agent list cache +- **Transaction**: Atomic via Redis MULTI/EXEC + +**`agent.revoked`** +- **Emitted by**: `Identity::revoke_agent()` in `domains/identity/provision.rs` +- **Payload**: + ```json + { + "event_type": "agent.revoked", + "agent_id": "agent_ABC...", + "revoked_by": "admin@example.com", + "revoke_reason": "Compromised key / User request / Expiration", + "revoked_at": "2026-03-29T12:00:00Z" + } + ``` +- **Triggers**: + - Invalidate Redis: `agents:{namespace}:{agent_id}` (DELETE) + - Cascade: invalidate all `device_keys:*:{agent_id}:*` + - Cascade: invalidate all `tokens:*` for this agent + - Emit to audit log + - Queue webhook delivery +- **Transaction**: Atomic up to cache invalidation; webhooks are async + +**`agent.expired`** +- **Emitted by**: Background expiry job (fn-89.9: token lifecycle) +- **Payload**: + ```json + { + "event_type": "agent.expired", + "agent_id": "agent_ABC...", + "originally_expired_at": "2027-03-29T11:00:00Z" + } + ``` +- **Triggers**: + - Delete from Redis: agent state + device keys + tokens + - Emit to audit log + - Queue webhook delivery +- **Transaction**: 
Atomic + +### Device Domain Events + +**`device.key_rotated`** +- **Emitted by**: Device rotation endpoint (future: fn-90.5, `domains/device/service.rs`) +- **Payload**: + ```json + { + "event_type": "device.key_rotated", + "agent_id": "agent_ABC...", + "device_id": "device_XYZ...", + "old_key_hash": "sha256:...", + "new_key_hash": "sha256:...", + "rotated_at": "2026-03-29T13:00:00Z" + } + ``` +- **Triggers**: + - Update Redis: `device_keys:{namespace}:{agent_id}:{device_id}` + - Emit to audit log (optional) + - Queue webhook delivery (optional) +- **Transaction**: Atomic + +### Auth Domain Events + +**`token.refreshed`** +- **Emitted by**: `Auth::refresh_token()` → `POST /v1/agents/{id}/token/refresh` (fn-89.9) +- **Payload**: + ```json + { + "event_type": "token.refreshed", + "agent_id": "agent_ABC...", + "new_expires_at": "2026-04-05T11:00:00Z", + "new_token_hash": "sha256:..." + } + ``` +- **Triggers**: + - Update Redis: `tokens:{token_hash}` + - Emit to audit log + - Queue webhook delivery (optional) +- **Transaction**: Atomic + +--- + +## Transaction Boundary Definitions + +### Bootstrap Workflow (fn-89.8) + +**Steps**: +1. Challenge-response (client proves key ownership) +2. Register identity (store in Git, optional) +3. Provision first agent for that identity + +**Atomicity**: All-or-nothing +- If any step fails, rollback to initial state +- If agent provision fails, delete identity from IdentityResolver + +**Storage Locations**: +- Agent state → Redis +- Identity → Git refs `refs/auths/identities/{namespace}/{did}` (optional) + +**Failure Mode**: If bootstrap fails partway through, retry from step 1 (idempotent) + +### Agent Provisioning Workflow + +**Steps**: +1. Validate capabilities against namespace policy +2. Sign attestation (device signature required) +3. Write agent state to Redis cache +4. Emit `agent.provisioned` event +5. 
Queue webhooks asynchronously + +**Atomicity**: All-or-nothing up to webhook queueing +- Redis MULTI/EXEC for steps 1-4 +- Webhooks are async (best-effort, retryable) + +**Rollback**: If any step fails, delete created agent state and fail fast + +### Token Refresh Workflow + +**Steps**: +1. Validate current token (lookup in `tokens:{token_hash}`) +2. Generate new token (from crypto library) +3. Update Redis cache: `tokens:{old_hash}` → DELETE, `tokens:{new_hash}` → WRITE +4. Emit `token.refreshed` event +5. Return new token to client + +**Atomicity**: Atomic (no external events until return) +- Redis MULTI/EXEC for token cache update +- Event emission is part of the transaction + +**Fallback**: If Redis write fails, client can retry (idempotent if implemented) + +### Agent Revocation Workflow + +**Steps**: +1. Mark agent as revoked in policy store +2. Invalidate Redis: agent state, device keys, tokens +3. Emit `agent.revoked` event +4. Queue webhooks asynchronously + +**Atomicity**: Atomic up to cache invalidation +- Steps 1-3 are atomic (single Redis transaction) +- Webhooks are async + +**Cascade**: Revoking an agent automatically: +- Deletes all device keys (`device_keys:*:{agent_id}:*`) +- Invalidates all tokens for that agent +- No new tokens can be issued + +--- + +## Domain Contracts & Public API Surface + +### Identity Domain Public API + +```rust +/// Provision a new agent for the given namespace. 
+/// +/// Args: +/// * `namespace`: Namespace identifier +/// * `config`: ProvisionConfig (identity, capabilities, ttl) +/// * `identity_resolver`: For storing identity (optional) +/// * `clock`: For timestamp injection +/// +/// Usage: +/// ```ignore +/// let agent = identity.provision_agent( +/// "myapp", +/// config, +/// &identity_resolver, +/// &clock, +/// ).await?; +/// ``` +pub async fn provision_agent( + namespace: &str, + config: ProvisionConfig, + identity_resolver: &dyn IdentityResolver, + clock: &dyn ClockProvider, +) -> Result; + +/// Revoke an agent (marks as revoked, invalidates cache). +pub async fn revoke_agent( + namespace: &str, + agent_id: &str, + revoked_by: &str, + reason: &str, + clock: &dyn ClockProvider, +) -> Result<(), RevocationError>; + +/// Get agent details (cache lookup). +pub async fn get_agent(namespace: &str, agent_id: &str) -> Result; + +/// List agents in namespace (pagination support in fn-89.13). +pub async fn list_agents( + namespace: &str, + limit: usize, + offset: usize, +) -> Result, QueryError>; +``` + +### Auth Domain Public API + +```rust +/// Validate a bearer token (lookup in tokens cache). +pub async fn validate_token( + namespace: &str, + token: &str, +) -> Result; + +/// Refresh a token (issue new token, invalidate old one). +pub async fn refresh_token( + namespace: &str, + agent_id: &str, + current_token: &str, + ttl_seconds: u64, + clock: &dyn ClockProvider, +) -> Result; + +/// Check if agent has a capability. +pub async fn check_capability( + namespace: &str, + agent_id: &str, + capability: &str, +) -> Result; +``` + +### Compliance Domain Public API + +```rust +/// Emit an audit event (write to audit log + Redis AOF). +pub async fn emit_audit_event(event: AuditEvent) -> Result<(), StorageError>; + +/// Query audit logs with filters. 
+pub async fn query_audit_logs( + namespace: &str, + filter: AuditFilter, + limit: usize, +) -> Result, QueryError>; +``` + +### Webhook Domain Public API + +```rust +/// Dispatch a webhook to all registered subscribers. +pub async fn dispatch_webhook( + domain: &str, + event: &str, + payload: serde_json::Value, +) -> Result<(), DispatchError>; + +/// Register a new webhook subscription. +pub async fn register_webhook( + namespace: &str, + url: &str, + events: Vec, + secret: &str, +) -> Result; + +/// List all webhook subscriptions for a namespace. +pub async fn list_webhooks(namespace: &str) -> Result, QueryError>; +``` + +--- + +## Storage Locality Reference + +### Redis (Hot Cache) + +| Key Pattern | Type | TTL | Usage | +|---|---|---|---| +| `agents:{ns}:{agent_id}` | Hash | `agent.expires_at` | Agent state (name, created_at, device keys list) | +| `device_keys:{ns}:{agent_id}:{device_id}` | Hash | `agent.expires_at` | Device public key + metadata | +| `tokens:{token_hash}` | Hash | `token.expires_at` | Token metadata (agent_id, capabilities, expires_at) | +| `webhooks:{webhook_id}` | Hash | None (persistent) | Webhook subscription config (url, events, secret) | +| `dlq:{domain_name}` | Sorted Set | None (persistent) | Dead-letter queue (failed webhook deliveries, scored by timestamp) | + +### Audit Log (Immutable) + +- **Redis AOF**: Durability mechanism (fn-89.2) +- **Audit Log File**: Queryable via `/v1/audit` endpoint (fn-89.14) +- **Retention**: 90 days (configurable) +- **Format**: JSONL (one event per line) + +### Git (Optional, via IdentityResolver) + +- **Path**: `refs/auths/identities/{namespace}/{did}` +- **Contents**: Human-readable identity metadata +- **Purpose**: Optional visibility into registered identities +- **Note**: Not used for runtime lookups (cache-first via Redis) + +--- + +## Domain Dependency Diagram + +``` +┌─────────────────────────────────────────────────────┐ +│ auths-api HTTP Routes Layer │ +│ /v1/agents, /v1/tokens, /v1/audit, 
/v1/webhooks │ +└─────────────────┬───────────────────────────────────┘ + │ + ┌───────────┼────────────────────────────┐ + │ │ │ + v v v +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Identity │ │ Auth │ │ Compliance │ +│ Domain │ │ Domain │ │ Domain │ +│ │ │ │ │ │ +│ • provision │ │ • validate │ │ • audit log │ +│ • revoke │ │ • refresh │ │ • queries │ +│ • list │ │ • capability │ │ │ +└──────┬───────┘ └──────┬───────┘ └──────────────┘ + │ │ │ + └─────────────────┼───────────────┘ + │ + ┌───────┴────────┐ + │ │ + v v + ┌──────────────┐ ┌──────────────┐ + │ Webhook │ │ Redis │ + │ Domain │ │ (Cache) │ + │ │ │ │ + │ • dispatch │ │ • MULTI/EXEC │ + │ • register │ │ • TTL mgmt │ + │ • dead-letter│ │ • Sentinel HA│ + └──────────────┘ └──────┬───────┘ + │ + v + ┌──────────────┐ + │ Sentinel HA │ + │ + AOF backup │ + └──────────────┘ +``` + +--- + +## Key Design Principles + +1. **Redis as Source of Truth**: For hot data (agents, tokens). Git is optional (identity visibility only). +2. **Event-Driven**: All state changes emit events for audit + webhooks. +3. **Transaction Boundaries**: Atomic up to cache; webhooks are best-effort async. +4. **TTL-Based Cleanup**: No explicit delete cron; Redis TTL handles cleanup. +5. **Cascade on Revoke**: Agent revocation cascades to devices and tokens. +6. **Audit Trail**: All domain events logged for compliance (fn-89.5, fn-89.14). 
+ +--- + +## Integration Checklist (for fn-89.1 onwards) + +- [ ] Read this document before starting fn-89.1 +- [ ] Reference Redis keys from "Storage Locality" section +- [ ] Emit events per "Cross-Domain Event Contracts" +- [ ] Respect transaction boundaries from "Transaction Boundary Definitions" +- [ ] Use public APIs from "Domain Contracts & Public API Surface" + +--- + +**Related Tasks**: +- fn-89.1: Redis Sentinel + failover +- fn-89.2: AOF backup + point-in-time recovery +- fn-89.5: Structured audit logging (emit_audit_event) +- fn-89.9: Token refresh endpoint +- fn-89.14: Audit query endpoint +- fn-89.15: Webhook delivery diff --git a/docs/PRODUCTION_REDIS_HA.md b/docs/PRODUCTION_REDIS_HA.md new file mode 100644 index 00000000..8b854a51 --- /dev/null +++ b/docs/PRODUCTION_REDIS_HA.md @@ -0,0 +1,511 @@ +# Production Redis HA Setup Guide + +**Related**: fn-89.1 (Redis Sentinel + failover configuration and docs) + +Redis high availability is **critical** for auths-api. This document covers four deployment patterns with increasing operational overhead vs. cost. + +--- + +## Quick Comparison + +| Platform | Failover | Backups | Cost | Operational Load | +|----------|----------|---------|------|------------------| +| **Managed (Upstash/ElastiCache/Memorystore)** | Automatic | Automatic | $$$ | Minimal | +| **Self-Hosted EC2 + Sentinel** | Automatic | Manual (fn-89.2) | $ | Medium | +| **Self-Hosted Docker + Sentinel** | Automatic | Manual | $ | Low (testing) | +| **Single Master (NOT recommended for production)** | None | Manual | $ | None (risky) | + +**Recommendation**: Start with managed (Upstash or AWS ElastiCache) for production. Self-host Sentinel only if you need cost control + accept operational complexity. 
+ +--- + +## Architecture Overview + +### Managed Services (Upstash, ElastiCache, Memorystore) + +``` +┌─────────────────────────────────┐ +│ auths-api (replicas) │ +│ (multiple availability zones) │ +└────────────┬────────────────────┘ + │ Connect to service endpoint + │ (auto-discovers master) + v + ┌────────────────────┐ + │ Managed Redis HA │ + │ (Master + Replicas)│ + │ - Auto-failover │ + │ - Auto-backups │ + │ - Monitoring │ + └────────────────────┘ +``` + +### Self-Hosted (EC2/Kubernetes + Sentinel) + +``` +┌──────────────────────────────────────────────────┐ +│ auths-api (multiple pods/instances) │ +│ (Kubernetes or EC2 Auto Scaling Group) │ +└────────────┬─────────────────────────────────────┘ + │ Connect to Sentinel (quorum) + │ + ┌───────┴────────────┐ + │ │ + v v +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Sentinel 1 │ │ Sentinel 2 │ │ Sentinel 3 │ +│ (port 26379) │ (port 26379) │ (port 26379) +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ monitors │ monitors │ monitors + │ │ │ + └──────────────────┼──────────────────┘ + │ quorum (2 of 3) + ┌────────────┴────────────┐ + │ │ + v v + ┌─────────────┐ ┌──────────────┐ + │ Redis │ replicates│ Redis │ + │ Master │to │ Replica 1 │ + └─────────────┘ └──────────────┘ + │ replicates to + v + ┌──────────────┐ + │ Redis │ + │ Replica 2 │ + └──────────────┘ +``` + +--- + +## Platform 1: AWS ElastiCache (Recommended for AWS) + +### Setup + +1. **Create Redis Cluster with Multi-AZ Failover**: + ```bash + aws elasticache create-replication-group \ + --replication-group-id auths-api-cache \ + --replication-group-description "auths-api-cache" \ + --engine redis \ + --engine-version 7.0 \ + --cache-node-type cache.r6g.xlarge \ + --num-cache-clusters 3 \ + --automatic-failover-enabled \ + --multi-az-enabled \ + --at-rest-encryption-enabled \ + --transit-encryption-enabled \ + --auth-token "your-secure-token-here" + ``` + +2. 
**Retrieve Endpoint**: + ```bash + aws elasticache describe-replication-groups \ + --replication-group-id auths-api-cache \ + --query 'ReplicationGroups[0].ConfigurationEndpoint' + ``` + Returns: `auths-api-cache.abc123.ng.0001.use1.cache.amazonaws.com:6379` + +3. **Security Group**: Allow inbound on port 6379 from auths-api security group. + +### Configuration + +In auths-api config (e.g., `config/redis.toml`): +```toml +[redis] +endpoint = "redis://@auths-api-cache.abc123.ng.0001.use1.cache.amazonaws.com:6379" +# ElastiCache handles replication + failover automatically +# Connection string directly points to cluster endpoint +``` + +### Failover Behavior + +- **Detection Time**: ~15-30s (AWS-managed) +- **RTO** (Recovery Time Objective): < 1 minute +- **Automatic**: No manual intervention needed +- **Transparency**: Connection string remains valid during failover + +### Backups + +```bash +# Automatic snapshots (can configure retention) +aws elasticache create-snapshot \ + --replication-group-id auths-api-cache \ + --snapshot-name auths-api-backup-$(date +%Y%m%d) + +# Point-in-time recovery via automated snapshots +# (See fn-89.2 for AOF backup strategy) +``` + +### Cost + +- `cache.r6g.xlarge` (8GB): ~$0.35/hour (~$250/month) × 3 nodes = **~$750/month** +- Multi-AZ: +10% cost +- Data transfer: varies (typically $0.01/GB out) +- **Total**: ~$800-1000/month for typical workload + +--- + +## Platform 2: Upstash (Recommended for Cost-Conscious / Serverless) + +### Setup + +1. **Create Redis Database**: + - Go to https://console.upstash.com/redis + - Click "Create Database" + - Region: Select closest to app (US-East, EU-West, etc.) + - Eviction Policy: `allkeys-lru` (for cache, safe to evict) + - Enable "Max Retries" for client resilience + +2. 
**Copy Connection String**: + ``` + redis://default:your-auth-token@your-region-xxxxx.upstash.io:xxxxx + ``` + +### Configuration + +In auths-api config: +```toml +[redis] +endpoint = "redis://default:your-auth-token@your-region-xxxxx.upstash.io:xxxxx" +# Upstash provides automatic failover via managed infrastructure +``` + +### Failover Behavior + +- **Detection Time**: ~5-10s (Upstash-managed) +- **RTO**: < 30s +- **Automatic**: Fully managed, no intervention +- **Transparency**: Connection string remains valid + +### Backups + +Upstash provides: +- Automatic 24-hour retention snapshots +- Point-in-time recovery (with premium tier) +- Daily backups (backup tier) + +```bash +# No manual backups needed; configure via Upstash console +# Premium: Enable backup for point-in-time recovery +``` + +### Cost + +- **Free Tier**: 10,000 commands/day, 256MB, single replica +- **Starter**: $9/month (1GB, Infra Multi-Master Replication) +- **Pro**: $199/month (16GB) +- **Enterprise**: Contact sales +- **Recommended for auths-api**: Pro or Enterprise + +--- + +## Platform 3: GCP Memorystore (Recommended for Google Cloud) + +### Setup + +1. **Create Redis Instance**: + ```bash + gcloud redis instances create auths-api-cache \ + --size=4 \ + --region=us-central1 \ + --tier=standard \ + --redis-version=redis_7_0 \ + --enable-auth \ + --zone=us-central1-a + ``` + +2. **Retrieve Connection Info**: + ```bash + gcloud redis instances describe auths-api-cache \ + --region=us-central1 + ``` + Returns: `host` (IP only, no DNS) and `port` + +3. **Network**: Redis is private to VPC; auths-api must be in same VPC. 
+ +### Configuration + +In auths-api config: +```toml +[redis] +endpoint = "redis://default:your-auth-password@10.0.0.3:6379" +# Note: Memorystore uses IP addresses, not DNS names +``` + +### Failover Behavior + +- **Detection Time**: ~30s (automatic) +- **RTO**: < 1 minute +- **Automatic**: Standard tier provides automatic failover +- **Transparency**: Connection via private IP + +### Backups + +```bash +# Manual backup: export an RDB snapshot to Cloud Storage +gcloud redis instances export gs://your-backup-bucket/auths-api-cache.rdb \ + auths-api-cache \ + --region=us-central1 + +# Scheduled backups (backup tier) +# Set retention in GCP console +``` + +### Cost + +- **Basic tier (no HA)**: $0.11/GB/month × 4GB = ~$44/month +- **Standard tier (HA, cross-zone replica)**: +100% cost = ~$88/month +- **Data transfer**: Free within GCP, $0.12/GB out to internet +- **Recommended for auths-api**: Standard tier (~$88/month) + +--- + +## Platform 4: Self-Hosted (EC2 + Sentinel) + +Use this **only** if: +- You must minimize cloud costs +- You have ops expertise for Redis + Sentinel management +- Your organization already manages self-hosted Redis + +### Prerequisites + +- 3 EC2 instances (t3.large) in different availability zones + - One for Redis Master + - Two for Redis Replicas + - Plus 3 Sentinel instances (can co-locate on replicas) +- Redis 7.0+ installed +- Sentinel config from `crates/auths-deployment/config/sentinel.conf` + +### Setup + +1. **Install Redis on all 3 instances**: + ```bash + # On all instances: + sudo yum install redis -y + sudo systemctl enable redis + sudo systemctl start redis + ``` + +2. **Configure Master** (first instance): + - Edit `/etc/redis.conf`: + ``` + port 6379 + bind 0.0.0.0 + appendonly yes + requirepass your-redis-password + ``` + +3. **Configure Replicas** (second and third instances): + ``` + port 6379 + bind 0.0.0.0 + replicaof <master-ip> 6379 + requirepass your-redis-password + masterauth your-redis-password + appendonly yes + ``` + +4. 
**Deploy Sentinel** (all 3 instances): + ```bash + # Copy sentinel.conf from crates/auths-deployment/config/sentinel.conf + sudo cp sentinel.conf /etc/redis-sentinel.conf + sudo chown redis:redis /etc/redis-sentinel.conf + + # Edit /etc/redis-sentinel.conf: + # - Change bind to specific IP or 0.0.0.0 + # - Set down_after_milliseconds 30000 (30s) + # - Set parallel_syncs 1 + + sudo redis-sentinel /etc/redis-sentinel.conf + ``` + +5. **Test Failover**: + ```bash + # Run test script (see fn-89.1) + ./crates/auths-deployment/scripts/test-sentinel-failover.sh + ``` + +### Configuration + +In auths-api config: +```toml +[redis] +# Sentinel discovery (client resolves master dynamically) +endpoint = "redis-sentinel://user:password@sentinel1:26379,sentinel2:26379,sentinel3:26379?service_name=mymaster" +``` + +### Failover Behavior + +- **Detection Time**: ~30s (configurable) +- **RTO**: ~1 minute +- **Manual Intervention**: Monitor Sentinel; no auto-healing for failed machines +- **Operational Overhead**: 2-4 hours/month (monitoring, updates, troubleshooting) + +### Backups + +Manual via `redis-cli` or AOF (see fn-89.2): +```bash +# Manual snapshot +redis-cli BGSAVE + +# AOF (automatic incremental backups) +# Enable in redis.conf: appendonly yes +# See fn-89.2 for point-in-time recovery +``` + +### Cost + +- **EC2 (3 × t3.large)**: $0.10/hour × 3 = **$215/month** +- **Elastic IPs (3)**: ~$1/month +- **EBS storage (3 × 100GB)**: ~$15/month +- **Ops burden**: 2-4 hours/month +- **Total**: ~$230/month + ops time + +--- + +## Connection Resilience + +### Client-Side Retry Logic + +All auths-api clients must implement exponential backoff on Redis connection failures: + +```rust +// Pseudocode for auths-api client +const MAX_RETRIES: usize = 3; +const INITIAL_BACKOFF: Duration = Duration::from_millis(100); + +async fn connect_with_retry() -> Result { + for attempt in 0..MAX_RETRIES { + match redis_client.connect().await { + Ok(client) => return Ok(client), + Err(e) => { + let 
backoff = INITIAL_BACKOFF * 2u32.pow(attempt as u32); + log::warn!("Redis connect failed (attempt {}): {}, retry in {:?}", + attempt, e, backoff); + sleep(backoff).await; + } + } + } + Err(anyhow::anyhow!("Failed to connect after {} attempts", MAX_RETRIES)) +} +``` + +### Domain Entity Resilience (fn-89.0) + +Redis caches these auths-api entities: +- `agents:{namespace}:{agent_id}` (agent state, TTL = agent.expires_at) +- `tokens:{token_hash}` (token metadata, TTL = token.expires_at) +- `device_keys:*` (device keys, TTL = agent expiry) + +**On Redis unavailability** (fn-89.3 circuit breaker): +- **Authorization queries** (token validation): Return 503 Service Unavailable +- **Cache miss on agent lookup**: 503 (can't validate without cache) +- **Reads from replicas**: Fail over to secondary cache if available + +--- + +## Monitoring & Alerting + +### Key Metrics (fn-89.12) + +For any platform, monitor: +- **Replication lag**: < 1 second (normal), > 5s (alert) +- **Master failover count**: Should be 0-1/month (normal), > 3/month (investigate) +- **Connection pool health**: % connections alive (target: > 95%) +- **Cache hit ratio**: Should be > 90% for auths agents/tokens +- **Memory usage**: < 80% of allocated (auto-eviction at 100%) + +### Alerting + +Example Prometheus rules (fn-89.12): +```yaml +- alert: RedisMasterDown + expr: redis_up{role="master"} == 0 + for: 30s + action: page oncall + +- alert: RedisReplicationLag + expr: redis_replication_lag_bytes > 5242880 # 5MB + for: 2m + action: alert (not page) + +- alert: RedisMemoryHigh + expr: redis_memory_usage_percent > 80 + for: 5m + action: alert (check if cache needs size increase) +``` + +--- + +## Disaster Recovery + +### Recovery Time Objectives (RTO) + +| Failure Scenario | Managed | Self-Hosted | +|---|---|---| +| Master crashes | 1-2 minutes | 30 seconds (Sentinel) + manual failover | +| Entire region down | 5-10 minutes | Data loss (replicate to backup region) | +| Corrupted data | 24 hours (backup 
restore) | 24+ hours (manual restore from AOF) | + +### Backup Strategy (fn-89.2) + +- **Managed services**: Automatic daily snapshots (retention: 30 days) +- **Self-hosted**: AOF (append-only file) + daily snapshots to S3/GCS +- **Testing**: Monthly restore from backup to validation environment + +--- + +## Decision Tree: Which Platform? + +``` +┌─ AWS User? +│ └─→ Use AWS ElastiCache +│ (most integrated, auto-failover, managed backups) +│ +├─ Google Cloud User? +│ └─→ Use GCP Memorystore (Standard + HA) +│ (best for Kubernetes on GKE) +│ +├─ Serverless / Multi-cloud? +│ └─→ Use Upstash +│ (cheapest managed option, no infra) +│ +└─ On-premises / Self-hosted required? + └─→ Use EC2 + Sentinel + (cheapest, highest ops burden) +``` + +--- + +## Testing & Validation + +### Local Testing (Docker Compose) + +```bash +# Start Sentinel cluster +./crates/auths-deployment/scripts/start-sentinel.sh local + +# Run failover tests +./crates/auths-deployment/scripts/test-sentinel-failover.sh + +# Verify client retries on master kill +# (see test output) +``` + +### Production Validation (Chaos Engineering) + +For self-hosted: +1. Kill master in off-hours +2. Verify failover time < 30s +3. Verify client reconnects without request loss +4. Verify new master has all data +5. 
Document incident in runbook + +--- + +## References + +- [AWS ElastiCache User Guide](https://docs.aws.amazon.com/elasticache/) +- [Upstash Documentation](https://upstash.com/docs) +- [GCP Memorystore User Guide](https://cloud.google.com/memorystore/docs) +- [Redis Sentinel Documentation](https://redis.io/docs/management/sentinel/) +- Related: fn-89.0 (Domain Architecture), fn-89.2 (AOF Backups), fn-89.12 (Monitoring) diff --git a/docs/REDIS_AOF_BACKUP.md b/docs/REDIS_AOF_BACKUP.md new file mode 100644 index 00000000..8b980fbe --- /dev/null +++ b/docs/REDIS_AOF_BACKUP.md @@ -0,0 +1,461 @@ +# Redis AOF Backup & Point-in-Time Recovery + +**Related**: fn-89.2 (AOF backup automation and point-in-time recovery) + +This document covers automated AOF (Append-Only File) backup strategy, point-in-time recovery procedures, and monitoring for auths-api Redis. + +--- + +## Overview + +**Why AOF?** +- **Durability**: Survives crashes; captures every write operation +- **Granularity**: Point-in-time recovery to any moment in time +- **Compliance**: Immutable audit trail for audit events (fn-89.5) + +**Configuration**: +``` +appendonly yes # Enable AOF +appendfsync everysec # Fsync every 1 second (balance between durability + performance) +auto-aof-rewrite-percentage 100 # Rewrite when AOF grows 100% since last rewrite +auto-aof-rewrite-min-size 64mb # Don't rewrite unless > 64MB +``` + +--- + +## Architecture + +### Data Flow + +``` +┌────────────────┐ +│ auths-api │ +│ (writes data) │ +└────────┬───────┘ + │ Redis WRITE command + v + ┌─────────────────────────────┐ + │ Redis Master │ + │ • appendonly.aof (disk) │ + │ • AOF rewrite (compression) │ + │ • BGSAVE (snapshot) │ + └─────┬───────────────────────┘ + │ Replication + v + ┌──────────────┐ + │ Replica 1 │ + │ + Replica 2 │ + └──────────────┘ + + AOF grows over time: + ┌─────────────────────────────────────────┐ + │ appendonly.aof (~1KB per agent + events)│ + │ │ + │ Daily growth: ~50-100MB (10k agents) │ + │ Monthly size: 
~1.5-3GB │ + └─────────────────────────────────────────┘ + + ↓ Daily backup job (2am UTC) + + ┌──────────────────────────────────────┐ + │ S3 Backups (gzip compressed) │ + │ • redis-aof-20260329_020000.aof.gz │ + │ • Compression: ~100-200MB/day │ + │ • Retention: 30 days (~6GB storage) │ + └──────────────────────────────────────┘ +``` + +### Fsync Strategy Tradeoff + +| Fsync Strategy | Durability | Performance | Data Loss Risk | +|---|---|---|---| +| `everysec` (default) | Good | Minimal overhead | Max 1s of data (acceptable) | +| `always` | Best | 10-15% slower | None (but 10x slower) | +| `no` | Worst | Best | May lose minutes of writes | + +**Recommendation for auths-api**: `appendfsync everysec` +- Domain entities cached in Redis (agents, tokens) have TTL +- Token expiry is authoritative source, not AOF +- 1s durability window acceptable for agent state + +--- + +## Backup Automation + +### Daily Backup Script + +**Location**: `crates/auths-deployment/scripts/backup-redis-aof.sh` + +**Process**: +1. Verify Redis connectivity +2. Trigger AOF rewrite (`BGREWRITEAOF`) for compression +3. Copy compressed AOF file +4. Upload to S3 with gzip compression +5. Apply retention policy (delete backups >30 days old) +6. Log success/failure to CloudWatch + +**Cron Job Setup**: +```bash +# In production EC2/Kubernetes: +0 2 * * * cd /app && AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 6379 >> /var/log/redis-backup.log 2>&1 + +# With error notification: +0 2 * * * cd /app && ./backup-redis-aof.sh localhost 6379 || alert-oncall "Redis backup failed" +``` + +**Example Run**: +```bash +$ AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 6379 +[2026-03-29 02:00:00] [INFO] Verifying Redis connectivity (localhost:6379)... +[2026-03-29 02:00:00] [INFO] Redis reachable ✓ +[2026-03-29 02:00:00] [INFO] Triggering AOF rewrite (compaction)... +[2026-03-29 02:00:00] [INFO] Waiting for AOF rewrite... 
+[2026-03-29 02:00:02] [INFO] AOF rewrite completed +[2026-03-29 02:00:03] [INFO] Copying AOF to temporary location... +[2026-03-29 02:00:05] [INFO] Compressing AOF... +[2026-03-29 02:00:08] [INFO] Compressed AOF size: 125MB +[2026-03-29 02:00:10] [INFO] Uploading to S3: s3://auths-redis-backups/backups/redis-aof-20260329_020000.aof.gz +[2026-03-29 02:00:15] [INFO] ✓ Backup uploaded to S3 +[2026-03-29 02:00:16] [INFO] Applying retention policy (keeping 30 days)... +[2026-03-29 02:00:17] [INFO] ✓ Backup completed successfully +[2026-03-29 02:00:17] [INFO] Summary: +[2026-03-29 02:00:17] [INFO] Timestamp: 20260329_020000 +[2026-03-29 02:00:17] [INFO] Size: 125MB +[2026-03-29 02:00:17] [INFO] Location: s3://auths-redis-backups/backups/redis-aof-20260329_020000.aof.gz +[2026-03-29 02:00:17] [INFO] Redis: localhost:6379 +``` + +### S3 Bucket Setup + +```bash +# Create S3 bucket with versioning + lifecycle +aws s3api create-bucket \ + --bucket auths-redis-backups \ + --region us-east-1 + +# Enable versioning +aws s3api put-bucket-versioning \ + --bucket auths-redis-backups \ + --versioning-configuration Status=Enabled + +# Lifecycle policy: delete old backups after 30 days +cat > lifecycle.json << 'EOF' +{ + "Rules": [ + { + "Id": "DeleteOldBackups", + "Status": "Enabled", + "Prefix": "backups/", + "Expiration": { + "Days": 30 + }, + "NoncurrentVersionExpiration": { + "NoncurrentDays": 7 + } + } + ] +} +EOF + +aws s3api put-bucket-lifecycle-configuration \ + --bucket auths-redis-backups \ + --lifecycle-configuration file://lifecycle.json +``` + +### IAM Role + +Needed for EC2/EKS to upload backups: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::auths-redis-backups", + "arn:aws:s3:::auths-redis-backups/*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": "*" + } + ] +} +``` + +--- + +## 
Point-in-Time Recovery + +### Manual Recovery Procedure + +**Location**: `crates/auths-deployment/scripts/restore-redis-aof.sh` + +**Scenarios**: + +#### 1. Recover Latest Backup +```bash +# Restore most recent backup +./restore-redis-aof.sh latest localhost 6379 + +# OR specific date +./restore-redis-aof.sh latest localhost 6379 2026-03-28 + +# Output: +# [INFO] Finding latest backup... +# [INFO] Using: s3://auths-redis-backups/backups/redis-aof-20260329_020000.aof.gz +# [INFO] Downloading backup... +# [INFO] ✓ Backup downloaded +# [INFO] Decompressing... +# [INFO] Validating AOF integrity... +# [INFO] Backing up current AOF... +# [INFO] Stopping Redis... +# [INFO] ✓ Redis stopped +# [INFO] Replacing AOF file... +# [INFO] ✓ AOF replaced +# [INFO] Starting Redis... +# [INFO] ✓ Redis responding +# [INFO] Database size: 10247 keys +# [INFO] Memory usage: 512.5M +``` + +#### 2. Recover Specific Backup Date +```bash +# List backups from specific date (object names use YYYYMMDD_HHMMSS timestamps) +aws s3api list-objects-v2 \ + --bucket auths-redis-backups \ + --prefix "backups/redis-aof-20260325" \ + --region us-east-1 + +# Restore specific backup +./restore-redis-aof.sh s3://auths-redis-backups/backups/redis-aof-20260325_020000.aof.gz +``` + +#### 3. Recover from Local File +```bash +./restore-redis-aof.sh /backups/redis-aof-20260325.aof.gz localhost 6379 +``` + +### Recovery Time + +| Scenario | RTO | Notes | +|---|---|---| +| Latest backup | < 5 minutes | Download + decompress + verify + start | +| 7-day-old backup | < 10 minutes | Larger S3 download | +| Full month recovery | < 15 minutes | Limited by decompression + Redis startup | + +### Testing Recovery + +**Monthly Recovery Drill** (1st of each month): +```bash +#!/bin/bash +# Monthly point-in-time recovery test + +echo "Recovery Drill: $(date)" + +# 1. Identify a backup from 7 days ago +RECOVERY_DATE=$(date -u -d "7 days ago" +%Y-%m-%d) +echo "Recovering backup from $RECOVERY_DATE..." + +# 2. 
Start test Redis on alternate port +TEST_REDIS_PORT=6380 +redis-server --port $TEST_REDIS_PORT & +sleep 2 + +# 3. Restore backup +./restore-redis-aof.sh latest localhost $TEST_REDIS_PORT $RECOVERY_DATE + +# 4. Verify data +TEST_DBSIZE=$(redis-cli -p $TEST_REDIS_PORT dbsize | grep -oE '[0-9]+') +EXPECTED_AGENTS=$(redis-cli -p 6379 dbsize | grep -oE '[0-9]+') + +echo "Keys in restored backup: $TEST_DBSIZE" +echo "Keys in current data: $EXPECTED_AGENTS" + +if [[ $TEST_DBSIZE -gt 0 ]]; then + echo "✓ Recovery test PASSED" +else + echo "✗ Recovery test FAILED" +fi + +# 5. Cleanup +redis-cli -p $TEST_REDIS_PORT shutdown +``` + +--- + +## Monitoring & Alerting + +### CloudWatch Metrics + +Backup script automatically publishes: + +| Metric | Unit | Threshold | Action | +|---|---|---|---| +| `backup-size-mb` | MB | > 1000 | Alert (investigate disk usage) | +| `backup-success` | 0/1 | = 0 | Page oncall (backup failed) | +| `backup-duration-seconds` | Seconds | > 300 | Investigate (timeout) | +| `last-backup-age-hours` | Hours | > 25 | Alert (backup job missed) | + +**CloudWatch Dashboard**: +```json +{ + "widgets": [ + { + "type": "metric", + "properties": { + "metrics": [ + ["auths/redis", "backup-size-mb"], + ["auths/redis", "backup-success"], + ["auths/redis", "last-backup-age-hours"] + ], + "period": 300, + "stat": "Average", + "region": "us-east-1", + "title": "Redis Backup Health" + } + } + ] +} +``` + +### Alarms + +```bash +# Backup failure alarm (fires when the period's backup-success sum is below 1, i.e. a failed run reported 0) +aws cloudwatch put-metric-alarm \ + --alarm-name redis-backup-failed \ + --alarm-actions "arn:aws:sns:us-east-1:123456789:oncall" \ + --metric-name backup-success \ + --namespace auths/redis \ + --statistic Sum \ + --period 3600 \ + --evaluation-periods 1 \ + --threshold 1 \ + --comparison-operator LessThanThreshold + +# Backup size alarm +aws cloudwatch put-metric-alarm \ + --alarm-name redis-backup-size-high \ + --alarm-actions "arn:aws:sns:us-east-1:123456789:alerts" \ + --metric-name backup-size-mb \ + --namespace auths/redis \ + --evaluation-periods 1 \ + --statistic
Maximum \ + --period 300 \ + --evaluation-periods 1 \ + --threshold 1000 \ + --comparison-operator GreaterThanThreshold +``` + +--- + +## AOF Rewrite + +The AOF grows over time as commands accumulate. Redis automatically rewrites (compacts) it periodically. + +### Manual Rewrite + +```bash +# Trigger background rewrite (safe, doesn't block) +redis-cli BGREWRITEAOF + +# Monitor progress +redis-cli info persistence | grep aof_rewrite +# Output: aof_rewrite_in_progress:0 (complete) +``` + +### Automatic Rewrite + +Configured in `redis.conf` (these are Redis server settings, not Sentinel settings; comments must be on their own lines): +``` +# Rewrite when the AOF has grown 100% since the last rewrite +auto-aof-rewrite-percentage 100 +# Don't rewrite unless the AOF is larger than 64MB +auto-aof-rewrite-min-size 64mb +``` + +**Example**: +- Last rewrite produced 50MB AOF +- AOF grows to 100MB (100% growth) +- Redis triggers automatic rewrite +- New AOF compressed to ~50MB again + +--- + +## Retention Policy + +**Default**: 30-day rolling window + +**Rationale**: +- Covers 1 month of history (good for monthly recovery drills) +- Minimal S3 cost (~$6/month for 6GB) +- Weekly snapshots archived separately (fn-90 for long-term archive) + +**Adjust if needed**: +```bash +# 60-day retention +BACKUP_RETENTION_DAYS=60 ./backup-redis-aof.sh + +# S3 lifecycle policy update +aws s3api put-bucket-lifecycle-configuration \ + --bucket auths-redis-backups \ + --lifecycle-configuration '{"Rules": [{"Id": "DeleteAfter60Days", "Filter": {"Prefix": "backups/"}, "Expiration": {"Days": 60}, "Status": "Enabled"}]}' +``` + +--- + +## Troubleshooting + +### AOF File Corruption + +**Symptom**: `Bad file format` when Redis starts + +**Recovery**: +```bash +# AOF check tool (on Redis 7.0+ pass the manifest file in appendonlydir/ instead) +redis-check-aof --fix /var/lib/redis/appendonly.aof + +# Or manual recovery +./restore-redis-aof.sh latest # Restore from backup +``` + +### Backup Upload Timeout + +**Symptom**: Backup script fails at S3 upload + +**Solutions**: +```bash +# Increase timeout in script (line 60) +aws s3 cp ... --region ... --no-progress --cli-read-timeout 300 --cli-connect-timeout 60 + +# Or raise the retry count (aws s3 cp already uses multipart upload for large files) +AWS_MAX_ATTEMPTS=5 aws s3 cp ... --region ...
--sse AES256 +``` + +### Replication Lag After Recovery + +**Symptom**: Replicas out of sync after restore + +**Recovery**: +```bash +# Force replica resync +redis-cli -h replica slaveof no one # Stop replicating +redis-cli -h replica slaveof master 6379 # Resume from scratch + +# Monitor sync +redis-cli -h replica info replication | grep sync +``` + +--- + +## References + +- [Redis Persistence](https://redis.io/topics/persistence) +- [Redis AOF Format](https://redis.io/topics/protocol) +- Related: fn-89.0 (Domain Architecture), fn-89.1 (Sentinel HA), fn-89.3 (Circuit Breaker) diff --git a/docs/plans/api_plans.md b/docs/plans/api_plans.md new file mode 100644 index 00000000..2288c88c --- /dev/null +++ b/docs/plans/api_plans.md @@ -0,0 +1,1113 @@ +# auths-api: Product & Implementation Roadmap + +## fn-89 Foundation: What It Enables + +The fn-89 epic (domain-driven architecture, fn-89.0 contracts) establishes the **foundational layers** for auths-api: + +**What fn-89 Delivers**: +- **Domain clarity**: identity, auth, compliance, webhooks domains with explicit ownership +- **Transaction safety**: bootstrap/provisioning workflows with atomicity guarantees +- **Event-driven architecture**: all domain operations emit webhooks (provision, revoke, expire, refresh) +- **Observability**: per-domain metrics, Grafana dashboards, SLO-based alerting +- **SDK parity**: Rust + Python SDKs mirror domain structure (users understand via domain concepts) +- **Scalability foundation**: sharding strategy, per-shard failover, horizontal deployment patterns + +**Market Positioning**: +- Supply chain security (fintech, infra platforms, critical OSS) +- Multi-tenant SaaS with cryptographic delegation (orgs provision agents for services) +- Audit-driven security (full event trail with domain event sourcing) + +## Roadmap Overview + +After fn-89, auths-api is **provisionally deployable** but **functionally limited**. 
The roadmap builds on this foundation to unlock strategic use cases: + +| Epic | Use Case | Complexity | Value | +|------|----------|-----------|-------| +| fn-100 | Policy-driven agent provisioning | High | Very High | +| fn-101 | Artifact attestation & verification | Medium | Very High | +| fn-102 | Key rotation & renewal automation | Medium | High | +| fn-103 | Approval workflows (sensitive ops) | Medium | High | +| fn-104 | Agent quotas & rate limiting | Low | Medium | +| fn-105 | Multi-org federation & cross-org delegation | Very High | High | +| fn-106 | Compliance & audit export (SOC2, FedRAMP) | Medium | High | +| fn-107 | Agent analytics & usage observability | Low | Medium | + +--- + +## fn-100: Policy-Driven Agent Provisioning + +**Goal**: Orgs define rules that automatically provision agents based on namespace config, without manual admin intervention. + +**Use Case**: +- Org admin: "Whenever a CI pipeline starts in namespace X, auto-provision a ci-runner agent with signing + artifact capabilities, TTL 1 hour" +- Org admin: "Allow developers to self-provision personal agents for CLI use, limited to read-only capabilities" +- Org admin: "Revoke all agents in namespace Y that haven't been used in 30 days" + +### Sub-task fn-100.1: Policy Schema & Evaluation Engine + +**Description**: Define policy language and evaluation logic for agent provisioning rules. 
+ +**Deliverables**: +- Policy schema (JSON): trigger rules, agent templates, capability grants +- Policy evaluator: given namespace context, determine which agents to provision +- Admin API: `POST /v1/policies { namespace, rules, [triggers] }` + +**Pseudo-code**: +```rust +// Policy schema +pub struct AgentPolicy { + namespace: String, + rules: Vec<PolicyRule>, +} + +pub enum PolicyTrigger { + OnNamespaceBoot { }, // when namespace initializes + OnCiPipelineStart { ci_platform: String }, // "github", "gitlab" + OnDeveloperLogin { }, // when human logs in + OnSchedule { cron: String }, // "0 2 * * *" = daily 2am +} + +pub struct PolicyRule { + name: String, + trigger: PolicyTrigger, + condition: String, // "namespace.platform == 'github' && team == 'infra'" + agent_template: AgentTemplate, +} + +pub struct AgentTemplate { + name_pattern: String, // "ci-runner-{platform}-{id}" + capabilities: Vec<String>, // ["sign_artifacts", "publish_releases"] + ttl_seconds: u64, + rotation_period: Option<u64>, // auto-rotate every N seconds +} + +// Evaluator +pub async fn evaluate_policy( + namespace: &str, + policy: &AgentPolicy, + trigger: &PolicyTrigger, + context: &PolicyContext, // env vars, CI platform info, etc. +) -> Result<Vec<AgentTemplate>> { + // 1. Filter rules by trigger type + // 2. Evaluate conditions against context + // 3. Return matching templates +} + +pub async fn apply_policy( + namespace: &str, + templates: Vec<AgentTemplate>, + identity_service: &dyn IdentityService, +) -> Result<Vec<Agent>> { + // 1. For each template, provision agent + // 2. Emit policy.agent_provisioned event (webhook) + // 3.
Log to compliance domain +} +``` + +**Acceptance Criteria**: +- Policy schema supports at least 4 trigger types (boot, ci_start, login, schedule) +- Condition evaluator handles namespace context, env vars, user attributes +- Policy rules can grant multiple capabilities +- Admin can list, update, delete policies +- Policy changes take effect immediately (no restart) + +--- + +### Sub-task fn-100.2: Scheduled Policy Evaluation (Cron-like) + +**Description**: Periodic evaluation of policies (e.g., "revoke unused agents daily"). + +**Deliverables**: +- Background job: periodic policy evaluation based on cron schedule +- Metrics: policies evaluated/hour, agents auto-provisioned, agents auto-revoked +- Admin endpoint to trigger manual evaluation + +**Pseudo-code**: +```rust +pub struct ScheduledPolicy { + policy_id: String, + schedule: String, // cron expression +} + +pub async fn scheduled_policy_evaluator( + policies: Arc>, + scheduler: &dyn Scheduler, +) { + for policy in policies.iter() { + scheduler.schedule( + policy.schedule.clone(), + move || { + Box::pin(async { + let templates = evaluate_policy(&policy).await?; + apply_policy(templates).await?; + }) + }, + ).await?; + } +} + +// Example: auto-revoke unused agents +pub async fn revoke_unused_agents( + namespace: &str, + threshold_days: u64, +) -> Result> { + // 1. Query audit logs: which agents haven't been used in threshold_days + // 2. Batch revoke them + // 3. Emit agent.revoked events (webhooks) + // 4. Return revoked agent IDs +} +``` + +**Acceptance Criteria**: +- Cron-based scheduling works (daily, hourly, etc.) +- Unused agent cleanup runs reliably +- Metrics exposed: scheduled_policy_evaluations, agents_auto_provisioned, agents_auto_revoked +- Manual trigger endpoint: `POST /v1/policies/{id}/evaluate` for testing + +--- + +### Sub-task fn-100.3: Multi-Namespace Policies & Inheritance + +**Description**: Org-level policy templates that cascade to namespaces, with override capability. 
+ +**Deliverables**: +- Policy hierarchy: global > org > namespace > agent +- Inheritance: namespaces inherit org policies unless explicitly overridden +- Conflict resolution: most-specific policy wins + +**Pseudo-code**: +```rust +pub struct PolicyHierarchy { + global: Option, // Auths platform-wide + org: Option, // Org-level defaults + namespace: AgentPolicy, // Namespace-specific +} + +pub async fn resolve_policies( + namespace: &str, + org_id: &str, +) -> Result { + // 1. Load global policy (if any) + // 2. Load org policy (if any) + // 3. Load namespace policy + // 4. Merge: namespace overrides org, org overrides global + // 5. Return merged policy +} +``` + +**Acceptance Criteria**: +- Policy inheritance documented with examples +- Override syntax clear (namespace policy `extends` org policy) +- Conflict resolution predictable + +--- + +## fn-101: Artifact Attestation & Verification + +**Goal**: Agents sign artifacts (commits, releases, container images); third parties verify provenance without needing artifact server access. + +**Use Case**: +- CI agent signs build artifact (binary, container image, release tarball) +- Developer pushes signed artifact + attestation to public registry +- User downloads artifact, verifies signature: "This build came from org X's CI, signed with agent ID Y, approved on date Z" +- Supply chain attack prevention: fake artifact rejected because signature doesn't verify + +### Sub-task fn-101.1: Artifact Signing Service + +**Description**: Agents create deterministic, canonicalized signatures over artifacts. 
+ +**Deliverables**: +- Artifact signing API: `POST /v1/artifacts/sign { agent_id, artifact_hash, metadata }` +- Returns: signed attestation (JSON) +- Attestation includes: artifact hash, agent DID, timestamp, signature + +**Pseudo-code**: +```rust +pub struct ArtifactAttestation { + version: String, // "1.0" + artifact_hash: String, // sha256 of artifact + artifact_hash_algorithm: String, // "sha256" + agent_id: String, + agent_did: String, + signer_did: String, // dev who triggered the sign + signed_at: DateTime, + expires_at: Option>, + metadata: Map, // platform, build_id, version, etc. + signature: String, // base64url(ed25519_sig) +} + +pub async fn sign_artifact( + agent_id: &str, + artifact_hash: &str, + metadata: Map, + artifact_service: &dyn ArtifactService, + auth_domain: &dyn AuthDomain, +) -> Result { + // 1. Validate agent has "sign_artifacts" capability + // 2. Load agent's device key from device domain + // 3. Canonicalize attestation (json-canon, RFC 8785) + // 4. Sign with agent's key + // 5. Return attestation +} + +pub async fn verify_artifact_attestation( + attestation: &ArtifactAttestation, + identity_resolver: &dyn IdentityResolver, + current_time: DateTime, +) -> Result { + // 1. Validate signature (Ed25519 over canonical JSON) + // 2. Check not expired + // 3. Resolve agent_did from IdentityResolver + // 4. Return validity +} +``` + +**Acceptance Criteria**: +- Artifacts can be signed atomically with hash only (no file upload needed) +- Attestations are JSON, machine-readable +- Canonical form verified (json-canon) +- Verification works offline (given agent DID + public key) + +--- + +### Sub-task fn-101.2: Attestation Storage & Distribution + +**Description**: Store attestations for lookup and verification. 
+ +**Deliverables**: +- Attestation registry: `POST /v1/attestations { artifact_hash, attestation }` +- List attestations: `GET /v1/attestations?artifact_hash=...&agent_did=...` +- Storage: Redis (hot cache) + audit log (immutable) + +**Pseudo-code**: +```rust +pub struct AttestationRegistry { + backend: Arc, +} + +pub async fn register_attestation( + attestation: ArtifactAttestation, + registry: &AttestationRegistry, + compliance: &dyn ComplianceDomain, +) -> Result<()> { + // 1. Validate attestation signature + // 2. Store in Redis: attestations:{artifact_hash}:{agent_did} + // 3. Emit attestation.registered event (webhook) + // 4. Log to compliance domain +} + +pub async fn get_attestations( + artifact_hash: &str, + agent_did: Option<&str>, + registry: &AttestationRegistry, +) -> Result> { + // 1. Query Redis by artifact_hash + // 2. Optionally filter by agent_did + // 3. Return sorted by signed_at (newest first) +} +``` + +**Acceptance Criteria**: +- Attestations queryable by artifact hash + optional agent DID +- Immutable (no updates, only append) +- Exported in audit logs + +--- + +### Sub-task fn-101.3: Integration: Git Commit Signing + +**Description**: Extend Git commit signing to embed artifact attestations. + +**Deliverables**: +- auths-cli: `auths sign-commit` can include attestation hash +- Commit signatures include attestation reference +- Verification: git signature validates + attestation is lookupable + +**Pseudo-code**: +```rust +pub struct CommitSignatureWithAttestation { + commit_hash: String, + commit_signature: String, // existing + attestation_hash: Option, // hash of artifact being committed + attestation_reference: Option, // URL to attestation registry +} + +pub async fn sign_commit_with_attestation( + commit_hash: &str, + artifact_hash: Option<&str>, + agent_service: &dyn AgentService, +) -> Result { + // 1. Sign commit (existing logic) + // 2. If artifact_hash provided: + // a. Look up attestation + // b. 
Include reference in signature metadata + // 3. Return signature + attestation ref +} +``` + +**Acceptance Criteria**: +- Git commits can link to artifact attestations +- Attestation reference immutable after commit +- Verification chain: commit sig → attestation sig → agent DID + +--- + +## fn-102: Key Rotation & Renewal Automation + +**Goal**: Agents automatically rotate their signing keys on a schedule, maintaining continuous signing capability. + +**Use Case**: +- Long-lived agent (CI runner, bot) rotates its key every 30 days automatically +- Old key revoked after grace period (new key already active) +- No service disruption (clients always get latest key) + +### Sub-task fn-102.1: Agent Key Rotation Policy + +**Description**: Define rotation schedules and execution logic. + +**Deliverables**: +- Policy schema: rotation period, grace period, notifications +- Rotation scheduler: periodic background job +- Pre-rotation notification: webhook to inform subscribers + +**Pseudo-code**: +```rust +pub struct KeyRotationPolicy { + agent_id: String, + rotation_period: Duration, // e.g., 30 days + grace_period: Duration, // e.g., 7 days (old key still valid) + notify_before: Duration, // e.g., 3 days before rotation + auto_rotate: bool, +} + +pub async fn schedule_key_rotation( + agent_id: &str, + policy: KeyRotationPolicy, + scheduler: &dyn Scheduler, +) -> Result<()> { + // 1. Calculate next rotation time: now + policy.rotation_period + // 2. Schedule webhook notification: now + (rotation_period - notify_before) + // 3. Schedule rotation: now + rotation_period + // 4. Store scheduled rotations in Redis +} + +pub async fn perform_key_rotation( + agent_id: &str, + device_service: &dyn DeviceService, +) -> Result { + // 1. Generate new device key + // 2. Add new key to agent's device list + // 3. Mark old key as "rotating" (valid until grace_period expires) + // 4. Emit device.key_rotated event + // 5. 
Old key expires after grace_period (cleanup job) +} + +pub struct RotationResult { + agent_id: String, + old_key_did: String, + new_key_did: String, + new_key_public: String, + old_key_expires_at: DateTime, +} +``` + +**Acceptance Criteria**: +- Rotation period configurable per agent +- Pre-rotation notification sent (webhook event) +- Old key valid during grace period, then revoked automatically +- Audit trail: all rotations logged + +--- + +### Sub-task fn-102.2: Client Handling of Key Rotation + +**Description**: SDK clients handle transparent key rotation (fetch new key, use it). + +**Deliverables**: +- SDK: automatic key refresh on rotation +- Cache invalidation: old key removed from cache on expiry +- Error handling: retry with new key if old key rejected + +**Pseudo-code**: +```rust +// Rust SDK +pub async fn sign_with_rotation_aware( + agent_id: &str, + data: &[u8], + sdk: &Agent, +) -> Result { + loop { + match sdk.sign(data).await { + Ok(sig) => return Ok(sig), + Err(SignError::KeyExpired) => { + // Key was just rotated, refresh and retry + sdk.refresh_keys().await?; + // retry the sign + } + Err(e) => return Err(e), + } + } +} + +// Python SDK equivalent +async def sign_with_rotation_aware(agent_id: str, data: bytes) -> str: + while True: + try: + sig = await agent.sign(data) + return sig + except KeyExpiredError: + await agent.refresh_keys() + # retry +``` + +**Acceptance Criteria**: +- SDK automatically detects key rotation +- Seamless retry on key expiry +- Logging: key rotation events visible in client logs + +--- + +### Sub-task fn-102.3: Renewal Before Expiry + +**Description**: Extend agent TTL automatically before expiration (similar to token refresh). 
+ +**Deliverables**: +- Renewal scheduler: check agents expiring within N days +- Auto-renewal: extend TTL by another rotation period +- Notification: alert if auto-renewal fails (manual intervention) + +**Pseudo-code**: +```rust +pub async fn schedule_agent_renewals( + namespace: &str, + renewal_threshold: Duration, // e.g., 7 days + scheduler: &dyn Scheduler, +) -> Result<()> { + // 1. Find agents expiring within threshold + // 2. Schedule renewal job at: agent.expires_at - renewal_threshold (run immediately if that time has already passed) + // 3. On job trigger: extend TTL + emit agent.renewed event +} + +pub async fn renew_agent_before_expiry( + namespace: &str, + agent_id: &str, + new_ttl: Duration, +) -> Result { + // 1. Validate agent not already expired + // 2. Update agent.expires_at = now + new_ttl + // 3. Store in Redis + // 4. Emit agent.renewed event + // 5. Log to compliance +} +``` + +**Acceptance Criteria**: +- Agents auto-renew before expiry (no service gap) +- Renewal events visible in audit logs +- Admin notified if renewal fails + +--- + +## fn-103: Approval Workflows for Sensitive Operations + +**Goal**: High-stakes operations (revoke agent, rotate keys, change policies) require human approval. + +**Use Case**: +- CI agent provisioning is automatic (fn-100) +- But revoking an agent requires approval from 2 org admins +- Deployment policy changes require approval from the security team + +### Sub-task fn-103.1: Approval Request & Decision + +**Description**: Create, manage, approve/deny sensitive operations.
+ +**Deliverables**: +- Approval schema: operation type, requester, approvers, deadline +- API: `POST /v1/approvals/request { operation, reason, requires_approvers }` +- API: `POST /v1/approvals/{id}/approve { approver_did, decision, note }` + +**Pseudo-code**: +```rust +pub enum ApprovalOperation { + RevokeAgent { agent_id: String }, + RotateAgentKey { agent_id: String }, + ChangePolicy { policy_id: String, old: Policy, new: Policy }, + DeleteNamespace { namespace: String }, +} + +pub struct ApprovalRequest { + id: String, + namespace: String, + operation: ApprovalOperation, + requester_did: String, + required_approvers: Vec, // DIDs of required approvers + approvals: Map, // approver_did -> decision + deadline: DateTime, + status: ApprovalStatus, // pending, approved, rejected, expired +} + +pub struct Approval { + approver_did: String, + decision: ApprovalDecision, // Approved, Rejected + reason: String, + approved_at: DateTime, +} + +pub async fn request_approval( + operation: ApprovalOperation, + requester_did: &str, + approvers: Vec, + deadline: Duration, +) -> Result { + // 1. Create request + // 2. Store in Redis: approvals:{request_id} + // 3. Emit approval.requested event (sends to approvers) + // 4. Log to compliance domain +} + +pub async fn approve_operation( + request_id: &str, + approver_did: &str, + decision: ApprovalDecision, +) -> Result { + // 1. Record approval + // 2. If all required approvals received: apply operation + // 3. Emit approval.decided event +} +``` + +**Acceptance Criteria**: +- Approval rules configurable per operation type +- Multiple approvers supported +- Deadline enforced (requests expire) +- Audit trail of all approvals + +--- + +### Sub-task fn-103.2: Conditional Execution (After Approval) + +**Description**: Execute operations only after approval(s) received. 
+ +**Deliverables**: +- Approval-gated operations: revoke, rotate, policy change +- Execution: automatic or manual trigger after approved +- Rollback: undo operation if approval is later revoked + +**Pseudo-code**: +```rust +pub async fn revoke_agent_with_approval( + namespace: &str, + agent_id: &str, + requester_did: &str, +) -> Result { + // 1. Create approval request (operation: RevokeAgent) + // 2. Determine required approvers (from policy) + // 3. Return request (client must wait for approvals) +} + +pub async fn execute_approved_operation( + approval_request: &ApprovalRequest, +) -> Result { + // 1. Validate request is fully approved + // 2. Check deadline not exceeded + // 3. Execute operation (revoke, rotate, etc.) + // 4. Emit operation.executed event + // 5. Log to compliance +} + +pub async fn revoke_approval_and_undo( + approval_request: &ApprovalRequest, + approver_who_revoked: &str, +) -> Result<()> { + // 1. Mark approval as revoked + // 2. If operation already executed: undo it (restore agent, etc.) + // 3. Emit approval.revoked event +} +``` + +**Acceptance Criteria**: +- Operations block until approval received +- Automatic execution vs. manual trigger (configurable) +- Approval can be revoked with undo capability + +--- + +## fn-104: Agent Quotas & Rate Limiting + +**Goal**: Prevent resource exhaustion and abuse; fair allocation across namespaces. + +**Use Case**: +- Org limit: max 1000 agents per namespace +- Rate limit: max 100 agents provisioned/hour +- Quota enforcement: prevent over-provisioning + +### Sub-task fn-104.1: Quota Tracking & Enforcement + +**Description**: Track agent counts, enforce limits. 
+ +**Deliverables**: +- Quota schema: max agents, max provisions/hour +- Quota check: before provisioning, verify limits +- Metrics: quota usage, rejections + +**Pseudo-code**: +```rust +pub struct AgentQuota { + namespace: String, + max_agents: u64, + max_provisions_per_hour: u64, +} + +pub async fn check_quota( + namespace: &str, + quota: &AgentQuota, + agent_service: &dyn AgentService, +) -> Result { + // 1. Count current agents in namespace + // 2. Count provisions in last hour (from audit log) + // 3. Return { agents_available, provisions_available } +} + +pub async fn provision_agent_with_quota( + namespace: &str, + config: ProvisionConfig, +) -> Result { + // 1. Check quota + // 2. If exceeded: return QuotaExceededError + // 3. Otherwise: proceed with provision +} + +pub struct QuotaStatus { + agents_used: u64, + agents_available: u64, + provisions_this_hour: u64, + provisions_available: u64, +} +``` + +**Acceptance Criteria**: +- Quotas enforced at provision time +- Soft limit warnings + hard limit rejections +- Quotas configurable per namespace +- Quota usage visible via metrics + +--- + +### Sub-task fn-104.2: Rate Limiting (Leaky Bucket) + +**Description**: Leaky bucket rate limiter for agent operations. + +**Deliverables**: +- Rate limit: X operations/second per namespace +- Burst allowance: allow spikes up to Y requests +- Headers: X-RateLimit-* in API responses + +**Pseudo-code**: +```rust +pub struct RateLimiter { + capacity: f64, // max tokens + refill_rate: f64, // tokens per second + current_tokens: f64, +} + +pub async fn check_rate_limit( + namespace: &str, + limiter: &mut RateLimiter, + cost: f64, // tokens to consume +) -> Result { + // 1. Refill tokens based on elapsed time + // 2. If tokens >= cost: consume and allow + // 3. 
Otherwise: reject (too fast) +} + +pub struct RateLimitStatus { + allowed: bool, + tokens_remaining: f64, + reset_at: DateTime, +} +``` + +**Acceptance Criteria**: +- Rate limits configurable (default: 100 ops/sec) +- Burst allowance (e.g., 50 tokens) +- Metrics: rate limit hits, rejections +- Headers: X-RateLimit-{Limit,Used,Remaining,ResetAt} + +--- + +## fn-105: Multi-Org Federation & Cross-Org Delegation + +**Goal**: Organizations trust each other; agent from org A can act on behalf of org B (with permission). + +**Use Case**: +- Company A uses Company B's SaaS platform +- Company A's CI agent provisions its own agents on platform B +- Company A's agents can sign artifacts on platform B without sharing keys with B + +### Sub-task fn-105.1: Cross-Org Agent Recognition + +**Description**: Org A's agent is recognized as legitimate by org B. + +**Deliverables**: +- Trust anchor: org B trusts org A's DIDs +- Agent delegation: org A agent can act in org B context +- Verification: cross-org signatures validate + +**Pseudo-code**: +```rust +pub struct OrgTrustAnchor { + org_a_id: String, + org_b_id: String, + org_a_root_did: String, // root DID of org A + delegated_capabilities: Vec, // [sign_artifacts, publish_releases] + expires_at: DateTime, +} + +pub async fn establish_trust( + org_a: &str, + org_b: &str, + root_did: &str, + capabilities: Vec, +) -> Result { + // 1. Org B admin approves trust anchor (approval workflow) + // 2. Store in Redis: trust_anchors:{org_b}:{org_a} + // 3. Emit trust.established event +} + +pub async fn verify_cross_org_delegation( + agent_id: &str, + agent_org: &str, + target_org: &str, + required_capability: &str, + identity_resolver: &dyn IdentityResolver, +) -> Result { + // 1. Resolve agent's org and DID + // 2. Check trust anchor: agent_org → target_org exists + // 3. Verify required_capability in delegated_capabilities + // 4. 
Return true if delegated, false otherwise +} +``` + +**Acceptance Criteria**: +- Cross-org trust relationships configurable +- Delegation verified before operation +- Audit trail: cross-org operations logged + +--- + +### Sub-task fn-105.2: Shared Agent Pool (Federation Lite) + +**Description**: Multiple orgs share a pool of agents (e.g., shared CI runners). + +**Deliverables**: +- Shared namespace: agents available to multiple orgs +- Attribution: operations tied to requesting org +- Resource isolation: quotas per org in shared pool + +**Pseudo-code**: +```rust +pub struct SharedNamespace { + id: String, + participating_orgs: Vec, + agents: Vec, // shared pool + quotas: Map, // per-org limits +} + +pub async fn provision_from_shared_pool( + shared_namespace: &str, + requesting_org: &str, + config: ProvisionConfig, +) -> Result { + // 1. Check org quota in shared namespace + // 2. Tag agent with org_id (attribution) + // 3. Provision agent + // 4. Log: agent provisioned by org X in shared namespace Y +} + +pub async fn audit_shared_namespace( + shared_namespace: &str, +) -> Result> { + // 1. Query audit log: all events in shared namespace + // 2. Organize by org (attribution) + // 3. Return usage per org +} +``` + +**Acceptance Criteria**: +- Shared pool manageable via API +- Per-org quotas enforced +- Attribution clear (audit trail shows which org provisioned agent) + +--- + +## fn-106: Compliance & Audit Export (SOC2, FedRAMP) + +**Goal**: Organizations need audit logs for compliance (SOC2, FedRAMP, HIPAA); export in standard formats. + +**Use Case**: +- SOC2 auditor: "Show me all agent provisioning events for the last 90 days" +- FedRAMP: "Export audit logs in CEF (Common Event Format)" +- Compliance officer: "Generate report: who provisioned which agents, when, why" + +### Sub-task fn-106.1: Audit Log Retention & Queryability + +**Description**: Store audit logs for X years; fast queries by date range, agent, user. 
+ +**Deliverables**: +- Retention policy: configurable (default 7 years for compliance) +- Query endpoint: `GET /v1/audit?start_date=...&end_date=...&agent_id=...&event_type=...` +- Export formats: JSON, CSV, CEF + +**Pseudo-code**: +```rust +pub async fn query_audit_logs( + namespace: &str, + filter: AuditFilter, + format: ExportFormat, // JSON, CSV, CEF +) -> Result> { + // 1. Query compliance domain: audit events matching filter + // 2. Sort by timestamp + // 3. Format as requested (JSON, CSV, CEF) + // 4. Return bytes +} + +pub struct AuditFilter { + start_date: DateTime, + end_date: DateTime, + event_types: Option>, // agent.provisioned, agent.revoked, etc. + agent_ids: Option>, + user_ids: Option>, +} + +pub enum ExportFormat { + Json, + Csv, + Cef, // Common Event Format (for SIEM integration) +} + +// CEF format example: +// CEF:0|auths|auths-api|1.0|agent.provisioned|Agent Provisioned|5|agent_id=abc123 delegator_did=did:keri:E... capabilities=sign_artifacts created_at=2026-03-29T10:00:00Z +``` + +**Acceptance Criteria**: +- Query by date range, agent, event type, user +- Export in at least 2 formats (JSON, CSV) +- CEF export for SIEM integration +- Retention configurable per namespace + +--- + +### Sub-task fn-106.2: Compliance Report Generation + +**Description**: Automated reports for compliance auditors. 
+ +**Deliverables**: +- Report templates: SOC2, FedRAMP, HIPAA, PCI-DSS +- Report generation: `POST /v1/compliance/reports { template, namespace, date_range }` +- Report includes: summary, detailed events, risk assessment + +**Pseudo-code**: +```rust +pub enum ComplianceTemplate { + SOC2, + FedRAMP, + HIPAA, + PciDss, +} + +pub struct ComplianceReport { + template: ComplianceTemplate, + generated_at: DateTime, + namespace: String, + summary: ReportSummary, + findings: Vec, + audit_logs: Vec, +} + +pub struct ReportSummary { + total_agents: u64, + agents_provisioned_period: u64, + agents_revoked_period: u64, + policy_changes: u64, + unapproved_operations: u64, // red flag +} + +pub async fn generate_compliance_report( + namespace: &str, + template: ComplianceTemplate, + date_range: DateRange, +) -> Result { + // 1. Query audit logs for period + // 2. Check for policy violations (unapproved ops, quota exceeds) + // 3. Generate summary + // 4. Format as report +} +``` + +**Acceptance Criteria**: +- At least 2 compliance templates (SOC2, FedRAMP) +- Reports include summary + detailed audit trail +- Automated risk flagging (e.g., unapproved operations) + +--- + +## fn-107: Agent Analytics & Usage Observability + +**Goal**: Understand agent usage patterns; identify unused/underutilized agents; capacity planning. + +**Use Case**: +- Dashboard: "Which agents haven't been used in 30 days?" (cleanup candidates) +- Metrics: "Agent provisioning trend: 100/month → 500/month" (growth signal) +- Forecast: "At current growth, we'll hit quota in 45 days" + +### Sub-task fn-107.1: Agent Usage Metrics + +**Description**: Track which agents are actively used; expose usage trends. 
+ +**Deliverables**: +- Usage metrics: last_used, usage_count, operations_performed +- Dashboard: agent usage heatmap, trend lines +- Alerts: unused agents (>30 days), low-usage agents + +**Pseudo-code**: +```rust +pub struct AgentUsageMetrics { + agent_id: String, + provisioned_at: DateTime, + first_used_at: Option>, + last_used_at: Option>, + usage_count: u64, + operations: Map, // sign_artifacts: 42, publish_releases: 10 + days_since_last_use: u64, +} + +pub async fn compute_agent_usage( + namespace: &str, + days_back: u64, // e.g., 30 + agent_service: &dyn AgentService, +) -> Result> { + // 1. Query all agents in namespace + // 2. For each agent: query audit log for operations in last N days + // 3. Compute last_used_at, usage_count, operations + // 4. Return sorted by last_used_at (oldest first) +} + +pub async fn identify_unused_agents( + namespace: &str, + threshold_days: u64, // e.g., 30 +) -> Result> { + // 1. Compute usage metrics + // 2. Filter: days_since_last_use >= threshold + // 3. Return unused agents +} +``` + +**Acceptance Criteria**: +- Usage metrics queryable per agent, namespace +- Last-used timestamp tracked accurately +- Operations per agent visible +- Unused agents easily identifiable + +--- + +### Sub-task fn-107.2: Capacity & Growth Analytics + +**Description**: Forecast capacity; alert on quota approach; plan scaling. + +**Deliverables**: +- Forecast: project agent count 30/60/90 days out +- Alerts: "At current rate, you'll hit quota in 30 days" +- Recommendations: "Consider increasing quota or cleaning unused agents" + +**Pseudo-code**: +```rust +pub struct CapacityForecast { + namespace: String, + current_agents: u64, + quota: u64, + utilization: f64, // percentage + provisioning_rate: f64, // agents/day + forecast_30d: u64, + forecast_60d: u64, + days_to_quota: Option, // None if declining + recommendations: Vec, +} + +pub async fn forecast_capacity( + namespace: &str, + days_history: u64, // e.g., 90 +) -> Result { + // 1. 
Compute provisioning rate (agents/day) from audit log + // 2. Project forward 30, 60, 90 days + // 3. Calculate days to quota at current rate + // 4. Generate recommendations +} + +pub fn generate_recommendations( + forecast: &CapacityForecast, +) -> Vec<String> { + let mut recs = vec![]; + if forecast.days_to_quota.is_some() && forecast.days_to_quota < Some(30) { + recs.push("Consider increasing quota".into()); + } + // ... more logic + recs +} +``` + +**Acceptance Criteria**: +- Linear regression on provisioning rate (last 90 days) +- Forecast 30/60/90 days out +- Alerts when approaching quota (<30 days) +- Recommendations actionable (increase quota, clean up unused agents) + +--- + +## Cross-Cutting Considerations + +**Testing Strategy**: +- Integration tests for each epic (fn-100 through fn-107) +- Simulation: synthetic workloads (high provisioning rates, quota hits) +- Compliance validation: audit logs match expected events + +**Observability**: +- Per-epic metrics (policy evaluations, attestations signed, approvals, etc.) +- Distributed tracing: trace a provisioning request through all domain layers +- Runbooks: playbooks for common scenarios (quota exceeded, approval stuck, key rotation failure) + +**Documentation**: +- User guides: how to use each feature (policies, attestations, approvals) +- Operator guides: deployment, monitoring, troubleshooting +- API reference: all endpoints, request/response schemas +- Examples: concrete workflows (supply chain signing, policy-driven CI) + +--- + +## Summary: From fn-89 to Production + +**fn-89** provides the **foundational infrastructure** (domain architecture, transactions, observability).
+ +**fn-100–107** unlock **strategic use cases**: +- Policy-driven automation (fn-100) +- Supply chain security (fn-101) +- Operational continuity (fn-102, fn-104) +- Governance & approval (fn-103) +- Federation (fn-105) +- Compliance (fn-106) +- Operations intelligence (fn-107) + +**Market Positioning**: +- Early: auths-api is infrastructure (supply chain signing, audit trails) +- Scale: policy-driven provisioning, approval workflows, federation +- Mature: compliance automation, analytics, advanced governance + +**Timeline Estimate**: +- fn-89: 4–6 weeks (foundation) +- fn-100–103: 6–8 weeks (core features) +- fn-104–107: 4–6 weeks (optimization & intelligence) +- **Total to production-ready**: 14–20 weeks (roughly 3–5 months) if the phases run sequentially + +**Go-to-Market**: +1. **Closed beta** (fn-89 + fn-100): fintech, infra platforms +2. **Open beta** (fn-89 + fn-100–103): broader enterprise +3. **GA** (fn-89 + fn-100–107): full feature set for compliance-heavy orgs