diff --git a/.agents/skills/test-backend/SKILL.md b/.agents/skills/test-backend/SKILL.md new file mode 100644 index 0000000..2696c80 --- /dev/null +++ b/.agents/skills/test-backend/SKILL.md @@ -0,0 +1,70 @@ +--- +name: test-backend +description: Launch and test the k3d or k3s backend lifecycle (init, up, kubectl, down, purge). Use when you want to run a full integration test of a stack backend. +user_invocable: true +metadata: + author: obol-team + version: "1.0.0" + domain: testing + triggers: test backend, test k3d, test k3s, integration test, flow test, backend test + role: tester + scope: validation + output-format: report +--- + +# Test Backend Skill + +Runs a full lifecycle integration test for the obol stack backend (k3d or k3s). + +## Arguments + +The skill accepts an optional argument specifying which backend to test: + +- `k3s` - Test the k3s (bare-metal) backend only +- `k3d` - Test the k3d (Docker-based) backend only +- `all` - Test both backends sequentially (default) +- No argument defaults to `all` + +Examples: +- `/test-backend k3s` +- `/test-backend k3d` +- `/test-backend all` +- `/test-backend` (same as `all`) + +## Workflow + +### 1. Pre-flight + +- Build the obol binary: `go build -o .workspace/bin/obol ./cmd/obol` from the project root +- Verify the binary was created successfully +- Set `OBOL_DEVELOPMENT=true` and add `.workspace/bin` to PATH + +### 2. Run Test Script + +Based on the argument, run the appropriate test script(s) located alongside this skill: + +- **k3s**: Run `.agents/skills/test-backend/scripts/test-k3s.sh` +- **k3d**: Run `.agents/skills/test-backend/scripts/test-k3d.sh` +- **all**: Run k3s first, then k3d (k3s requires sudo so test it first while credentials are fresh) + +Execute the script via Bash tool from the project root directory. The scripts require: +- **k3s**: Linux, sudo access, k3s binary in `.workspace/bin/` +- **k3d**: Docker running, k3d binary in `.workspace/bin/` + +### 3. Report Results + +After each script completes, report: +- Total pass/fail counts (shown in the RESULTS line) +- Any specific test failures with their names +- Overall verdict: all green or needs attention + +If a test script fails (non-zero exit), read the output to identify which test(s) failed and summarize. + +## Important Notes + +- The k3s backend requires **sudo access** - the user may need to enter their password +- The k3d backend requires **Docker to be running** +- Each test script performs its own cleanup (purge) before and after +- Tests are sequential and ordered: init -> up -> verify -> down -> restart -> purge +- Typical runtime: ~2-4 minutes per backend +- If the environment has issues (Docker not starting, k3s not installing), report the problem clearly rather than retrying endlessly diff --git a/.agents/skills/test-backend/scripts/test-k3d.sh b/.agents/skills/test-backend/scripts/test-k3d.sh new file mode 100755 index 0000000..9657254 --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3d.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3d Backend Integration Test +# Requires: Docker running, k3d binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." 
&& pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! "$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3d_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +# Pre-flight: verify Docker is running +if ! docker info >/dev/null 2>&1; then + log "ERROR: Docker is not running. Start Docker and try again." + exit 1 +fi + +log "=========================================" +log "K3d Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init (default = k3d) --- +log "" +log "--- TEST 1: stack init (default = k3d) ---" +check "stack init" $OBOL stack init +check "k3d.yaml exists" test -f .workspace/config/k3d.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3d" test "$BACKEND" = "k3d" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init + +# --- TEST 3: stack init --force --- +log "" +log "--- TEST 3: stack init --force ---" +$OBOL stack init --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml + +# Wait for nodes to be ready (k3d can take a moment) +log " Waiting for nodes to be ready..." 
+DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d is functional (nodes ready)" k3d_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack down --- +log "" +log "--- TEST 6: stack down ---" +check "stack down" $OBOL stack down +check "config preserved after down" test -f .workspace/config/.stack-id + +# Verify cluster stopped (kubectl should fail) +sleep 2 +check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers + +# --- TEST 7: stack down already stopped --- +log "" +log "--- TEST 7: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 8: stack up (restart after down) --- +log "" +log "--- TEST 8: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up + +# Wait for nodes to be ready after restart +log " Waiting for nodes to be ready..." +DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d functional after restart" k3d_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 9: stack purge --- +log "" +log "--- TEST 9: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id + +# --- TEST 10: full cycle + purge --force --- +log "" +log "--- TEST 10: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3d RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/.agents/skills/test-backend/scripts/test-k3s.sh b/.agents/skills/test-backend/scripts/test-k3s.sh new file mode 100755 index 0000000..03e3bca --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3s.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3s Backend Integration Test +# Requires: Linux, sudo access, k3s binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! 
"$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3s_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +log "=========================================" +log "K3s Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init --backend k3s --- +log "" +log "--- TEST 1: stack init --backend k3s ---" +check "stack init --backend k3s" $OBOL stack init --backend k3s +check "k3s-config.yaml exists" test -f .workspace/config/k3s-config.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3s" test "$BACKEND" = "k3s" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init --backend k3s + +# --- TEST 3: stack init --force (should preserve stack ID) --- +log "" +log "--- TEST 3: stack init --force (should preserve stack ID) ---" +$OBOL stack init --backend k3s --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "PID file exists" test -f .workspace/config/.k3s.pid +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml +check "k3s is functional (nodes ready)" k3s_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack up idempotent (already running) --- +log "" +log "--- TEST 6: stack up idempotent ---" +OLD_PID=$(cat .workspace/config/.k3s.pid) +check "stack up while running" $OBOL stack up +NEW_PID=$(cat .workspace/config/.k3s.pid) +check "PID unchanged (idempotent) ($OLD_PID = $NEW_PID)" test "$OLD_PID" = "$NEW_PID" + +# --- TEST 7: stack down --- +log "" +log "--- TEST 7: stack down ---" +check "stack down" $OBOL stack down +check "PID file cleaned up" test ! -f .workspace/config/.k3s.pid +check "config preserved after down" test -f .workspace/config/.stack-id +log " Waiting for API server to become unreachable..." +API_DOWN=false +for i in $(seq 1 15); do + if ! 
$OBOL kubectl get nodes --no-headers 2>/dev/null; then + API_DOWN=true + break + fi + sleep 2 +done +check "kubectl unreachable after down" test "$API_DOWN" = "true" + +# --- TEST 8: stack down again (already stopped) --- +log "" +log "--- TEST 8: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 9: stack up (restart after down) --- +log "" +log "--- TEST 9: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up +check "PID file exists after restart" test -f .workspace/config/.k3s.pid +check "k3s functional after restart" k3s_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 10: stack purge (without --force) --- +log "" +log "--- TEST 10: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id +check "k3s pid file removed" test ! -f .workspace/config/.k3s.pid + +# --- TEST 11: full cycle + purge --force --- +log "" +log "--- TEST 11: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init --backend k3s +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3s RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/cmd/obol/bootstrap.go b/cmd/obol/bootstrap.go index f2d3eb2..60683d3 100644 --- a/cmd/obol/bootstrap.go +++ b/cmd/obol/bootstrap.go @@ -27,7 +27,7 @@ func bootstrapCommand(cfg *config.Config) *cli.Command { // Step 1: Initialize stack fmt.Println("Initializing stack configuration...") - if err := stack.Init(cfg, false); err != nil { + if err := stack.Init(cfg, false, ""); err != nil { // Check if it's an "already exists" error - that's okay if !strings.Contains(err.Error(), "already exists") { return fmt.Errorf("bootstrap init failed: %w", err) diff --git a/cmd/obol/main.go b/cmd/obol/main.go index 69f92c5..871eb07 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -102,9 +102,14 @@ GLOBAL OPTIONS: Aliases: []string{"f"}, Usage: "Force overwrite existing configuration", }, + &cli.StringFlag{ + Name: "backend", + Usage: "Cluster backend: k3d (Docker-based) or k3s (bare-metal)", + EnvVars: []string{"OBOL_BACKEND"}, + }, }, Action: func(c *cli.Context) error { - return stack.Init(cfg, c.Bool("force")) + return stack.Init(cfg, c.Bool("force"), c.String("backend")) }, }, { diff --git a/internal/embed/embed.go b/internal/embed/embed.go index 2c189eb..7a0d723 100644 --- a/internal/embed/embed.go +++ b/internal/embed/embed.go @@ -15,6 +15,9 @@ import ( //go:embed k3d-config.yaml var K3dConfig string +//go:embed k3s-config.yaml +var K3sConfig string + //go:embed all:infrastructure var infrastructureFS embed.FS diff --git a/internal/embed/infrastructure/base/templates/local-path.yaml b/internal/embed/infrastructure/base/templates/local-path.yaml index 77713e9..2547c50 100644 --- a/internal/embed/infrastructure/base/templates/local-path.yaml +++ b/internal/embed/infrastructure/base/templates/local-path.yaml @@ -11,7 +11,7 @@ data: "nodePathMap":[ { "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", - "paths":["/data"] + "paths":["{{ .Values.dataDir }}"] } ] } diff --git a/internal/embed/infrastructure/helmfile.yaml 
b/internal/embed/infrastructure/helmfile.yaml.gotmpl similarity index 93% rename from internal/embed/infrastructure/helmfile.yaml rename to internal/embed/infrastructure/helmfile.yaml.gotmpl index e3ce9a3..1fd2e7e 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml.gotmpl @@ -1,7 +1,10 @@ # Helmfile for Obol Stack default infrastructure # Orchestrates core infrastructure components deployed with every stack # Uses Traefik with Gateway API for routing (replaces nginx-ingress) -{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{ $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{- $dataDir := env "STACK_DATA_DIR" | default "/data" -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} +{{- $gatewayApiVersion := "v1.4.1" }} repositories: - name: traefik @@ -17,19 +20,14 @@ repositories: - name: stakater url: https://stakater.github.io/stakater-charts -# Single source of truth: change this to switch networks -values: - - network: mainnet - - gatewayApiVersion: v1.4.1 - releases: # Local storage provisioner (raw manifests wrapped as chart) - name: base namespace: kube-system chart: ./base values: - - dataDir: /data - - network: "{{ .Values.network }}" + - dataDir: '{{ $dataDir }}' + - network: "{{ $network }}" # Monitoring stack (Prometheus operator + Prometheus) - name: monitoring @@ -37,6 +35,7 @@ releases: createNamespace: true chart: prometheus-community/kube-prometheus-stack version: 79.5.0 + timeout: 600 values: - ./values/monitoring.yaml.gotmpl @@ -54,7 +53,7 @@ releases: args: - apply - -f - - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ .Values.gatewayApiVersion }}/standard-install.yaml + - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ $gatewayApiVersion }}/standard-install.yaml # Traefik ingress controller with Gateway API support - name: traefik diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index b7c07f8..78274e9 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -1,4 +1,4 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} {{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} {{- $chainId := 1 -}} {{/* Default: mainnet */}} {{- if eq $network "hoodi" -}} @@ -87,12 +87,18 @@ config: |- allowCredentials: true maxAge: 3600 -# Secret env variables -secretEnv: - OBOL_OAUTH_TOKEN: - secretKeyRef: - name: obol-oauth-token - key: token +# Secret env variables (chart expects flat string map, e.g. 
KEY: "value") +# The OBOL_OAUTH_TOKEN is injected from a Kubernetes secret via extraEnv instead +secretEnv: {} + +# Inject the OAuth token from the Kubernetes secret +extraEnv: + - name: OBOL_OAUTH_TOKEN + valueFrom: + secretKeyRef: + name: obol-oauth-token + key: token + optional: true # Extra args for the erpc container extraArgs: [] diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl index d7a0dc1..a7a6095 100644 --- a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -20,6 +20,10 @@ prometheus: cpu: 500m memory: 1Gi +prometheusOperator: + admissionWebhooks: + enabled: false # Disable webhook pre-install hooks (avoids timeout on fresh k3s) + grafana: enabled: false # Enable when we want UI access diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index caff157..66f068b 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -1,4 +1,4 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} {{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} replicaCount: 1 @@ -11,17 +11,17 @@ image: - name: NEXT_PUBLIC_HELIOS_CLIENT_URL value: "http://helios-{{ $network }}.helios.svc.cluster.local:8545" - name: NEXT_PUBLIC_ERPC_URL - value: "{{ printf \"https://%s/rpc\" $publicDomain }}" + value: "https://{{ $publicDomain }}/rpc" - name: NEXT_PUBLIC_AZTEC_SEQUENCER_URL value: "http://l2-sequencer-node-mainnet-node.aztec.svc.cluster.local:8080" - name: BETTER_AUTH_SECRET - value: "{{ env \"BETTER_AUTH_SECRET\" }}" + value: '{{ env "BETTER_AUTH_SECRET" }}' - name: BETTER_AUTH_URL - value: "{{ printf \"https://%s\" $publicDomain }}" + value: "https://{{ $publicDomain }}" - name: OBOL_GOOGLE_CLIENT_ID - value: "{{ env \"OBOL_GOOGLE_CLIENT_ID\" }}" + value: '{{ env "OBOL_GOOGLE_CLIENT_ID" }}' - name: OBOL_GOOGLE_CLIENT_SECRET - value: "{{ env \"OBOL_GOOGLE_CLIENT_SECRET\" }}" + value: '{{ env "OBOL_GOOGLE_CLIENT_SECRET" }}' - name: OBOL_AUTH_DB_PATH value: "/data/auth.sqlite" diff --git a/internal/embed/k3s-config.yaml b/internal/embed/k3s-config.yaml new file mode 100644 index 0000000..1c75e5a --- /dev/null +++ b/internal/embed/k3s-config.yaml @@ -0,0 +1,24 @@ +# k3s server configuration for Obol Stack +# Generated by: obol stack init --backend k3s + +# Disable components we manage ourselves (matching k3d config) +disable: + - traefik + - local-storage + +# Data directory for k3s internal state +data-dir: {{DATA_DIR}}/k3s + +# Bind to all interfaces for local access +bind-address: 0.0.0.0 +https-listen-port: 6443 + +# TLS SANs for local access +tls-san: + - "127.0.0.1" + - "localhost" + - "obol.stack" + +# Node labels +node-label: + - "obol.cluster-id={{STACK_ID}}" diff --git a/internal/stack/backend.go b/internal/stack/backend.go new file mode 100644 index 0000000..f26014d --- /dev/null +++ b/internal/stack/backend.go @@ -0,0 +1,77 @@ +package stack + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +const ( + // BackendK3d is the k3d backend (Docker-based, default) + BackendK3d = "k3d" + // BackendK3s is the standalone k3s backend (bare-metal) + BackendK3s = "k3s" + + stackBackendFile = ".stack-backend" +) + +// Backend abstracts the 
Kubernetes cluster runtime (k3d, k3s)
+type Backend interface {
+	// Name returns the backend identifier (e.g., "k3d", "k3s")
+	Name() string
+
+	// Init generates backend-specific cluster configuration files
+	Init(cfg *config.Config, stackID string) error
+
+	// Up creates or starts the cluster and returns kubeconfig contents
+	Up(cfg *config.Config, stackID string) (kubeconfigData []byte, err error)
+
+	// IsRunning returns true if the cluster is currently running
+	IsRunning(cfg *config.Config, stackID string) (bool, error)
+
+	// Down stops the cluster without destroying configuration or data
+	Down(cfg *config.Config, stackID string) error
+
+	// Destroy removes the cluster entirely (containers/processes)
+	Destroy(cfg *config.Config, stackID string) error
+
+	// DataDir returns the storage path for the local-path-provisioner.
+	// For k3d this is "/data" (Docker volume mount point).
+	// For k3s this is the absolute host path to cfg.DataDir.
+	DataDir(cfg *config.Config) string
+
+	// Prerequisites checks that required software/permissions are available
+	Prerequisites(cfg *config.Config) error
+}
+
+// NewBackend creates a Backend by name
+func NewBackend(name string) (Backend, error) {
+	switch name {
+	case BackendK3d:
+		return &K3dBackend{}, nil
+	case BackendK3s:
+		return &K3sBackend{}, nil
+	default:
+		return nil, fmt.Errorf("unknown backend: %s (supported: k3d, k3s)", name)
+	}
+}
+
+// LoadBackend reads the persisted backend choice from the .stack-backend file.
+// Falls back to k3d if the file is missing or unreadable (backward compatibility).
+func LoadBackend(cfg *config.Config) (Backend, error) {
+	path := filepath.Join(cfg.ConfigDir, stackBackendFile)
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return &K3dBackend{}, nil
+	}
+	return NewBackend(strings.TrimSpace(string(data)))
+}
+
+// SaveBackend persists the backend choice
+func SaveBackend(cfg *config.Config, name string) error {
+	path := filepath.Join(cfg.ConfigDir, stackBackendFile)
+	return os.WriteFile(path, []byte(name), 0644)
+}
diff --git a/internal/stack/backend_k3d.go b/internal/stack/backend_k3d.go
new file mode 100644
index 0000000..8fdd3de
--- /dev/null
+++ b/internal/stack/backend_k3d.go
@@ -0,0 +1,164 @@
+package stack
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+
+	"github.com/ObolNetwork/obol-stack/internal/config"
+	"github.com/ObolNetwork/obol-stack/internal/embed"
+)
+
+const (
+	k3dConfigFile = "k3d.yaml"
+)
+
+// K3dBackend manages clusters via k3d (k3s inside Docker containers)
+type K3dBackend struct{}
+
+func (b *K3dBackend) Name() string { return BackendK3d }
+
+func (b *K3dBackend) Prerequisites(cfg *config.Config) error {
+	// Check Docker is running
+	cmd := exec.Command("docker", "info")
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	if err := cmd.Run(); err != nil {
+		return fmt.Errorf("Docker is not running. 
k3d backend requires Docker.\nStart Docker and try again") + } + + // Check k3d binary exists + k3dPath := filepath.Join(cfg.BinDir, "k3d") + if _, err := os.Stat(k3dPath); os.IsNotExist(err) { + return fmt.Errorf("k3d not found at %s\nRun obolup.sh to install dependencies", k3dPath) + } + return nil +} + +func (b *K3dBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + absConfigDir, err := filepath.Abs(cfg.ConfigDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for config directory: %w", err) + } + + // Template k3d config with actual values + k3dConfig := embed.K3dConfig + k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID) + k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir) + k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir) + + k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) + if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3d config: %w", err) + } + + fmt.Printf("K3d config saved to: %s\n", k3dConfigPath) + return nil +} + +func (b *K3dBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers") + output, err := listCmd.Output() + if err != nil { + return false, fmt.Errorf("k3d list command failed: %w", err) + } + return strings.Contains(string(output), stackName), nil +} + +func (b *K3dBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) + + running, err := b.IsRunning(cfg, stackID) + if err != nil { + return nil, err + } + + if running { + fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID) + startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName) + startCmd.Stdout = os.Stdout + startCmd.Stderr = os.Stderr + if err := startCmd.Run(); err != nil { + return nil, fmt.Errorf("failed to start existing cluster: %w", err) + } + } else { + // Create data directory if it doesn't exist + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + fmt.Println("Creating k3d cluster...") + createCmd := exec.Command( + filepath.Join(cfg.BinDir, "k3d"), + "cluster", "create", stackName, + "--config", k3dConfigPath, + "--kubeconfig-update-default=false", + ) + createCmd.Stdout = os.Stdout + createCmd.Stderr = os.Stderr + if err := createCmd.Run(); err != nil { + return nil, fmt.Errorf("failed to create cluster: %w", err) + } + } + + // Export kubeconfig + kubeconfigCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName) + kubeconfigData, err := kubeconfigCmd.Output() + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + + return kubeconfigData, nil +} + +func (b *K3dBackend) Down(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID) 
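+	// "k3d cluster stop" halts the cluster's containers but keeps them and
+	// their volumes around, so a later "stack up" can restart the same
+	// cluster; the delete below is only a fallback when the graceful stop fails.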
+ + stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + return fmt.Errorf("failed to stop cluster: %w", err) + } + } + + return nil +} + +func (b *K3dBackend) Destroy(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Deleting cluster containers: %s\n", stackName) + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) + } + + return nil +} + +func (b *K3dBackend) DataDir(cfg *config.Config) string { + return "/data" +} diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go new file mode 100644 index 0000000..3325b13 --- /dev/null +++ b/internal/stack/backend_k3s.go @@ -0,0 +1,330 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3sConfigFile = "k3s-config.yaml" + k3sPidFile = ".k3s.pid" + k3sLogFile = "k3s.log" +) + +// K3sBackend manages a standalone k3s cluster (bare-metal) +type K3sBackend struct{} + +func (b *K3sBackend) Name() string { return BackendK3s } + +func (b *K3sBackend) Prerequisites(cfg *config.Config) error { + if runtime.GOOS != "linux" { + return fmt.Errorf("k3s backend is only supported on Linux") + } + + // Check sudo access (allow interactive password prompt) + cmd := exec.Command("sudo", "-v") + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("k3s backend requires root/sudo access") + } + + // Check k3s binary exists + k3sPath := filepath.Join(cfg.BinDir, "k3s") + if _, err := os.Stat(k3sPath); os.IsNotExist(err) { + return fmt.Errorf("k3s not found at %s\nRun obolup.sh to install dependencies", k3sPath) + } + + return nil +} + +func (b *K3sBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + // Template k3s config with actual values + k3sConfig := embed.K3sConfig + k3sConfig = strings.ReplaceAll(k3sConfig, "{{STACK_ID}}", stackID) + k3sConfig = strings.ReplaceAll(k3sConfig, "{{DATA_DIR}}", absDataDir) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if err := os.WriteFile(k3sConfigPath, []byte(k3sConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3s config: %w", err) + } + + fmt.Printf("K3s config saved to: %s\n", k3sConfigPath) + return nil +} + +func (b *K3sBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + pid, err := b.readPid(cfg) + if err != nil { + return false, nil + } + + return b.isProcessAlive(pid), nil +} + +func (b *K3sBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + running, _ := b.IsRunning(cfg, stackID) + if running { + fmt.Println("k3s is already 
running") + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + data, err := os.ReadFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("k3s is running but kubeconfig not found: %w", err) + } + return data, nil + } + + // Clean up stale PID file if it exists (QA R6) + b.cleanStalePid(cfg) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if _, err := os.Stat(k3sConfigPath); os.IsNotExist(err) { + return nil, fmt.Errorf("k3s config not found at %s\nRun 'obol stack init --backend k3s' first", k3sConfigPath) + } + + // Create data directory + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + k3sBinary := filepath.Join(cfg.BinDir, "k3s") + logPath := filepath.Join(cfg.ConfigDir, k3sLogFile) + + // Remove stale kubeconfig so we wait for k3s to write a fresh one + os.Remove(kubeconfigPath) + + // Open log file for k3s output + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return nil, fmt.Errorf("failed to create k3s log file: %w", err) + } + + fmt.Println("Starting k3s server...") + + // Start k3s server as background process via sudo + cmd := exec.Command("sudo", + k3sBinary, "server", + "--config", k3sConfigPath, + "--write-kubeconfig", kubeconfigPath, + "--write-kubeconfig-mode", "0600", + ) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to start k3s: %w", err) + } + + // Save PID before releasing the process handle + pid := cmd.Process.Pid + + // Write PID file + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(pid)), 0600); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to write k3s PID file: %w", err) + } + + // Detach the process + cmd.Process.Release() + logFile.Close() + + fmt.Printf("k3s started (pid: %d)\n", pid) + fmt.Printf("Logs: %s\n", logPath) + + // Wait for kubeconfig to be written by k3s + fmt.Println("Waiting for kubeconfig...") + deadline := time.Now().Add(2 * time.Minute) + for time.Now().Before(deadline) { + if info, err := os.Stat(kubeconfigPath); err == nil && info.Size() > 0 { + // Fix ownership: k3s writes kubeconfig as root via sudo + exec.Command("sudo", "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), kubeconfigPath).Run() + + data, err := os.ReadFile(kubeconfigPath) + if err == nil && len(data) > 0 { + fmt.Println("Kubeconfig ready, waiting for API server...") + + // Wait for the API server to actually respond + apiDeadline := time.Now().Add(90 * time.Second) + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + for time.Now().Before(apiDeadline) { + probe := exec.Command(kubectlPath, "--kubeconfig", kubeconfigPath, + "get", "nodes", "--no-headers") + if out, err := probe.Output(); err == nil && len(out) > 0 { + fmt.Println("API server ready") + return data, nil + } + time.Sleep(3 * time.Second) + } + + // Return kubeconfig even if API isn't fully ready yet + fmt.Println("Warning: API server not fully ready, proceeding anyway") + return data, nil + } + } + time.Sleep(2 * time.Second) + } + + return nil, fmt.Errorf("k3s did not write kubeconfig within timeout\nCheck logs: %s", logPath) +} + +func (b 
*K3sBackend) Down(cfg *config.Config, stackID string) error { + pid, err := b.readPid(cfg) + if err != nil { + fmt.Println("k3s PID file not found, may not be running") + return nil + } + + if !b.isProcessAlive(pid) { + fmt.Println("k3s process not running, cleaning up PID file") + b.removePidFile(cfg) + return nil + } + + fmt.Printf("Stopping k3s (pid: %d)...\n", pid) + + // Send SIGTERM to the sudo/k3s process only (not the process group). + // Using negative PID (process group kill) is unsafe here because the saved PID + // is the sudo wrapper, whose process group can include unrelated system processes + // like systemd-logind — killing those crashes the desktop session. + // sudo forwards SIGTERM to k3s, which handles its own child process cleanup. + pidStr := strconv.Itoa(pid) + stopCmd := exec.Command("sudo", "kill", "-TERM", pidStr) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Printf("SIGTERM failed, sending SIGKILL: %v\n", err) + exec.Command("sudo", "kill", "-9", pidStr).Run() + } + + // Wait for process to exit (up to 30 seconds) + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + if !b.isProcessAlive(pid) { + break + } + time.Sleep(1 * time.Second) + } + + // Clean up orphaned k3s child processes (containerd-shim, etc.) + // Use k3s-killall.sh if available, otherwise kill containerd shims directly. + killallPath := "/usr/local/bin/k3s-killall.sh" + if _, err := os.Stat(killallPath); err == nil { + fmt.Println("Running k3s cleanup...") + cleanCmd := exec.Command("sudo", killallPath) + cleanCmd.Stdout = os.Stdout + cleanCmd.Stderr = os.Stderr + cleanCmd.Run() + } else { + // k3s-killall.sh not installed (binary-only install via obolup). + // Kill orphaned containerd-shim processes that use the k3s socket. 
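+		// pkill -f matches against the full command line, so the pattern
+		// should only hit shims whose arguments mention k3s, not shims
+		// belonging to a separate Docker/containerd install on the host.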
+ fmt.Println("Cleaning up k3s child processes...") + exec.Command("sudo", "pkill", "-TERM", "-f", "containerd-shim.*k3s").Run() + time.Sleep(2 * time.Second) + // Force-kill any that survived SIGTERM + exec.Command("sudo", "pkill", "-KILL", "-f", "containerd-shim.*k3s").Run() + } + + b.removePidFile(cfg) + fmt.Println("k3s stopped") + return nil +} + +func (b *K3sBackend) Destroy(cfg *config.Config, stackID string) error { + // Stop if running + b.Down(cfg, stackID) + + // Clean up k3s state directories (default + custom data-dir) + absDataDir, _ := filepath.Abs(cfg.DataDir) + cleanDirs := []string{ + "/var/lib/rancher/k3s", + "/etc/rancher/k3s", + filepath.Join(absDataDir, "k3s"), + } + for _, dir := range cleanDirs { + if _, err := os.Stat(dir); err == nil { + fmt.Printf("Cleaning up: %s\n", dir) + exec.Command("sudo", "rm", "-rf", dir).Run() + } + } + + // Run uninstall script if available + uninstallPath := "/usr/local/bin/k3s-uninstall.sh" + if _, err := os.Stat(uninstallPath); err == nil { + fmt.Println("Running k3s uninstall...") + uninstallCmd := exec.Command("sudo", uninstallPath) + uninstallCmd.Stdout = os.Stdout + uninstallCmd.Stderr = os.Stderr + uninstallCmd.Run() + } + + return nil +} + +func (b *K3sBackend) DataDir(cfg *config.Config) string { + absDataDir, _ := filepath.Abs(cfg.DataDir) + return absDataDir +} + +// readPid reads the k3s PID from the PID file +func (b *K3sBackend) readPid(cfg *config.Config) (int, error) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + data, err := os.ReadFile(pidPath) + if err != nil { + return 0, err + } + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil { + return 0, fmt.Errorf("invalid PID in %s: %w", pidPath, err) + } + if pid <= 0 { + return 0, fmt.Errorf("invalid PID in %s: %d", pidPath, pid) + } + return pid, nil +} + +// cleanStalePid removes the PID file if the process is no longer running +func (b *K3sBackend) cleanStalePid(cfg *config.Config) { + pid, err := b.readPid(cfg) + if err != nil { + return + } + if !b.isProcessAlive(pid) { + fmt.Printf("Cleaning up stale PID file (pid %d no longer running)\n", pid) + b.removePidFile(cfg) + } +} + +// isProcessAlive checks if a root-owned process is still running. +// Uses sudo kill -0 since the k3s process runs as root and direct +// signal(0) from an unprivileged user returns EPERM. 
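+// kill -0 delivers no signal: a zero exit status simply means the signal
+// could have been sent, i.e. the process still exists.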
+func (b *K3sBackend) isProcessAlive(pid int) bool { + return exec.Command("sudo", "kill", "-0", strconv.Itoa(pid)).Run() == nil +} + +// removePidFile removes the k3s PID file +func (b *K3sBackend) removePidFile(cfg *config.Config) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + os.Remove(pidPath) +} diff --git a/internal/stack/backend_k3s_test.go b/internal/stack/backend_k3s_test.go new file mode 100644 index 0000000..e7a09ba --- /dev/null +++ b/internal/stack/backend_k3s_test.go @@ -0,0 +1,97 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +func TestK3sReadPid(t *testing.T) { + tests := []struct { + name string + content string + wantPid int + wantErr bool + errContains string + }{ + {name: "valid pid", content: "12345", wantPid: 12345}, + {name: "with trailing newline", content: "12345\n", wantPid: 12345}, + {name: "with whitespace", content: " 12345 ", wantPid: 12345}, + {name: "pid 1", content: "1", wantPid: 1}, + {name: "large pid", content: "4194304", wantPid: 4194304}, + {name: "not a number", content: "not-a-number", wantErr: true, errContains: "invalid PID"}, + {name: "empty content", content: "", wantErr: true, errContains: "invalid PID"}, + {name: "float", content: "123.45", wantErr: true, errContains: "invalid PID"}, + {name: "negative", content: "-1", wantErr: true, errContains: "invalid PID"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(tt.content), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + pid, err := b.readPid(cfg) + if tt.wantErr { + if err == nil { + t.Fatalf("readPid() = %d, nil error; want error containing %q", pid, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("readPid() error = %q, want containing %q", err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("readPid() unexpected error: %v", err) + } + if pid != tt.wantPid { + t.Errorf("readPid() = %d, want %d", pid, tt.wantPid) + } + }) + } + + t.Run("missing file", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + _, err := b.readPid(cfg) + if err == nil { + t.Fatal("readPid() with no file should return error") + } + }) +} + +func TestK3sRemovePidFile(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte("12345"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + b.removePidFile(cfg) + + if _, err := os.Stat(pidPath); !os.IsNotExist(err) { + t.Error("PID file should have been removed") + } +} + +func TestK3sRemovePidFileNoop(t *testing.T) { + // Removing a non-existent PID file should not panic or error + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + b.removePidFile(cfg) // should not panic +} diff --git a/internal/stack/backend_test.go b/internal/stack/backend_test.go new file mode 100644 index 0000000..e59836c --- /dev/null +++ b/internal/stack/backend_test.go @@ -0,0 +1,321 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +// Compile-time interface compliance checks +var ( + _ 
Backend = (*K3dBackend)(nil) + _ Backend = (*K3sBackend)(nil) +) + +func TestNewBackend(t *testing.T) { + tests := []struct { + name string + input string + wantName string + wantErr bool + errContains string + }{ + {name: "k3d backend", input: "k3d", wantName: "k3d"}, + {name: "k3s backend", input: "k3s", wantName: "k3s"}, + {name: "unknown backend", input: "docker", wantErr: true, errContains: "unknown backend"}, + {name: "empty string", input: "", wantErr: true, errContains: "unknown backend"}, + {name: "case sensitive", input: "K3D", wantErr: true, errContains: "unknown backend"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + backend, err := NewBackend(tt.input) + if tt.wantErr { + if err == nil { + t.Fatalf("NewBackend(%q) = nil error, want error containing %q", tt.input, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("NewBackend(%q) error = %q, want containing %q", tt.input, err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("NewBackend(%q) unexpected error: %v", tt.input, err) + } + if backend.Name() != tt.wantName { + t.Errorf("NewBackend(%q).Name() = %q, want %q", tt.input, backend.Name(), tt.wantName) + } + }) + } +} + +func TestK3dBackendName(t *testing.T) { + b := &K3dBackend{} + if got := b.Name(); got != BackendK3d { + t.Errorf("K3dBackend.Name() = %q, want %q", got, BackendK3d) + } +} + +func TestK3sBackendName(t *testing.T) { + b := &K3sBackend{} + if got := b.Name(); got != BackendK3s { + t.Errorf("K3sBackend.Name() = %q, want %q", got, BackendK3s) + } +} + +func TestK3dBackendDataDir(t *testing.T) { + // k3d DataDir must always return "/data" regardless of cfg.DataDir, + // because k3d mounts the host data dir to /data inside the container. + tests := []struct { + name string + dataDir string + }{ + {name: "absolute path", dataDir: "/home/user/.local/share/obol"}, + {name: "relative path", dataDir: ".workspace/data"}, + {name: "empty string", dataDir: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := &K3dBackend{} + cfg := &config.Config{DataDir: tt.dataDir} + if got := b.DataDir(cfg); got != "/data" { + t.Errorf("K3dBackend.DataDir() = %q, want %q (must always be /data for Docker mount)", got, "/data") + } + }) + } +} + +func TestK3sBackendDataDir(t *testing.T) { + // k3s DataDir must return an absolute version of cfg.DataDir, + // because k3s runs directly on the host. 
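+	// (DataDir uses filepath.Abs, which resolves relative paths against the
+	// current working directory; the second subtest exercises exactly that.)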
+ b := &K3sBackend{} + + t.Run("absolute path passthrough", func(t *testing.T) { + cfg := &config.Config{DataDir: "/home/user/.local/share/obol"} + got := b.DataDir(cfg) + if got != "/home/user/.local/share/obol" { + t.Errorf("K3sBackend.DataDir() = %q, want %q", got, "/home/user/.local/share/obol") + } + }) + + t.Run("relative path resolved to absolute", func(t *testing.T) { + cfg := &config.Config{DataDir: "relative/path"} + got := b.DataDir(cfg) + if !filepath.IsAbs(got) { + t.Errorf("K3sBackend.DataDir() = %q, want absolute path", got) + } + if !strings.HasSuffix(got, "relative/path") { + t.Errorf("K3sBackend.DataDir() = %q, want suffix %q", got, "relative/path") + } + }) +} + +func TestSaveAndLoadBackend(t *testing.T) { + tests := []struct { + name string + backend string + wantName string + }{ + {name: "save k3s load k3s", backend: "k3s", wantName: "k3s"}, + {name: "save k3d load k3d", backend: "k3d", wantName: "k3d"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + if err := SaveBackend(cfg, tt.backend); err != nil { + t.Fatalf("SaveBackend() error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != tt.wantName { + t.Errorf("LoadBackend().Name() = %q, want %q", backend.Name(), tt.wantName) + } + }) + } +} + +func TestLoadBackendFallsBackToK3d(t *testing.T) { + // When no .stack-backend file exists, LoadBackend must return k3d + // for backward compatibility with existing stacks. + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3d { + t.Errorf("LoadBackend() with no file = %q, want %q (backward compat)", backend.Name(), BackendK3d) + } +} + +func TestLoadBackendWithWhitespace(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + // Write file with trailing newline and whitespace + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("k3s\n "), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3s { + t.Errorf("LoadBackend() = %q, want %q", backend.Name(), BackendK3s) + } +} + +func TestLoadBackendInvalidName(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("docker-swarm"), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + _, err := LoadBackend(cfg) + if err == nil { + t.Fatal("LoadBackend() with invalid backend name should return error") + } + if !strings.Contains(err.Error(), "unknown backend") { + t.Errorf("LoadBackend() error = %q, want containing %q", err.Error(), "unknown backend") + } +} + +func TestK3dBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3dBackend{} + if err := b.Init(cfg, "test-stack"); err != nil { + t.Fatalf("K3dBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3dConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // 
Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + if strings.Contains(content, "{{CONFIG_DIR}}") { + t.Error("Config still contains {{CONFIG_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "test-stack") { + t.Error("Config does not contain stack ID 'test-stack'") + } + + // Verify paths are absolute + if !strings.Contains(content, tmpDir) { + t.Errorf("Config does not contain absolute data dir path %q", tmpDir) + } +} + +func TestK3sBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3sBackend{} + if err := b.Init(cfg, "my-cluster"); err != nil { + t.Fatalf("K3sBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3sConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "my-cluster") { + t.Error("Config does not contain stack ID 'my-cluster'") + } + + // Verify data-dir uses absolute path + absDataDir, _ := filepath.Abs(filepath.Join(tmpDir, "data")) + expectedDataDir := absDataDir + "/k3s" + if !strings.Contains(content, expectedDataDir) { + t.Errorf("Config does not contain absolute data-dir %q", expectedDataDir) + } +} + +func TestGetStackID(t *testing.T) { + tests := []struct { + name string + content string + want string + }{ + {name: "simple id", content: "happy-panda", want: "happy-panda"}, + {name: "with trailing newline", content: "happy-panda\n", want: "happy-panda"}, + {name: "with whitespace", content: " happy-panda \n", want: "happy-panda"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackIDFile) + if err := os.WriteFile(path, []byte(tt.content), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + got := getStackID(cfg) + if got != tt.want { + t.Errorf("getStackID() = %q, want %q", got, tt.want) + } + }) + } + + t.Run("missing file returns empty", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + got := getStackID(cfg) + if got != "" { + t.Errorf("getStackID() with no file = %q, want empty string", got) + } + }) +} diff --git a/internal/stack/integration_test.go b/internal/stack/integration_test.go new file mode 100644 index 0000000..66088bc --- /dev/null +++ b/internal/stack/integration_test.go @@ -0,0 +1,255 @@ +//go:build integration + +package stack_test + +import ( + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" +) + +// Integration tests for the k3s backend user flows. +// Requires: sudo access, k3s binary, OBOL_DEVELOPMENT=true. 
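+// The tests shell out to the real obol binary end-to-end, which is why they
+// sit behind the "integration" build tag rather than running with plain go test.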
+// +// Run with: +// go test -tags integration -timeout 15m -v ./internal/stack/ + +func TestK3sUserFlows(t *testing.T) { + if os.Getenv("OBOL_DEVELOPMENT") != "true" { + t.Skip("OBOL_DEVELOPMENT not set, skipping integration test") + } + + projectRoot := findProjectRoot(t) + obol := filepath.Join(projectRoot, ".workspace", "bin", "obol") + if _, err := os.Stat(obol); os.IsNotExist(err) { + t.Fatalf("obol binary not found at %s — build it first", obol) + } + + configDir := filepath.Join(projectRoot, ".workspace", "config") + binDir := filepath.Join(projectRoot, ".workspace", "bin") + + // Helper to run obol commands + run := func(t *testing.T, args ...string) (string, error) { + t.Helper() + cmd := exec.Command(obol, args...) + cmd.Env = append(os.Environ(), + "OBOL_DEVELOPMENT=true", + "PATH="+binDir+":"+os.Getenv("PATH"), + ) + cmd.Dir = projectRoot + out, err := cmd.CombinedOutput() + return string(out), err + } + + // Cleanup before tests + run(t, "stack", "purge", "--force") + + // Cleanup after all tests + t.Cleanup(func() { + run(t, "stack", "purge", "--force") + }) + + t.Run("init", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("stack init failed: %v\n%s", err, out) + } + + // Verify config files created + for _, f := range []string{"k3s-config.yaml", ".stack-id", ".stack-backend"} { + if _, err := os.Stat(filepath.Join(configDir, f)); os.IsNotExist(err) { + t.Errorf("expected %s to exist after init", f) + } + } + + // Verify defaults directory + if _, err := os.Stat(filepath.Join(configDir, "defaults")); os.IsNotExist(err) { + t.Error("expected defaults/ directory after init") + } + + // Verify backend is k3s + data, _ := os.ReadFile(filepath.Join(configDir, ".stack-backend")) + if got := strings.TrimSpace(string(data)); got != "k3s" { + t.Errorf("backend = %q, want k3s", got) + } + }) + + t.Run("init_rejects_without_force", func(t *testing.T) { + _, err := run(t, "stack", "init", "--backend", "k3s") + if err == nil { + t.Error("init without --force should fail when config exists") + } + }) + + t.Run("init_force_preserves_stack_id", func(t *testing.T) { + idBefore, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + out, err := run(t, "stack", "init", "--backend", "k3s", "--force") + if err != nil { + t.Fatalf("stack init --force failed: %v\n%s", err, out) + } + idAfter, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + if string(idBefore) != string(idAfter) { + t.Errorf("stack ID changed: %q → %q", string(idBefore), string(idAfter)) + } + }) + + t.Run("up", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up failed: %v\n%s", err, out) + } + + // Verify PID file and kubeconfig exist + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after stack up") + } + if _, err := os.Stat(filepath.Join(configDir, "kubeconfig.yaml")); os.IsNotExist(err) { + t.Error("kubeconfig not found after stack up") + } + }) + + t.Run("kubectl_passthrough", func(t *testing.T) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err != nil { + t.Fatalf("kubectl passthrough failed: %v\n%s", err, out) + } + lines := strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get nodes returned no nodes") + } + + out, err = run(t, "kubectl", "get", "namespaces", "--no-headers") + if err != nil { + t.Fatalf("kubectl get namespaces failed: %v\n%s", err, out) + } + lines = 
strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get namespaces returned no namespaces") + } + }) + + t.Run("up_idempotent", func(t *testing.T) { + pidBefore, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (idempotent) failed: %v\n%s", err, out) + } + + pidAfter, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + if string(pidBefore) != string(pidAfter) { + t.Errorf("PID changed on idempotent up: %q → %q", string(pidBefore), string(pidAfter)) + } + }) + + t.Run("down", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down failed: %v\n%s", err, out) + } + + // PID file should be cleaned up + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after down") + } + + // Config should be preserved + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); os.IsNotExist(err) { + t.Error("stack ID should be preserved after down") + } + }) + + t.Run("down_already_stopped", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down (already stopped) failed: %v\n%s", err, out) + } + }) + + t.Run("up_restart_after_down", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (restart) failed: %v\n%s", err, out) + } + + // Verify PID file exists + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after restart") + } + + // Wait for node to be ready + deadline := time.Now().Add(60 * time.Second) + for time.Now().Before(deadline) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err == nil && strings.Contains(out, "Ready") { + break + } + time.Sleep(3 * time.Second) + } + + out, _ = run(t, "kubectl", "get", "nodes", "--no-headers") + if !strings.Contains(out, "Ready") { + t.Error("node not ready after restart") + } + }) + + t.Run("purge", func(t *testing.T) { + out, err := run(t, "stack", "purge") + if err != nil { + t.Fatalf("stack purge failed: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("stack ID should be removed after purge") + } + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after purge") + } + }) + + t.Run("full_cycle_purge_force", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("init: %v\n%s", err, out) + } + + out, err = run(t, "stack", "up") + if err != nil { + t.Fatalf("up: %v\n%s", err, out) + } + + out, err = run(t, "stack", "purge", "--force") + if err != nil { + t.Fatalf("purge --force: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("config should be removed after purge --force") + } + }) +} + +func findProjectRoot(t *testing.T) string { + t.Helper() + dir, err := os.Getwd() + if err != nil { + t.Fatalf("failed to get working directory: %v", err) + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + t.Fatal("could not find project root (no go.mod)") + } + dir = parent + } +} diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 
diff --git a/internal/stack/stack.go b/internal/stack/stack.go
index c8366f6..8e2442b 100644
--- a/internal/stack/stack.go
+++ b/internal/stack/stack.go
@@ -13,21 +13,30 @@ import (
 )
 
 const (
-	k3dConfigFile  = "k3d.yaml"
 	kubeconfigFile = "kubeconfig.yaml"
 	stackIDFile    = ".stack-id"
 )
 
 // Init initializes the stack configuration
-func Init(cfg *config.Config, force bool) error {
-	// Create flat stack config directory
-	k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile)
-
-	// Check if config already exists
-	if _, err := os.Stat(k3dConfigPath); err == nil {
-		if !force {
-			return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", k3dConfigPath)
-		}
+func Init(cfg *config.Config, force bool, backendName string) error {
+	// Check if any stack config already exists
+	stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile)
+	backendFilePath := filepath.Join(cfg.ConfigDir, stackBackendFile)
+
+	hasExistingConfig := false
+	if _, err := os.Stat(stackIDPath); err == nil {
+		hasExistingConfig = true
+	}
+	if _, err := os.Stat(backendFilePath); err == nil {
+		hasExistingConfig = true
+	}
+	// Also check legacy k3d.yaml for backward compatibility
+	if _, err := os.Stat(filepath.Join(cfg.ConfigDir, k3dConfigFile)); err == nil {
+		hasExistingConfig = true
+	}
+
+	if hasExistingConfig && !force {
+		return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", cfg.ConfigDir)
 	}
 
 	if err := os.MkdirAll(cfg.ConfigDir, 0755); err != nil {
@@ -35,46 +44,37 @@ func Init(cfg *config.Config, force bool) error {
 	}
 
 	// Check if stack ID already exists (preserve on --force)
-	stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile)
 	var stackID string
 	if existingID, err := os.ReadFile(stackIDPath); err == nil {
-		stackID = string(existingID)
+		stackID = strings.TrimSpace(string(existingID))
 		fmt.Printf("Preserving existing stack ID: %s (use purge to reset)\n", stackID)
 	} else {
-		// Generate unique stack ID only if one doesn't exist
 		stackID = petname.Generate(2, "-")
 	}
 
-	fmt.Println("Initializing cluster configuration")
-	fmt.Printf("Cluster ID: %s\n", stackID)
-
-	absDataDir, err := filepath.Abs(cfg.DataDir)
-	if err != nil {
-		return fmt.Errorf("failed to get absolute path for data directory: %w", err)
+	// Default to k3d if no backend specified
+	if backendName == "" {
+		backendName = BackendK3d
 	}
 
-	absConfigDir, err := filepath.Abs(cfg.ConfigDir)
+	backend, err := NewBackend(backendName)
 	if err != nil {
-		return fmt.Errorf("failed to get absolute path for config directory: %w", err)
-	}
-
-	// Check if overwriting config
-	if _, err := os.Stat(k3dConfigPath); err == nil {
-		fmt.Printf("Overwriting existing stack configuration: %s\n", k3dConfigPath)
+		return err
 	}
 
-	// Replace placeholder in k3d config with actual stack ID
-	k3dConfig := embed.K3dConfig
-	k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID)
-	k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir)
-	k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir)
+	fmt.Println("Initializing cluster configuration")
+	fmt.Printf("Cluster ID: %s\n", stackID)
+	fmt.Printf("Backend: %s\n", backend.Name())
 
-	// Write k3d config with stack ID to destination
-	if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil {
-		return fmt.Errorf("failed to write k3d config: %w", err)
+	// Check prerequisites
+	if err := backend.Prerequisites(cfg); err != nil {
+		return fmt.Errorf("prerequisites check failed: %w", err)
 	}
-	fmt.Printf("K3d config saved to: %s\n", k3dConfigPath)
+
+	// Generate backend-specific config
+	if err := backend.Init(cfg, stackID); err != nil {
+		return err
+	}
 
 	// Copy embedded defaults (helmfile + charts for infrastructure)
 	defaultsDir := filepath.Join(cfg.ConfigDir, "defaults")
@@ -83,100 +83,50 @@ func Init(cfg *config.Config, force bool) error {
 	}
 	fmt.Printf("Defaults copied to: %s\n", defaultsDir)
 
-	// Store stack ID for later use (stackIDPath already declared above)
+	// Store stack ID
 	if err := os.WriteFile(stackIDPath, []byte(stackID), 0644); err != nil {
 		return fmt.Errorf("failed to write stack ID: %w", err)
 	}
 
-	fmt.Printf("Initialized stack configuration: %s\n", k3dConfigPath)
+	// Save backend choice
+	if err := SaveBackend(cfg, backendName); err != nil {
+		return fmt.Errorf("failed to save backend choice: %w", err)
+	}
+
+	fmt.Printf("Initialized stack configuration\n")
 	fmt.Printf("Stack ID: %s\n", stackID)
 
 	return nil
 }
 
-// Up starts the k3d cluster
+// Up starts the cluster using the configured backend
func Up(cfg *config.Config) error {
-	k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile)
-	kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile)
-
-	// Check if config exists
-	if _, err := os.Stat(k3dConfigPath); os.IsNotExist(err) {
-		return fmt.Errorf("stack config not found, run 'obol stack init' first")
-	}
-
-	// Get stack ID and full stack name
 	stackID := getStackID(cfg)
 	if stackID == "" {
 		return fmt.Errorf("stack ID not found, run 'obol stack init' first")
 	}
-	stackName := getStackName(cfg)
-
-	// Check if cluster already exists using cluster list
-	listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers")
-	listCmdOutput, err := listCmd.Output()
+
+	backend, err := LoadBackend(cfg)
 	if err != nil {
-		return fmt.Errorf("k3d list command failed: %w", err)
+		return fmt.Errorf("failed to load backend: %w", err)
 	}
 
-	if stackExists(string(listCmdOutput), stackName) {
-		// Cluster exists - check if it's stopped or running
-		fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID)
-		startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName)
-		startCmd.Stdout = os.Stdout
-		startCmd.Stderr = os.Stderr
-		if err := startCmd.Run(); err != nil {
-			return fmt.Errorf("failed to start existing cluster: %w", err)
-		}
-
-		if err := syncDefaults(cfg, kubeconfigPath); err != nil {
-			return err
-		}
-
-		fmt.Println("Stack restarted successfully")
-		fmt.Printf("Stack ID: %s\n", stackID)
-		return nil
-	}
-
-	fmt.Printf("Starting stack: %s (id: %s)\n", stackName, stackID)
-
-	// Get absolute path to data directory for k3d volume mount
-	absDataDir, err := filepath.Abs(cfg.DataDir)
-	if err != nil {
-		return fmt.Errorf("failed to get absolute path for data directory: %w", err)
-	}
-
-	// Create data directory if it doesn't exist
-	if err := os.MkdirAll(absDataDir, 0755); err != nil {
-		return fmt.Errorf("failed to create data directory: %w", err)
-	}
-
-	// Create cluster using k3d config with custom name
-	fmt.Println("Creating k3d cluster...")
-	createCmd := exec.Command(
-		filepath.Join(cfg.BinDir, "k3d"),
-		"cluster", "create", stackName,
-		"--config", k3dConfigPath,
-		"--kubeconfig-update-default=false",
-	)
-	createCmd.Stdout = os.Stdout
-	createCmd.Stderr = os.Stderr
+	kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile)
 
-	if err := createCmd.Run(); err != nil {
-		return fmt.Errorf("failed to create cluster: %w", err)
-	}
+	fmt.Printf("Starting stack (id: %s, backend: %s)\n", stackID, backend.Name())
 
-	// Export kubeconfig
-	kubeconfigCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName)
-	kubeconfigData, err := kubeconfigCmd.Output()
+	kubeconfigData, err := backend.Up(cfg, stackID)
 	if err != nil {
-		return fmt.Errorf("failed to get kubeconfig: %w", err)
+		return err
 	}
 
+	// Write kubeconfig (backend may have already written it, but ensure consistency)
 	if err := os.WriteFile(kubeconfigPath, kubeconfigData, 0600); err != nil {
 		return fmt.Errorf("failed to write kubeconfig: %w", err)
 	}
 
-	if err := syncDefaults(cfg, kubeconfigPath); err != nil {
+	// Sync defaults with backend-aware dataDir
+	dataDir := backend.DataDir(cfg)
+	if err := syncDefaults(cfg, kubeconfigPath, dataDir); err != nil {
 		return err
 	}
 
@@ -187,85 +137,50 @@ func Up(cfg *config.Config) error {
 	return nil
 }
 
-// Down stops the k3d cluster
+// Down stops the cluster
 func Down(cfg *config.Config) error {
 	stackID := getStackID(cfg)
 	if stackID == "" {
 		return fmt.Errorf("stack ID not found, stack may not be initialized")
 	}
-	stackName := getStackName(cfg)
-
-	fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID)
-
-	// First attempt graceful stop (allows processes to shutdown gracefully)
-	stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName)
-	stopCmd.Stdout = os.Stdout
-	stopCmd.Stderr = os.Stderr
-
-	if err := stopCmd.Run(); err != nil {
-		fmt.Println("Graceful stop timed out or failed, forcing cluster deletion")
-		// Fallback to delete if stop fails
-		deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName)
-		deleteCmd.Stdout = os.Stdout
-		deleteCmd.Stderr = os.Stderr
-		if err := deleteCmd.Run(); err != nil {
-			return fmt.Errorf("failed to stop cluster: %w", err)
-		}
+
+	backend, err := LoadBackend(cfg)
+	if err != nil {
+		return fmt.Errorf("failed to load backend: %w", err)
 	}
 
-	fmt.Println("Stack stopped successfully")
-	return nil
+	return backend.Down(cfg, stackID)
 }
 
 // Purge deletes the cluster config and optionally data
 func Purge(cfg *config.Config, force bool) error {
-	// Delete cluster containers
-	stackName := getStackName(cfg)
-	if stackName != "" {
+	stackID := getStackID(cfg)
+
+	backend, err := LoadBackend(cfg)
+	if err != nil {
+		return fmt.Errorf("failed to load backend: %w", err)
+	}
+
+	// Destroy cluster if we have a stack ID
+	if stackID != "" {
 		if force {
-			// Force delete without graceful shutdown
-			fmt.Printf("Force deleting cluster containers: %s\n", stackName)
-			deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName)
-			deleteCmd.Stdout = os.Stdout
-			deleteCmd.Stderr = os.Stderr
-			if err := deleteCmd.Run(); err != nil {
-				fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err)
-			}
-			fmt.Println("Cluster containers force deleted")
+			fmt.Printf("Force destroying cluster (id: %s)\n", stackID)
 		} else {
-			// Graceful shutdown first to ensure data is written properly
-			fmt.Printf("Gracefully stopping cluster before deletion: %s\n", stackName)
-			stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName)
-			stopCmd.Stdout = os.Stdout
-			stopCmd.Stderr = os.Stderr
-			if err := stopCmd.Run(); err != nil {
-				fmt.Println("Graceful stop timed out or failed, proceeding with deletion anyway")
-			} else {
-				fmt.Println("Cluster stopped gracefully")
-			}
-
-			// Now delete the stopped cluster
-			fmt.Println("Deleting cluster containers")
-			deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName)
-			deleteCmd.Stdout = os.Stdout
-			deleteCmd.Stderr = os.Stderr
-			if err := deleteCmd.Run(); err != nil {
-				fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err)
-			}
-			fmt.Println("Cluster containers deleted")
+			fmt.Printf("Destroying cluster (id: %s)\n", stackID)
+		}
+		if err := backend.Destroy(cfg, stackID); err != nil {
+			fmt.Printf("Failed to destroy cluster (may already be deleted): %v\n", err)
 		}
 	}
 
 	// Remove stack config directory
-	stackConfigDir := filepath.Join(cfg.ConfigDir)
-	if err := os.RemoveAll(stackConfigDir); err != nil {
+	if err := os.RemoveAll(cfg.ConfigDir); err != nil {
 		return fmt.Errorf("failed to remove stack config: %w", err)
 	}
 	fmt.Println("Removed cluster config directory")
 
 	// Remove data directory only if force flag is set
 	if force {
-		// Use sudo to remove data directory since it may contain root-owned files
 		fmt.Println("Removing data directory...")
 		rmCmd := exec.Command("sudo", "rm", "-rf", cfg.DataDir)
 		rmCmd.Stdout = os.Stdout
@@ -284,12 +199,6 @@ func Purge(cfg *config.Config, force bool) error {
 	return nil
 }
 
-// stackExists checks if stack name exists in k3d cluster list output
-func stackExists(output, name string) bool {
-	// Check if the stack name appears in the output
-	return strings.Contains(output, name)
-}
-
 // getStackID reads the stored stack ID
 func getStackID(cfg *config.Config) string {
 	stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile)
@@ -300,15 +209,6 @@ func getStackID(cfg *config.Config) string {
 	return strings.TrimSpace(string(data))
 }
 
-// getStackName returns the full stack name (obol-stack-{stackid})
-func getStackName(cfg *config.Config) string {
-	stackID := getStackID(cfg)
-	if stackID == "" {
-		return ""
-	}
-	return fmt.Sprintf("obol-stack-%s", stackID)
-}
-
 // GetStackID reads the stored stack ID (exported for use in main)
 func GetStackID(cfg *config.Config) string {
 	return getStackID(cfg)
 }
@@ -316,23 +216,25 @@ func GetStackID(cfg *config.Config) string {
 
 // syncDefaults deploys the default infrastructure using helmfile
 // If deployment fails, the cluster is automatically stopped via Down()
-func syncDefaults(cfg *config.Config, kubeconfigPath string) error {
+func syncDefaults(cfg *config.Config, kubeconfigPath string, dataDir string) error {
 	fmt.Println("Deploying default infrastructure with helmfile")
 
-	// Sync defaults using helmfile (handles Helm hooks properly)
 	defaultsHelmfilePath := filepath.Join(cfg.ConfigDir, "defaults")
 	helmfileCmd := exec.Command(
 		filepath.Join(cfg.BinDir, "helmfile"),
-		"--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml"),
+		"--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml.gotmpl"),
 		"--kubeconfig", kubeconfigPath,
 		"sync",
 	)
+	helmfileCmd.Env = append(os.Environ(),
+		fmt.Sprintf("KUBECONFIG=%s", kubeconfigPath),
+		fmt.Sprintf("STACK_DATA_DIR=%s", dataDir),
+	)
 	helmfileCmd.Stdout = os.Stdout
 	helmfileCmd.Stderr = os.Stderr
 
 	if err := helmfileCmd.Run(); err != nil {
 		fmt.Println("Failed to apply defaults helmfile, stopping cluster")
-		// Attempt to stop the cluster to clean up
 		if downErr := Down(cfg); downErr != nil {
 			fmt.Printf("Failed to stop cluster during cleanup: %v\n", downErr)
 		}