From a05bce5460c3cb659d59238a6fd7d847b84d0380 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 7 Feb 2026 01:16:23 +0400 Subject: [PATCH 1/5] feat(stack): add pluggable backend system with native k3s support Introduce a Backend interface that abstracts cluster lifecycle management, enabling both k3d (Docker-based, default) and k3s (native bare-metal) backends. This is a prerequisite for TEE/Confidential Computing workloads which require direct hardware access that k3d cannot provide. Changes: - Add Backend interface (Init, Up, Down, Destroy, IsRunning, DataDir) - Extract k3d logic into K3dBackend with backward-compatible fallback - Add K3sBackend with sudo process management, PID tracking, and API server readiness checks - Convert helmfile.yaml to helmfile.yaml.gotmpl using env vars instead of .Values references (fixes first-pass template rendering) - Fix eRPC secretEnv type mismatch (map vs string for b64enc) - Fix obol-frontend escaped quotes in gotmpl expressions - Add KUBECONFIG env var to helmfile command for hook compatibility - Add 26 unit tests and 10 integration test scenarios Closes #134 --- cmd/obol/bootstrap.go | 2 +- cmd/obol/main.go | 7 +- internal/embed/embed.go | 3 + .../base/templates/local-path.yaml | 2 +- .../{helmfile.yaml => helmfile.yaml.gotmpl} | 16 +- .../infrastructure/values/erpc.yaml.gotmpl | 20 +- .../values/obol-frontend.yaml.gotmpl | 12 +- internal/embed/k3s-config.yaml | 24 ++ internal/stack/backend.go | 77 +++++ internal/stack/backend_k3d.go | 164 +++++++++ internal/stack/backend_k3s.go | 320 +++++++++++++++++ internal/stack/backend_k3s_test.go | 97 ++++++ internal/stack/backend_test.go | 321 ++++++++++++++++++ internal/stack/integration_test.go | 255 ++++++++++++++ internal/stack/stack.go | 258 +++++--------- 15 files changed, 1375 insertions(+), 203 deletions(-) rename internal/embed/infrastructure/{helmfile.yaml => helmfile.yaml.gotmpl} (94%) create mode 100644 internal/embed/k3s-config.yaml create mode 100644 internal/stack/backend.go create mode 100644 internal/stack/backend_k3d.go create mode 100644 internal/stack/backend_k3s.go create mode 100644 internal/stack/backend_k3s_test.go create mode 100644 internal/stack/backend_test.go create mode 100644 internal/stack/integration_test.go diff --git a/cmd/obol/bootstrap.go b/cmd/obol/bootstrap.go index f2d3eb2..60683d3 100644 --- a/cmd/obol/bootstrap.go +++ b/cmd/obol/bootstrap.go @@ -27,7 +27,7 @@ func bootstrapCommand(cfg *config.Config) *cli.Command { // Step 1: Initialize stack fmt.Println("Initializing stack configuration...") - if err := stack.Init(cfg, false); err != nil { + if err := stack.Init(cfg, false, ""); err != nil { // Check if it's an "already exists" error - that's okay if !strings.Contains(err.Error(), "already exists") { return fmt.Errorf("bootstrap init failed: %w", err) diff --git a/cmd/obol/main.go b/cmd/obol/main.go index 69f92c5..871eb07 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -102,9 +102,14 @@ GLOBAL OPTIONS: Aliases: []string{"f"}, Usage: "Force overwrite existing configuration", }, + &cli.StringFlag{ + Name: "backend", + Usage: "Cluster backend: k3d (Docker-based) or k3s (bare-metal)", + EnvVars: []string{"OBOL_BACKEND"}, + }, }, Action: func(c *cli.Context) error { - return stack.Init(cfg, c.Bool("force")) + return stack.Init(cfg, c.Bool("force"), c.String("backend")) }, }, { diff --git a/internal/embed/embed.go b/internal/embed/embed.go index 2c189eb..7a0d723 100644 --- a/internal/embed/embed.go +++ b/internal/embed/embed.go @@ -15,6 +15,9 @@ import ( 
//go:embed k3d-config.yaml var K3dConfig string +//go:embed k3s-config.yaml +var K3sConfig string + //go:embed all:infrastructure var infrastructureFS embed.FS diff --git a/internal/embed/infrastructure/base/templates/local-path.yaml b/internal/embed/infrastructure/base/templates/local-path.yaml index 77713e9..2547c50 100644 --- a/internal/embed/infrastructure/base/templates/local-path.yaml +++ b/internal/embed/infrastructure/base/templates/local-path.yaml @@ -11,7 +11,7 @@ data: "nodePathMap":[ { "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", - "paths":["/data"] + "paths":["{{ .Values.dataDir }}"] } ] } diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml.gotmpl similarity index 94% rename from internal/embed/infrastructure/helmfile.yaml rename to internal/embed/infrastructure/helmfile.yaml.gotmpl index e3ce9a3..d5b1d8a 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml.gotmpl @@ -1,7 +1,10 @@ # Helmfile for Obol Stack default infrastructure # Orchestrates core infrastructure components deployed with every stack # Uses Traefik with Gateway API for routing (replaces nginx-ingress) -{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{ $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{- $dataDir := env "STACK_DATA_DIR" | default "/data" -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} +{{- $gatewayApiVersion := "v1.4.1" }} repositories: - name: traefik @@ -17,19 +20,14 @@ repositories: - name: stakater url: https://stakater.github.io/stakater-charts -# Single source of truth: change this to switch networks -values: - - network: mainnet - - gatewayApiVersion: v1.4.1 - releases: # Local storage provisioner (raw manifests wrapped as chart) - name: base namespace: kube-system chart: ./base values: - - dataDir: /data - - network: "{{ .Values.network }}" + - dataDir: '{{ $dataDir }}' + - network: "{{ $network }}" # Monitoring stack (Prometheus operator + Prometheus) - name: monitoring @@ -54,7 +52,7 @@ releases: args: - apply - -f - - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ .Values.gatewayApiVersion }}/standard-install.yaml + - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ $gatewayApiVersion }}/standard-install.yaml # Traefik ingress controller with Gateway API support - name: traefik diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index b7c07f8..78274e9 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -1,4 +1,4 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} {{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} {{- $chainId := 1 -}} {{/* Default: mainnet */}} {{- if eq $network "hoodi" -}} @@ -87,12 +87,18 @@ config: |- allowCredentials: true maxAge: 3600 -# Secret env variables -secretEnv: - OBOL_OAUTH_TOKEN: - secretKeyRef: - name: obol-oauth-token - key: token +# Secret env variables (chart expects flat string map, e.g. 
KEY: "value") +# The OBOL_OAUTH_TOKEN is injected from a Kubernetes secret via extraEnv instead +secretEnv: {} + +# Inject the OAuth token from the Kubernetes secret +extraEnv: + - name: OBOL_OAUTH_TOKEN + valueFrom: + secretKeyRef: + name: obol-oauth-token + key: token + optional: true # Extra args for the erpc container extraArgs: [] diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index caff157..66f068b 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -1,4 +1,4 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} {{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} replicaCount: 1 @@ -11,17 +11,17 @@ image: - name: NEXT_PUBLIC_HELIOS_CLIENT_URL value: "http://helios-{{ $network }}.helios.svc.cluster.local:8545" - name: NEXT_PUBLIC_ERPC_URL - value: "{{ printf \"https://%s/rpc\" $publicDomain }}" + value: "https://{{ $publicDomain }}/rpc" - name: NEXT_PUBLIC_AZTEC_SEQUENCER_URL value: "http://l2-sequencer-node-mainnet-node.aztec.svc.cluster.local:8080" - name: BETTER_AUTH_SECRET - value: "{{ env \"BETTER_AUTH_SECRET\" }}" + value: '{{ env "BETTER_AUTH_SECRET" }}' - name: BETTER_AUTH_URL - value: "{{ printf \"https://%s\" $publicDomain }}" + value: "https://{{ $publicDomain }}" - name: OBOL_GOOGLE_CLIENT_ID - value: "{{ env \"OBOL_GOOGLE_CLIENT_ID\" }}" + value: '{{ env "OBOL_GOOGLE_CLIENT_ID" }}' - name: OBOL_GOOGLE_CLIENT_SECRET - value: "{{ env \"OBOL_GOOGLE_CLIENT_SECRET\" }}" + value: '{{ env "OBOL_GOOGLE_CLIENT_SECRET" }}' - name: OBOL_AUTH_DB_PATH value: "/data/auth.sqlite" diff --git a/internal/embed/k3s-config.yaml b/internal/embed/k3s-config.yaml new file mode 100644 index 0000000..1c75e5a --- /dev/null +++ b/internal/embed/k3s-config.yaml @@ -0,0 +1,24 @@ +# k3s server configuration for Obol Stack +# Generated by: obol stack init --backend k3s + +# Disable components we manage ourselves (matching k3d config) +disable: + - traefik + - local-storage + +# Data directory for k3s internal state +data-dir: {{DATA_DIR}}/k3s + +# Bind to all interfaces for local access +bind-address: 0.0.0.0 +https-listen-port: 6443 + +# TLS SANs for local access +tls-san: + - "127.0.0.1" + - "localhost" + - "obol.stack" + +# Node labels +node-label: + - "obol.cluster-id={{STACK_ID}}" diff --git a/internal/stack/backend.go b/internal/stack/backend.go new file mode 100644 index 0000000..f26014d --- /dev/null +++ b/internal/stack/backend.go @@ -0,0 +1,77 @@ +package stack + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +const ( + // BackendK3d is the k3d backend (Docker-based, default) + BackendK3d = "k3d" + // BackendK3s is the standalone k3s backend (bare-metal) + BackendK3s = "k3s" + + stackBackendFile = ".stack-backend" +) + +// Backend abstracts the Kubernetes cluster runtime (k3d, k3s) +type Backend interface { + // Name returns the backend identifier (e.g., "k3d", "k3s") + Name() string + + // Init generates backend-specific cluster configuration files + Init(cfg *config.Config, stackID string) error + + // Up creates or starts the cluster and returns kubeconfig contents + Up(cfg *config.Config, stackID string) (kubeconfigData []byte, err error) + + // IsRunning returns true if the cluster is currently running + IsRunning(cfg *config.Config, stackID string) (bool, error) + 
+ // Down stops the cluster without destroying configuration or data + Down(cfg *config.Config, stackID string) error + + // Destroy removes the cluster entirely (containers/processes) + Destroy(cfg *config.Config, stackID string) error + + // DataDir returns the storage path for the local-path-provisioner. + // For k3d this is "/data" (Docker volume mount point). + // For k3s this is the absolute host path to cfg.DataDir. + DataDir(cfg *config.Config) string + + // Prerequisites checks that required software/permissions are available + Prerequisites(cfg *config.Config) error +} + +// NewBackend creates a Backend by name +func NewBackend(name string) (Backend, error) { + switch name { + case BackendK3d: + return &K3dBackend{}, nil + case BackendK3s: + return &K3sBackend{}, nil + default: + return nil, fmt.Errorf("unknown backend: %s (supported: k3d, k3s)", name) + } +} + +// LoadBackend reads the persisted backend choice from .stack-backend file. +// Falls back to k3d if no file exists (backward compatibility). +func LoadBackend(cfg *config.Config) (Backend, error) { + path := filepath.Join(cfg.ConfigDir, stackBackendFile) + data, err := os.ReadFile(path) + if err != nil { + return &K3dBackend{}, nil + } + return NewBackend(strings.TrimSpace(string(data))) +} + +// SaveBackend persists the backend choice +func SaveBackend(cfg *config.Config, name string) error { + path := filepath.Join(cfg.ConfigDir, stackBackendFile) + return os.WriteFile(path, []byte(name), 0644) +} diff --git a/internal/stack/backend_k3d.go b/internal/stack/backend_k3d.go new file mode 100644 index 0000000..8fdd3de --- /dev/null +++ b/internal/stack/backend_k3d.go @@ -0,0 +1,164 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3dConfigFile = "k3d.yaml" +) + +// K3dBackend manages clusters via k3d (k3s inside Docker containers) +type K3dBackend struct{} + +func (b *K3dBackend) Name() string { return BackendK3d } + +func (b *K3dBackend) Prerequisites(cfg *config.Config) error { + // Check Docker is running + cmd := exec.Command("docker", "info") + cmd.Stdout = nil + cmd.Stderr = nil + if err := cmd.Run(); err != nil { + return fmt.Errorf("Docker is not running. 
k3d backend requires Docker.\nStart Docker and try again")
+	}
+
+	// Check k3d binary exists
+	k3dPath := filepath.Join(cfg.BinDir, "k3d")
+	if _, err := os.Stat(k3dPath); os.IsNotExist(err) {
+		return fmt.Errorf("k3d not found at %s\nRun obolup.sh to install dependencies", k3dPath)
+	}
+	return nil
+}
+
+func (b *K3dBackend) Init(cfg *config.Config, stackID string) error {
+	absDataDir, err := filepath.Abs(cfg.DataDir)
+	if err != nil {
+		return fmt.Errorf("failed to get absolute path for data directory: %w", err)
+	}
+
+	absConfigDir, err := filepath.Abs(cfg.ConfigDir)
+	if err != nil {
+		return fmt.Errorf("failed to get absolute path for config directory: %w", err)
+	}
+
+	// Template k3d config with actual values
+	k3dConfig := embed.K3dConfig
+	k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID)
+	k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir)
+	k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir)
+
+	k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile)
+	if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil {
+		return fmt.Errorf("failed to write k3d config: %w", err)
+	}
+
+	fmt.Printf("K3d config saved to: %s\n", k3dConfigPath)
+	return nil
+}
+
+// IsRunning reports whether the cluster appears in `k3d cluster list`.
+// Note that a listed cluster may exist but be stopped; Up relies on this
+// by starting (rather than recreating) an existing cluster.
+func (b *K3dBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) {
+	stackName := fmt.Sprintf("obol-stack-%s", stackID)
+	listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers")
+	output, err := listCmd.Output()
+	if err != nil {
+		return false, fmt.Errorf("k3d list command failed: %w", err)
+	}
+	return strings.Contains(string(output), stackName), nil
+}
+
+func (b *K3dBackend) Up(cfg *config.Config, stackID string) ([]byte, error) {
+	stackName := fmt.Sprintf("obol-stack-%s", stackID)
+	k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile)
+
+	running, err := b.IsRunning(cfg, stackID)
+	if err != nil {
+		return nil, err
+	}
+
+	if running {
+		fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID)
+		startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName)
+		startCmd.Stdout = os.Stdout
+		startCmd.Stderr = os.Stderr
+		if err := startCmd.Run(); err != nil {
+			return nil, fmt.Errorf("failed to start existing cluster: %w", err)
+		}
+	} else {
+		// Create data directory if it doesn't exist
+		absDataDir, err := filepath.Abs(cfg.DataDir)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err)
+		}
+		if err := os.MkdirAll(absDataDir, 0755); err != nil {
+			return nil, fmt.Errorf("failed to create data directory: %w", err)
+		}
+
+		fmt.Println("Creating k3d cluster...")
+		createCmd := exec.Command(
+			filepath.Join(cfg.BinDir, "k3d"),
+			"cluster", "create", stackName,
+			"--config", k3dConfigPath,
+			"--kubeconfig-update-default=false",
+		)
+		createCmd.Stdout = os.Stdout
+		createCmd.Stderr = os.Stderr
+		if err := createCmd.Run(); err != nil {
+			return nil, fmt.Errorf("failed to create cluster: %w", err)
+		}
+	}
+
+	// Export kubeconfig
+	kubeconfigCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName)
+	kubeconfigData, err := kubeconfigCmd.Output()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get kubeconfig: %w", err)
+	}
+
+	return kubeconfigData, nil
+}
+
+func (b *K3dBackend) Down(cfg *config.Config, stackID string) error {
+	stackName := fmt.Sprintf("obol-stack-%s", stackID)
+
+	fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID)
+ + stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + return fmt.Errorf("failed to stop cluster: %w", err) + } + } + + return nil +} + +func (b *K3dBackend) Destroy(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Deleting cluster containers: %s\n", stackName) + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) + } + + return nil +} + +func (b *K3dBackend) DataDir(cfg *config.Config) string { + return "/data" +} diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go new file mode 100644 index 0000000..482d7e8 --- /dev/null +++ b/internal/stack/backend_k3s.go @@ -0,0 +1,320 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3sConfigFile = "k3s-config.yaml" + k3sPidFile = ".k3s.pid" + k3sLogFile = "k3s.log" +) + +// K3sBackend manages a standalone k3s cluster (bare-metal) +type K3sBackend struct{} + +func (b *K3sBackend) Name() string { return BackendK3s } + +func (b *K3sBackend) Prerequisites(cfg *config.Config) error { + if runtime.GOOS != "linux" { + return fmt.Errorf("k3s backend is only supported on Linux") + } + + // Check sudo access (allow interactive password prompt) + cmd := exec.Command("sudo", "-v") + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("k3s backend requires root/sudo access") + } + + // Check k3s binary exists + k3sPath := filepath.Join(cfg.BinDir, "k3s") + if _, err := os.Stat(k3sPath); os.IsNotExist(err) { + return fmt.Errorf("k3s not found at %s\nRun obolup.sh to install dependencies", k3sPath) + } + + return nil +} + +func (b *K3sBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + // Template k3s config with actual values + k3sConfig := embed.K3sConfig + k3sConfig = strings.ReplaceAll(k3sConfig, "{{STACK_ID}}", stackID) + k3sConfig = strings.ReplaceAll(k3sConfig, "{{DATA_DIR}}", absDataDir) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if err := os.WriteFile(k3sConfigPath, []byte(k3sConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3s config: %w", err) + } + + fmt.Printf("K3s config saved to: %s\n", k3sConfigPath) + return nil +} + +func (b *K3sBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + pid, err := b.readPid(cfg) + if err != nil { + return false, nil + } + + return b.isProcessAlive(pid), nil +} + +func (b *K3sBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + running, _ := b.IsRunning(cfg, stackID) + if running { + fmt.Println("k3s is 
already running") + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + data, err := os.ReadFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("k3s is running but kubeconfig not found: %w", err) + } + return data, nil + } + + // Clean up stale PID file if it exists (QA R6) + b.cleanStalePid(cfg) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if _, err := os.Stat(k3sConfigPath); os.IsNotExist(err) { + return nil, fmt.Errorf("k3s config not found at %s\nRun 'obol stack init --backend k3s' first", k3sConfigPath) + } + + // Create data directory + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + k3sBinary := filepath.Join(cfg.BinDir, "k3s") + logPath := filepath.Join(cfg.ConfigDir, k3sLogFile) + + // Remove stale kubeconfig so we wait for k3s to write a fresh one + os.Remove(kubeconfigPath) + + // Open log file for k3s output + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return nil, fmt.Errorf("failed to create k3s log file: %w", err) + } + + fmt.Println("Starting k3s server...") + + // Start k3s server as background process via sudo + cmd := exec.Command("sudo", + k3sBinary, "server", + "--config", k3sConfigPath, + "--write-kubeconfig", kubeconfigPath, + "--write-kubeconfig-mode", "0600", + ) + cmd.Stdout = logFile + cmd.Stderr = logFile + // Set process group so we can clean up child processes + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + if err := cmd.Start(); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to start k3s: %w", err) + } + + // Save PID before releasing the process handle + pid := cmd.Process.Pid + + // Write PID file + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(pid)), 0600); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to write k3s PID file: %w", err) + } + + // Detach the process + cmd.Process.Release() + logFile.Close() + + fmt.Printf("k3s started (pid: %d)\n", pid) + fmt.Printf("Logs: %s\n", logPath) + + // Wait for kubeconfig to be written by k3s + fmt.Println("Waiting for kubeconfig...") + deadline := time.Now().Add(2 * time.Minute) + for time.Now().Before(deadline) { + if info, err := os.Stat(kubeconfigPath); err == nil && info.Size() > 0 { + // Fix ownership: k3s writes kubeconfig as root via sudo + exec.Command("sudo", "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), kubeconfigPath).Run() + + data, err := os.ReadFile(kubeconfigPath) + if err == nil && len(data) > 0 { + fmt.Println("Kubeconfig ready, waiting for API server...") + + // Wait for the API server to actually respond + apiDeadline := time.Now().Add(90 * time.Second) + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + for time.Now().Before(apiDeadline) { + probe := exec.Command(kubectlPath, "--kubeconfig", kubeconfigPath, + "get", "nodes", "--no-headers") + if out, err := probe.Output(); err == nil && len(out) > 0 { + fmt.Println("API server ready") + return data, nil + } + time.Sleep(3 * time.Second) + } + + // Return kubeconfig even if API isn't fully ready yet + fmt.Println("Warning: API server not fully ready, proceeding anyway") + return data, nil + } + } + time.Sleep(2 * time.Second) + } 
+
+	return nil, fmt.Errorf("k3s did not write kubeconfig within timeout\nCheck logs: %s", logPath)
+}
+
+func (b *K3sBackend) Down(cfg *config.Config, stackID string) error {
+	pid, err := b.readPid(cfg)
+	if err != nil {
+		fmt.Println("k3s PID file not found, may not be running")
+		return nil
+	}
+
+	if !b.isProcessAlive(pid) {
+		fmt.Println("k3s process not running, cleaning up PID file")
+		b.removePidFile(cfg)
+		return nil
+	}
+
+	fmt.Printf("Stopping k3s (pid: %d)...\n", pid)
+
+	// Send SIGTERM to the process group for clean shutdown (negative PID = process
+	// group; "--" keeps kill from parsing the negative PID as an option)
+	pgid := fmt.Sprintf("-%d", pid)
+	stopCmd := exec.Command("sudo", "kill", "-TERM", "--", pgid)
+	stopCmd.Stdout = os.Stdout
+	stopCmd.Stderr = os.Stderr
+	if err := stopCmd.Run(); err != nil {
+		fmt.Printf("SIGTERM to process group failed, sending SIGKILL: %v\n", err)
+		exec.Command("sudo", "kill", "-9", "--", pgid).Run()
+	}
+
+	// Wait for process to exit (up to 30 seconds)
+	deadline := time.Now().Add(30 * time.Second)
+	for time.Now().Before(deadline) {
+		if !b.isProcessAlive(pid) {
+			break
+		}
+		time.Sleep(1 * time.Second)
+	}
+
+	// Run k3s-killall.sh if available (cleans up containerd/iptables)
+	killallPath := "/usr/local/bin/k3s-killall.sh"
+	if _, err := os.Stat(killallPath); err == nil {
+		fmt.Println("Running k3s cleanup...")
+		cleanCmd := exec.Command("sudo", killallPath)
+		cleanCmd.Stdout = os.Stdout
+		cleanCmd.Stderr = os.Stderr
+		cleanCmd.Run()
+	}
+
+	b.removePidFile(cfg)
+	fmt.Println("k3s stopped")
+	return nil
+}
+
+func (b *K3sBackend) Destroy(cfg *config.Config, stackID string) error {
+	// Stop if running
+	b.Down(cfg, stackID)
+
+	// Clean up k3s state directories (default + custom data-dir)
+	absDataDir, _ := filepath.Abs(cfg.DataDir)
+	cleanDirs := []string{
+		"/var/lib/rancher/k3s",
+		"/etc/rancher/k3s",
+		filepath.Join(absDataDir, "k3s"),
+	}
+	for _, dir := range cleanDirs {
+		if _, err := os.Stat(dir); err == nil {
+			fmt.Printf("Cleaning up: %s\n", dir)
+			exec.Command("sudo", "rm", "-rf", dir).Run()
+		}
+	}
+
+	// Run uninstall script if available
+	uninstallPath := "/usr/local/bin/k3s-uninstall.sh"
+	if _, err := os.Stat(uninstallPath); err == nil {
+		fmt.Println("Running k3s uninstall...")
+		uninstallCmd := exec.Command("sudo", uninstallPath)
+		uninstallCmd.Stdout = os.Stdout
+		uninstallCmd.Stderr = os.Stderr
+		uninstallCmd.Run()
+	}
+
+	return nil
+}
+
+func (b *K3sBackend) DataDir(cfg *config.Config) string {
+	absDataDir, _ := filepath.Abs(cfg.DataDir)
+	return absDataDir
+}
+
+// readPid reads the k3s PID from the PID file
+func (b *K3sBackend) readPid(cfg *config.Config) (int, error) {
+	pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile)
+	data, err := os.ReadFile(pidPath)
+	if err != nil {
+		return 0, err
+	}
+	pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
+	if err != nil {
+		return 0, fmt.Errorf("invalid PID in %s: %w", pidPath, err)
+	}
+	if pid <= 0 {
+		return 0, fmt.Errorf("invalid PID in %s: %d", pidPath, pid)
+	}
+	return pid, nil
+}
+
+// cleanStalePid removes the PID file if the process is no longer running
+func (b *K3sBackend) cleanStalePid(cfg *config.Config) {
+	pid, err := b.readPid(cfg)
+	if err != nil {
+		return
+	}
+	if !b.isProcessAlive(pid) {
+		fmt.Printf("Cleaning up stale PID file (pid %d no longer running)\n", pid)
+		b.removePidFile(cfg)
+	}
+}
+
+// isProcessAlive checks if a root-owned process is still running.
+// Uses sudo kill -0 since the k3s process runs as root and direct
+// signal(0) from an unprivileged user returns EPERM.
+func (b *K3sBackend) isProcessAlive(pid int) bool { + return exec.Command("sudo", "kill", "-0", strconv.Itoa(pid)).Run() == nil +} + +// removePidFile removes the k3s PID file +func (b *K3sBackend) removePidFile(cfg *config.Config) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + os.Remove(pidPath) +} diff --git a/internal/stack/backend_k3s_test.go b/internal/stack/backend_k3s_test.go new file mode 100644 index 0000000..e7a09ba --- /dev/null +++ b/internal/stack/backend_k3s_test.go @@ -0,0 +1,97 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +func TestK3sReadPid(t *testing.T) { + tests := []struct { + name string + content string + wantPid int + wantErr bool + errContains string + }{ + {name: "valid pid", content: "12345", wantPid: 12345}, + {name: "with trailing newline", content: "12345\n", wantPid: 12345}, + {name: "with whitespace", content: " 12345 ", wantPid: 12345}, + {name: "pid 1", content: "1", wantPid: 1}, + {name: "large pid", content: "4194304", wantPid: 4194304}, + {name: "not a number", content: "not-a-number", wantErr: true, errContains: "invalid PID"}, + {name: "empty content", content: "", wantErr: true, errContains: "invalid PID"}, + {name: "float", content: "123.45", wantErr: true, errContains: "invalid PID"}, + {name: "negative", content: "-1", wantErr: true, errContains: "invalid PID"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(tt.content), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + pid, err := b.readPid(cfg) + if tt.wantErr { + if err == nil { + t.Fatalf("readPid() = %d, nil error; want error containing %q", pid, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("readPid() error = %q, want containing %q", err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("readPid() unexpected error: %v", err) + } + if pid != tt.wantPid { + t.Errorf("readPid() = %d, want %d", pid, tt.wantPid) + } + }) + } + + t.Run("missing file", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + _, err := b.readPid(cfg) + if err == nil { + t.Fatal("readPid() with no file should return error") + } + }) +} + +func TestK3sRemovePidFile(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte("12345"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + b.removePidFile(cfg) + + if _, err := os.Stat(pidPath); !os.IsNotExist(err) { + t.Error("PID file should have been removed") + } +} + +func TestK3sRemovePidFileNoop(t *testing.T) { + // Removing a non-existent PID file should not panic or error + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + b.removePidFile(cfg) // should not panic +} diff --git a/internal/stack/backend_test.go b/internal/stack/backend_test.go new file mode 100644 index 0000000..e59836c --- /dev/null +++ b/internal/stack/backend_test.go @@ -0,0 +1,321 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +// Compile-time interface compliance checks +var ( + _ 
Backend = (*K3dBackend)(nil) + _ Backend = (*K3sBackend)(nil) +) + +func TestNewBackend(t *testing.T) { + tests := []struct { + name string + input string + wantName string + wantErr bool + errContains string + }{ + {name: "k3d backend", input: "k3d", wantName: "k3d"}, + {name: "k3s backend", input: "k3s", wantName: "k3s"}, + {name: "unknown backend", input: "docker", wantErr: true, errContains: "unknown backend"}, + {name: "empty string", input: "", wantErr: true, errContains: "unknown backend"}, + {name: "case sensitive", input: "K3D", wantErr: true, errContains: "unknown backend"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + backend, err := NewBackend(tt.input) + if tt.wantErr { + if err == nil { + t.Fatalf("NewBackend(%q) = nil error, want error containing %q", tt.input, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("NewBackend(%q) error = %q, want containing %q", tt.input, err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("NewBackend(%q) unexpected error: %v", tt.input, err) + } + if backend.Name() != tt.wantName { + t.Errorf("NewBackend(%q).Name() = %q, want %q", tt.input, backend.Name(), tt.wantName) + } + }) + } +} + +func TestK3dBackendName(t *testing.T) { + b := &K3dBackend{} + if got := b.Name(); got != BackendK3d { + t.Errorf("K3dBackend.Name() = %q, want %q", got, BackendK3d) + } +} + +func TestK3sBackendName(t *testing.T) { + b := &K3sBackend{} + if got := b.Name(); got != BackendK3s { + t.Errorf("K3sBackend.Name() = %q, want %q", got, BackendK3s) + } +} + +func TestK3dBackendDataDir(t *testing.T) { + // k3d DataDir must always return "/data" regardless of cfg.DataDir, + // because k3d mounts the host data dir to /data inside the container. + tests := []struct { + name string + dataDir string + }{ + {name: "absolute path", dataDir: "/home/user/.local/share/obol"}, + {name: "relative path", dataDir: ".workspace/data"}, + {name: "empty string", dataDir: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := &K3dBackend{} + cfg := &config.Config{DataDir: tt.dataDir} + if got := b.DataDir(cfg); got != "/data" { + t.Errorf("K3dBackend.DataDir() = %q, want %q (must always be /data for Docker mount)", got, "/data") + } + }) + } +} + +func TestK3sBackendDataDir(t *testing.T) { + // k3s DataDir must return an absolute version of cfg.DataDir, + // because k3s runs directly on the host. 
+ b := &K3sBackend{} + + t.Run("absolute path passthrough", func(t *testing.T) { + cfg := &config.Config{DataDir: "/home/user/.local/share/obol"} + got := b.DataDir(cfg) + if got != "/home/user/.local/share/obol" { + t.Errorf("K3sBackend.DataDir() = %q, want %q", got, "/home/user/.local/share/obol") + } + }) + + t.Run("relative path resolved to absolute", func(t *testing.T) { + cfg := &config.Config{DataDir: "relative/path"} + got := b.DataDir(cfg) + if !filepath.IsAbs(got) { + t.Errorf("K3sBackend.DataDir() = %q, want absolute path", got) + } + if !strings.HasSuffix(got, "relative/path") { + t.Errorf("K3sBackend.DataDir() = %q, want suffix %q", got, "relative/path") + } + }) +} + +func TestSaveAndLoadBackend(t *testing.T) { + tests := []struct { + name string + backend string + wantName string + }{ + {name: "save k3s load k3s", backend: "k3s", wantName: "k3s"}, + {name: "save k3d load k3d", backend: "k3d", wantName: "k3d"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + if err := SaveBackend(cfg, tt.backend); err != nil { + t.Fatalf("SaveBackend() error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != tt.wantName { + t.Errorf("LoadBackend().Name() = %q, want %q", backend.Name(), tt.wantName) + } + }) + } +} + +func TestLoadBackendFallsBackToK3d(t *testing.T) { + // When no .stack-backend file exists, LoadBackend must return k3d + // for backward compatibility with existing stacks. + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3d { + t.Errorf("LoadBackend() with no file = %q, want %q (backward compat)", backend.Name(), BackendK3d) + } +} + +func TestLoadBackendWithWhitespace(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + // Write file with trailing newline and whitespace + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("k3s\n "), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3s { + t.Errorf("LoadBackend() = %q, want %q", backend.Name(), BackendK3s) + } +} + +func TestLoadBackendInvalidName(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("docker-swarm"), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + _, err := LoadBackend(cfg) + if err == nil { + t.Fatal("LoadBackend() with invalid backend name should return error") + } + if !strings.Contains(err.Error(), "unknown backend") { + t.Errorf("LoadBackend() error = %q, want containing %q", err.Error(), "unknown backend") + } +} + +func TestK3dBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3dBackend{} + if err := b.Init(cfg, "test-stack"); err != nil { + t.Fatalf("K3dBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3dConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // 
Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + if strings.Contains(content, "{{CONFIG_DIR}}") { + t.Error("Config still contains {{CONFIG_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "test-stack") { + t.Error("Config does not contain stack ID 'test-stack'") + } + + // Verify paths are absolute + if !strings.Contains(content, tmpDir) { + t.Errorf("Config does not contain absolute data dir path %q", tmpDir) + } +} + +func TestK3sBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3sBackend{} + if err := b.Init(cfg, "my-cluster"); err != nil { + t.Fatalf("K3sBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3sConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "my-cluster") { + t.Error("Config does not contain stack ID 'my-cluster'") + } + + // Verify data-dir uses absolute path + absDataDir, _ := filepath.Abs(filepath.Join(tmpDir, "data")) + expectedDataDir := absDataDir + "/k3s" + if !strings.Contains(content, expectedDataDir) { + t.Errorf("Config does not contain absolute data-dir %q", expectedDataDir) + } +} + +func TestGetStackID(t *testing.T) { + tests := []struct { + name string + content string + want string + }{ + {name: "simple id", content: "happy-panda", want: "happy-panda"}, + {name: "with trailing newline", content: "happy-panda\n", want: "happy-panda"}, + {name: "with whitespace", content: " happy-panda \n", want: "happy-panda"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackIDFile) + if err := os.WriteFile(path, []byte(tt.content), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + got := getStackID(cfg) + if got != tt.want { + t.Errorf("getStackID() = %q, want %q", got, tt.want) + } + }) + } + + t.Run("missing file returns empty", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + got := getStackID(cfg) + if got != "" { + t.Errorf("getStackID() with no file = %q, want empty string", got) + } + }) +} diff --git a/internal/stack/integration_test.go b/internal/stack/integration_test.go new file mode 100644 index 0000000..66088bc --- /dev/null +++ b/internal/stack/integration_test.go @@ -0,0 +1,255 @@ +//go:build integration + +package stack_test + +import ( + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" +) + +// Integration tests for the k3s backend user flows. +// Requires: sudo access, k3s binary, OBOL_DEVELOPMENT=true. 
+// +// Run with: +// go test -tags integration -timeout 15m -v ./internal/stack/ + +func TestK3sUserFlows(t *testing.T) { + if os.Getenv("OBOL_DEVELOPMENT") != "true" { + t.Skip("OBOL_DEVELOPMENT not set, skipping integration test") + } + + projectRoot := findProjectRoot(t) + obol := filepath.Join(projectRoot, ".workspace", "bin", "obol") + if _, err := os.Stat(obol); os.IsNotExist(err) { + t.Fatalf("obol binary not found at %s — build it first", obol) + } + + configDir := filepath.Join(projectRoot, ".workspace", "config") + binDir := filepath.Join(projectRoot, ".workspace", "bin") + + // Helper to run obol commands + run := func(t *testing.T, args ...string) (string, error) { + t.Helper() + cmd := exec.Command(obol, args...) + cmd.Env = append(os.Environ(), + "OBOL_DEVELOPMENT=true", + "PATH="+binDir+":"+os.Getenv("PATH"), + ) + cmd.Dir = projectRoot + out, err := cmd.CombinedOutput() + return string(out), err + } + + // Cleanup before tests + run(t, "stack", "purge", "--force") + + // Cleanup after all tests + t.Cleanup(func() { + run(t, "stack", "purge", "--force") + }) + + t.Run("init", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("stack init failed: %v\n%s", err, out) + } + + // Verify config files created + for _, f := range []string{"k3s-config.yaml", ".stack-id", ".stack-backend"} { + if _, err := os.Stat(filepath.Join(configDir, f)); os.IsNotExist(err) { + t.Errorf("expected %s to exist after init", f) + } + } + + // Verify defaults directory + if _, err := os.Stat(filepath.Join(configDir, "defaults")); os.IsNotExist(err) { + t.Error("expected defaults/ directory after init") + } + + // Verify backend is k3s + data, _ := os.ReadFile(filepath.Join(configDir, ".stack-backend")) + if got := strings.TrimSpace(string(data)); got != "k3s" { + t.Errorf("backend = %q, want k3s", got) + } + }) + + t.Run("init_rejects_without_force", func(t *testing.T) { + _, err := run(t, "stack", "init", "--backend", "k3s") + if err == nil { + t.Error("init without --force should fail when config exists") + } + }) + + t.Run("init_force_preserves_stack_id", func(t *testing.T) { + idBefore, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + out, err := run(t, "stack", "init", "--backend", "k3s", "--force") + if err != nil { + t.Fatalf("stack init --force failed: %v\n%s", err, out) + } + idAfter, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + if string(idBefore) != string(idAfter) { + t.Errorf("stack ID changed: %q → %q", string(idBefore), string(idAfter)) + } + }) + + t.Run("up", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up failed: %v\n%s", err, out) + } + + // Verify PID file and kubeconfig exist + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after stack up") + } + if _, err := os.Stat(filepath.Join(configDir, "kubeconfig.yaml")); os.IsNotExist(err) { + t.Error("kubeconfig not found after stack up") + } + }) + + t.Run("kubectl_passthrough", func(t *testing.T) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err != nil { + t.Fatalf("kubectl passthrough failed: %v\n%s", err, out) + } + lines := strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get nodes returned no nodes") + } + + out, err = run(t, "kubectl", "get", "namespaces", "--no-headers") + if err != nil { + t.Fatalf("kubectl get namespaces failed: %v\n%s", err, out) + } + lines = 
strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get namespaces returned no namespaces") + } + }) + + t.Run("up_idempotent", func(t *testing.T) { + pidBefore, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (idempotent) failed: %v\n%s", err, out) + } + + pidAfter, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + if string(pidBefore) != string(pidAfter) { + t.Errorf("PID changed on idempotent up: %q → %q", string(pidBefore), string(pidAfter)) + } + }) + + t.Run("down", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down failed: %v\n%s", err, out) + } + + // PID file should be cleaned up + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after down") + } + + // Config should be preserved + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); os.IsNotExist(err) { + t.Error("stack ID should be preserved after down") + } + }) + + t.Run("down_already_stopped", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down (already stopped) failed: %v\n%s", err, out) + } + }) + + t.Run("up_restart_after_down", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (restart) failed: %v\n%s", err, out) + } + + // Verify PID file exists + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after restart") + } + + // Wait for node to be ready + deadline := time.Now().Add(60 * time.Second) + for time.Now().Before(deadline) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err == nil && strings.Contains(out, "Ready") { + break + } + time.Sleep(3 * time.Second) + } + + out, _ = run(t, "kubectl", "get", "nodes", "--no-headers") + if !strings.Contains(out, "Ready") { + t.Error("node not ready after restart") + } + }) + + t.Run("purge", func(t *testing.T) { + out, err := run(t, "stack", "purge") + if err != nil { + t.Fatalf("stack purge failed: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("stack ID should be removed after purge") + } + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after purge") + } + }) + + t.Run("full_cycle_purge_force", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("init: %v\n%s", err, out) + } + + out, err = run(t, "stack", "up") + if err != nil { + t.Fatalf("up: %v\n%s", err, out) + } + + out, err = run(t, "stack", "purge", "--force") + if err != nil { + t.Fatalf("purge --force: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("config should be removed after purge --force") + } + }) +} + +func findProjectRoot(t *testing.T) string { + t.Helper() + dir, err := os.Getwd() + if err != nil { + t.Fatalf("failed to get working directory: %v", err) + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + t.Fatal("could not find project root (no go.mod)") + } + dir = parent + } +} diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 
c8366f6..8e2442b 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -13,21 +13,30 @@ import ( ) const ( - k3dConfigFile = "k3d.yaml" kubeconfigFile = "kubeconfig.yaml" stackIDFile = ".stack-id" ) // Init initializes the stack configuration -func Init(cfg *config.Config, force bool) error { - // Create flat stack config directory - k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) - - // Check if config already exists - if _, err := os.Stat(k3dConfigPath); err == nil { - if !force { - return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", k3dConfigPath) - } +func Init(cfg *config.Config, force bool, backendName string) error { + // Check if any stack config already exists + stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) + backendFilePath := filepath.Join(cfg.ConfigDir, stackBackendFile) + + hasExistingConfig := false + if _, err := os.Stat(stackIDPath); err == nil { + hasExistingConfig = true + } + if _, err := os.Stat(backendFilePath); err == nil { + hasExistingConfig = true + } + // Also check legacy k3d.yaml for backward compatibility + if _, err := os.Stat(filepath.Join(cfg.ConfigDir, k3dConfigFile)); err == nil { + hasExistingConfig = true + } + + if hasExistingConfig && !force { + return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", cfg.ConfigDir) } if err := os.MkdirAll(cfg.ConfigDir, 0755); err != nil { @@ -35,46 +44,37 @@ func Init(cfg *config.Config, force bool) error { } // Check if stack ID already exists (preserve on --force) - stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) var stackID string if existingID, err := os.ReadFile(stackIDPath); err == nil { - stackID = string(existingID) + stackID = strings.TrimSpace(string(existingID)) fmt.Printf("Preserving existing stack ID: %s (use purge to reset)\n", stackID) } else { - // Generate unique stack ID only if one doesn't exist stackID = petname.Generate(2, "-") } - fmt.Println("Initializing cluster configuration") - fmt.Printf("Cluster ID: %s\n", stackID) - - absDataDir, err := filepath.Abs(cfg.DataDir) - if err != nil { - return fmt.Errorf("failed to get absolute path for data directory: %w", err) + // Default to k3d if no backend specified + if backendName == "" { + backendName = BackendK3d } - absConfigDir, err := filepath.Abs(cfg.ConfigDir) + backend, err := NewBackend(backendName) if err != nil { - return fmt.Errorf("failed to get absolute path for config directory: %w", err) - } - - // Check if overwriting config - if _, err := os.Stat(k3dConfigPath); err == nil { - fmt.Printf("Overwriting existing stack configuration: %s\n", k3dConfigPath) + return err } - // Replace placeholder in k3d config with actual stack ID - k3dConfig := embed.K3dConfig - k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID) - k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir) - k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir) + fmt.Println("Initializing cluster configuration") + fmt.Printf("Cluster ID: %s\n", stackID) + fmt.Printf("Backend: %s\n", backend.Name()) - // Write k3d config with stack ID to destination - if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil { - return fmt.Errorf("failed to write k3d config: %w", err) + // Check prerequisites + if err := backend.Prerequisites(cfg); err != nil { + return fmt.Errorf("prerequisites check failed: %w", err) } - fmt.Printf("K3d config saved to: %s\n", k3dConfigPath) + // Generate backend-specific config 
+ if err := backend.Init(cfg, stackID); err != nil { + return err + } // Copy embedded defaults (helmfile + charts for infrastructure) defaultsDir := filepath.Join(cfg.ConfigDir, "defaults") @@ -83,100 +83,50 @@ func Init(cfg *config.Config, force bool) error { } fmt.Printf("Defaults copied to: %s\n", defaultsDir) - // Store stack ID for later use (stackIDPath already declared above) + // Store stack ID if err := os.WriteFile(stackIDPath, []byte(stackID), 0644); err != nil { return fmt.Errorf("failed to write stack ID: %w", err) } - fmt.Printf("Initialized stack configuration: %s\n", k3dConfigPath) + // Save backend choice + if err := SaveBackend(cfg, backendName); err != nil { + return fmt.Errorf("failed to save backend choice: %w", err) + } + + fmt.Printf("Initialized stack configuration\n") fmt.Printf("Stack ID: %s\n", stackID) return nil } -// Up starts the k3d cluster +// Up starts the cluster using the configured backend func Up(cfg *config.Config) error { - k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) - kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) - - // Check if config exists - if _, err := os.Stat(k3dConfigPath); os.IsNotExist(err) { - return fmt.Errorf("stack config not found, run 'obol stack init' first") - } - - // Get stack ID and full stack name stackID := getStackID(cfg) if stackID == "" { return fmt.Errorf("stack ID not found, run 'obol stack init' first") } - stackName := getStackName(cfg) - - // Check if cluster already exists using cluster list - listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers") - listCmdOutput, err := listCmd.Output() + backend, err := LoadBackend(cfg) if err != nil { - return fmt.Errorf("k3d list command failed: %w", err) + return fmt.Errorf("failed to load backend: %w", err) } - if stackExists(string(listCmdOutput), stackName) { - // Cluster exists - check if it's stopped or running - fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID) - startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName) - startCmd.Stdout = os.Stdout - startCmd.Stderr = os.Stderr - if err := startCmd.Run(); err != nil { - return fmt.Errorf("failed to start existing cluster: %w", err) - } - - if err := syncDefaults(cfg, kubeconfigPath); err != nil { - return err - } - - fmt.Println("Stack restarted successfully") - fmt.Printf("Stack ID: %s\n", stackID) - return nil - } - - fmt.Printf("Starting stack: %s (id: %s)\n", stackName, stackID) - - // Get absolute path to data directory for k3d volume mount - absDataDir, err := filepath.Abs(cfg.DataDir) - if err != nil { - return fmt.Errorf("failed to get absolute path for data directory: %w", err) - } - - // Create data directory if it doesn't exist - if err := os.MkdirAll(absDataDir, 0755); err != nil { - return fmt.Errorf("failed to create data directory: %w", err) - } - - // Create cluster using k3d config with custom name - fmt.Println("Creating k3d cluster...") - createCmd := exec.Command( - filepath.Join(cfg.BinDir, "k3d"), - "cluster", "create", stackName, - "--config", k3dConfigPath, - "--kubeconfig-update-default=false", - ) - createCmd.Stdout = os.Stdout - createCmd.Stderr = os.Stderr + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) - if err := createCmd.Run(); err != nil { - return fmt.Errorf("failed to create cluster: %w", err) - } + fmt.Printf("Starting stack (id: %s, backend: %s)\n", stackID, backend.Name()) - // Export kubeconfig - kubeconfigCmd := 
exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName) - kubeconfigData, err := kubeconfigCmd.Output() + kubeconfigData, err := backend.Up(cfg, stackID) if err != nil { - return fmt.Errorf("failed to get kubeconfig: %w", err) + return err } + // Write kubeconfig (backend may have already written it, but ensure consistency) if err := os.WriteFile(kubeconfigPath, kubeconfigData, 0600); err != nil { return fmt.Errorf("failed to write kubeconfig: %w", err) } - if err := syncDefaults(cfg, kubeconfigPath); err != nil { + // Sync defaults with backend-aware dataDir + dataDir := backend.DataDir(cfg) + if err := syncDefaults(cfg, kubeconfigPath, dataDir); err != nil { return err } @@ -187,85 +137,50 @@ func Up(cfg *config.Config) error { return nil } -// Down stops the k3d cluster +// Down stops the cluster func Down(cfg *config.Config) error { stackID := getStackID(cfg) if stackID == "" { return fmt.Errorf("stack ID not found, stack may not be initialized") } - stackName := getStackName(cfg) - - fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID) - - // First attempt graceful stop (allows processes to shutdown gracefully) - stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) - stopCmd.Stdout = os.Stdout - stopCmd.Stderr = os.Stderr - - if err := stopCmd.Run(); err != nil { - fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") - // Fallback to delete if stop fails - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - return fmt.Errorf("failed to stop cluster: %w", err) - } + + backend, err := LoadBackend(cfg) + if err != nil { + return fmt.Errorf("failed to load backend: %w", err) } - fmt.Println("Stack stopped successfully") - return nil + return backend.Down(cfg, stackID) } // Purge deletes the cluster config and optionally data func Purge(cfg *config.Config, force bool) error { - // Delete cluster containers - stackName := getStackName(cfg) - if stackName != "" { + stackID := getStackID(cfg) + + backend, err := LoadBackend(cfg) + if err != nil { + return fmt.Errorf("failed to load backend: %w", err) + } + + // Destroy cluster if we have a stack ID + if stackID != "" { if force { - // Force delete without graceful shutdown - fmt.Printf("Force deleting cluster containers: %s\n", stackName) - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) - } - fmt.Println("Cluster containers force deleted") + fmt.Printf("Force destroying cluster (id: %s)\n", stackID) } else { - // Graceful shutdown first to ensure data is written properly - fmt.Printf("Gracefully stopping cluster before deletion: %s\n", stackName) - stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) - stopCmd.Stdout = os.Stdout - stopCmd.Stderr = os.Stderr - if err := stopCmd.Run(); err != nil { - fmt.Println("Graceful stop timed out or failed, proceeding with deletion anyway") - } else { - fmt.Println("Cluster stopped gracefully") - } - - // Now delete the stopped cluster - fmt.Println("Deleting cluster containers") - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - 
deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) - } - fmt.Println("Cluster containers deleted") + fmt.Printf("Destroying cluster (id: %s)\n", stackID) + } + if err := backend.Destroy(cfg, stackID); err != nil { + fmt.Printf("Failed to destroy cluster (may already be deleted): %v\n", err) } } // Remove stack config directory - stackConfigDir := filepath.Join(cfg.ConfigDir) - if err := os.RemoveAll(stackConfigDir); err != nil { + if err := os.RemoveAll(cfg.ConfigDir); err != nil { return fmt.Errorf("failed to remove stack config: %w", err) } fmt.Println("Removed cluster config directory") // Remove data directory only if force flag is set if force { - // Use sudo to remove data directory since it may contain root-owned files fmt.Println("Removing data directory...") rmCmd := exec.Command("sudo", "rm", "-rf", cfg.DataDir) rmCmd.Stdout = os.Stdout @@ -284,12 +199,6 @@ func Purge(cfg *config.Config, force bool) error { return nil } -// stackExists checks if stack name exists in k3d cluster list output -func stackExists(output, name string) bool { - // Check if the stack name appears in the output - return strings.Contains(output, name) -} - // getStackID reads the stored stack ID func getStackID(cfg *config.Config) string { stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) @@ -300,15 +209,6 @@ func getStackID(cfg *config.Config) string { return strings.TrimSpace(string(data)) } -// getStackName returns the full stack name (obol-stack-{stackid}) -func getStackName(cfg *config.Config) string { - stackID := getStackID(cfg) - if stackID == "" { - return "" - } - return fmt.Sprintf("obol-stack-%s", stackID) -} - // GetStackID reads the stored stack ID (exported for use in main) func GetStackID(cfg *config.Config) string { return getStackID(cfg) @@ -316,23 +216,25 @@ func GetStackID(cfg *config.Config) string { // syncDefaults deploys the default infrastructure using helmfile // If deployment fails, the cluster is automatically stopped via Down() -func syncDefaults(cfg *config.Config, kubeconfigPath string) error { +func syncDefaults(cfg *config.Config, kubeconfigPath string, dataDir string) error { fmt.Println("Deploying default infrastructure with helmfile") - // Sync defaults using helmfile (handles Helm hooks properly) defaultsHelmfilePath := filepath.Join(cfg.ConfigDir, "defaults") helmfileCmd := exec.Command( filepath.Join(cfg.BinDir, "helmfile"), - "--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml"), + "--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml.gotmpl"), "--kubeconfig", kubeconfigPath, "sync", ) + helmfileCmd.Env = append(os.Environ(), + fmt.Sprintf("KUBECONFIG=%s", kubeconfigPath), + fmt.Sprintf("STACK_DATA_DIR=%s", dataDir), + ) helmfileCmd.Stdout = os.Stdout helmfileCmd.Stderr = os.Stderr if err := helmfileCmd.Run(); err != nil { fmt.Println("Failed to apply defaults helmfile, stopping cluster") - // Attempt to stop the cluster to clean up if downErr := Down(cfg); downErr != nil { fmt.Printf("Failed to stop cluster during cleanup: %v\n", downErr) } From 61a7e2067f542af54fecac5abeeeaa52090b793d Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 7 Feb 2026 15:09:49 +0400 Subject: [PATCH 2/5] test(stack): add test-backend skill for k3d/k3s integration testing Adds a Claude Code skill (`/test-backend`) with bash scripts that exercise the full backend lifecycle: init, up, kubectl, down, restart, and purge for both k3d and k3s backends. 
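
For orientation, each script drives roughly this sequence (a sketch of
the happy path only, run from the project root; the scripts also
assert on config files, PID files, idempotency, and failure cases):

    export OBOL_DEVELOPMENT=true
    export PATH="$PWD/.workspace/bin:$PATH"

    obol stack init --backend k3s   # test-k3d.sh omits --backend (k3d is the default)
    obol stack up                   # start the cluster and write kubeconfig
    obol kubectl get nodes          # passthrough must report a Ready node
    obol stack down                 # stop; config is preserved
    obol stack up                   # restart from the preserved config
    obol stack purge --force        # remove config (and data, with --force)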
---
 .agents/skills/test-backend/SKILL.md          |  70 ++++++++
 .../skills/test-backend/scripts/test-k3d.sh   | 153 ++++++++++++++++++
 .../skills/test-backend/scripts/test-k3s.sh   | 142 ++++++++++++++++
 3 files changed, 365 insertions(+)
 create mode 100644 .agents/skills/test-backend/SKILL.md
 create mode 100755 .agents/skills/test-backend/scripts/test-k3d.sh
 create mode 100755 .agents/skills/test-backend/scripts/test-k3s.sh

diff --git a/.agents/skills/test-backend/SKILL.md b/.agents/skills/test-backend/SKILL.md
new file mode 100644
index 0000000..2696c80
--- /dev/null
+++ b/.agents/skills/test-backend/SKILL.md
@@ -0,0 +1,70 @@
+---
+name: test-backend
+description: Launch and test the k3d or k3s backend lifecycle (init, up, kubectl, down, purge). Use when you want to run a full integration test of a stack backend.
+user_invocable: true
+metadata:
+  author: obol-team
+  version: "1.0.0"
+  domain: testing
+  triggers: test backend, test k3d, test k3s, integration test, flow test, backend test
+  role: tester
+  scope: validation
+  output-format: report
+---
+
+# Test Backend Skill
+
+Runs a full lifecycle integration test for the obol stack backend (k3d or k3s).
+
+## Arguments
+
+The skill accepts an optional argument specifying which backend to test:
+
+- `k3s` - Test the k3s (bare-metal) backend only
+- `k3d` - Test the k3d (Docker-based) backend only
+- `all` - Test both backends sequentially (default)
+- No argument defaults to `all`
+
+Examples:
+- `/test-backend k3s`
+- `/test-backend k3d`
+- `/test-backend all`
+- `/test-backend` (same as `all`)
+
+## Workflow
+
+### 1. Pre-flight
+
+- Build the obol binary: `go build -o .workspace/bin/obol ./cmd/obol` from the project root
+- Verify the binary was created successfully
+- Set `OBOL_DEVELOPMENT=true` and add `.workspace/bin` to PATH
+
+### 2. Run Test Script
+
+Based on the argument, run the appropriate test script(s) located alongside this skill:
+
+- **k3s**: Run `.agents/skills/test-backend/scripts/test-k3s.sh`
+- **k3d**: Run `.agents/skills/test-backend/scripts/test-k3d.sh`
+- **all**: Run k3s first, then k3d (k3s requires sudo, so test it first while credentials are fresh)
+
+Execute the script via the Bash tool from the project root directory. The scripts require:
+- **k3s**: Linux, sudo access, k3s binary in `.workspace/bin/`
+- **k3d**: Docker running, k3d binary in `.workspace/bin/`
+
+### 3. Report Results
+
+After each script completes, report:
+- Total pass/fail counts (shown in the RESULTS line; see the example below)
+- Any specific test failures with their names
+- Overall verdict: all green or needs attention
+
+If a test script fails (non-zero exit), read the output to identify which test(s) failed and summarize.
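+
+Each script ends with a summary line in this shape (timestamps and
+counts here are illustrative, not expected values):
+
+```
+14:02:31 K3d RESULTS: 18 passed, 0 failed
+14:05:44 K3s RESULTS: 20 passed, 0 failed
+```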
+ +## Important Notes + +- The k3s backend requires **sudo access** - the user may need to enter their password +- The k3d backend requires **Docker to be running** +- Each test script performs its own cleanup (purge) before and after +- Tests are sequential and ordered: init -> up -> verify -> down -> restart -> purge +- Typical runtime: ~2-4 minutes per backend +- If the environment has issues (Docker not starting, k3s not installing), report the problem clearly rather than retrying endlessly diff --git a/.agents/skills/test-backend/scripts/test-k3d.sh b/.agents/skills/test-backend/scripts/test-k3d.sh new file mode 100755 index 0000000..9657254 --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3d.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3d Backend Integration Test +# Requires: Docker running, k3d binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! "$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3d_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +# Pre-flight: verify Docker is running +if ! docker info >/dev/null 2>&1; then + log "ERROR: Docker is not running. Start Docker and try again." + exit 1 +fi + +log "=========================================" +log "K3d Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init (default = k3d) --- +log "" +log "--- TEST 1: stack init (default = k3d) ---" +check "stack init" $OBOL stack init +check "k3d.yaml exists" test -f .workspace/config/k3d.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3d" test "$BACKEND" = "k3d" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init + +# --- TEST 3: stack init --force --- +log "" +log "--- TEST 3: stack init --force ---" +$OBOL stack init --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml + +# Wait for nodes to be ready (k3d can take a moment) +log " Waiting for nodes to be ready..." 
+DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d is functional (nodes ready)" k3d_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack down --- +log "" +log "--- TEST 6: stack down ---" +check "stack down" $OBOL stack down +check "config preserved after down" test -f .workspace/config/.stack-id + +# Verify cluster stopped (kubectl should fail) +sleep 2 +check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers + +# --- TEST 7: stack down already stopped --- +log "" +log "--- TEST 7: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 8: stack up (restart after down) --- +log "" +log "--- TEST 8: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up + +# Wait for nodes to be ready after restart +log " Waiting for nodes to be ready..." +DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d functional after restart" k3d_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 9: stack purge --- +log "" +log "--- TEST 9: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id + +# --- TEST 10: full cycle + purge --force --- +log "" +log "--- TEST 10: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3d RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/.agents/skills/test-backend/scripts/test-k3s.sh b/.agents/skills/test-backend/scripts/test-k3s.sh new file mode 100755 index 0000000..1dcaac4 --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3s.sh @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3s Backend Integration Test +# Requires: Linux, sudo access, k3s binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! 
"$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3s_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +log "=========================================" +log "K3s Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init --backend k3s --- +log "" +log "--- TEST 1: stack init --backend k3s ---" +check "stack init --backend k3s" $OBOL stack init --backend k3s +check "k3s-config.yaml exists" test -f .workspace/config/k3s-config.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3s" test "$BACKEND" = "k3s" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init --backend k3s + +# --- TEST 3: stack init --force (should preserve stack ID) --- +log "" +log "--- TEST 3: stack init --force (should preserve stack ID) ---" +$OBOL stack init --backend k3s --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "PID file exists" test -f .workspace/config/.k3s.pid +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml +check "k3s is functional (nodes ready)" k3s_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack up idempotent (already running) --- +log "" +log "--- TEST 6: stack up idempotent ---" +OLD_PID=$(cat .workspace/config/.k3s.pid) +check "stack up while running" $OBOL stack up +NEW_PID=$(cat .workspace/config/.k3s.pid) +check "PID unchanged (idempotent) ($OLD_PID = $NEW_PID)" test "$OLD_PID" = "$NEW_PID" + +# --- TEST 7: stack down --- +log "" +log "--- TEST 7: stack down ---" +check "stack down" $OBOL stack down +check "PID file cleaned up" test ! -f .workspace/config/.k3s.pid +check "config preserved after down" test -f .workspace/config/.stack-id +log " Waiting for API server to become unreachable..." 
+sleep 5 +check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers + +# --- TEST 8: stack down again (already stopped) --- +log "" +log "--- TEST 8: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 9: stack up (restart after down) --- +log "" +log "--- TEST 9: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up +check "PID file exists after restart" test -f .workspace/config/.k3s.pid +check "k3s functional after restart" k3s_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 10: stack purge (without --force) --- +log "" +log "--- TEST 10: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id +check "k3s pid file removed" test ! -f .workspace/config/.k3s.pid + +# --- TEST 11: full cycle + purge --force --- +log "" +log "--- TEST 11: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init --backend k3s +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3s RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi From af08e4f959d6ec43c7df827bdb8134bb6f430142 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Mon, 12 Jan 2026 12:26:49 +0400 Subject: [PATCH 3/5] chore: upgrade pinned dependency versions in obolup.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update dependency versions to latest stable releases: - kubectl: 1.31.0 → 1.35.0 - helm: 3.19.1 → 3.19.4 - helmfile: 1.2.2 → 1.2.3 - k9s: 0.32.5 → 0.50.18 - helm-diff: 3.9.11 → 3.14.1 k3d remains at 5.8.3 (already current). --- obolup.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/obolup.sh b/obolup.sh index 2741a53..f6430ab 100755 --- a/obolup.sh +++ b/obolup.sh @@ -49,12 +49,12 @@ fi # Pinned dependency versions # Update these versions to upgrade dependencies across all installations -readonly KUBECTL_VERSION="1.31.0" -readonly HELM_VERSION="3.19.1" +readonly KUBECTL_VERSION="1.35.0" +readonly HELM_VERSION="3.19.4" readonly K3D_VERSION="5.8.3" -readonly HELMFILE_VERSION="1.2.2" -readonly K9S_VERSION="0.32.5" -readonly HELM_DIFF_VERSION="3.9.11" +readonly HELMFILE_VERSION="1.2.3" +readonly K9S_VERSION="0.50.18" +readonly HELM_DIFF_VERSION="3.14.1" # Repository URL for building from source readonly OBOL_REPO_URL="git@github.com:ObolNetwork/obol-stack.git" From e16135d74afeaf7477abb8704d020071fae221fc Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 27 Jan 2026 12:47:15 +0100 Subject: [PATCH 4/5] docs: update CLAUDE.md with new dependency versions Update documentation to reflect the upgraded dependency versions in obolup.sh. This keeps the documentation in sync with the actual pinned versions used by the bootstrap installer. 
---
 CLAUDE.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index bc40752..8aa79e8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -58,12 +58,12 @@ Uses local workspace:
 
 **Pinned versions** (lines 50-57):
 ```bash
-KUBECTL_VERSION="1.31.0"
-HELM_VERSION="3.16.2"
+KUBECTL_VERSION="1.35.0"
+HELM_VERSION="3.19.4"
 K3D_VERSION="5.8.3"
-HELMFILE_VERSION="0.169.1"
-K9S_VERSION="0.32.5"
-HELM_DIFF_VERSION="3.9.11"
+HELMFILE_VERSION="1.2.3"
+K9S_VERSION="0.50.18"
+HELM_DIFF_VERSION="3.14.1"
 ```
 
 **Smart installation logic**:
@@ -811,12 +811,12 @@ obol network delete ethereum- --force
 - Go 1.21+ (for building from source)
 
 **Installed by obolup.sh**:
-- kubectl 1.31.0
-- helm 3.16.2
+- kubectl 1.35.0
+- helm 3.19.4
 - k3d 5.8.3
-- helmfile 0.169.1
-- k9s 0.32.5
-- helm-diff plugin 3.9.11
+- helmfile 1.2.3
+- k9s 0.50.18
+- helm-diff plugin 3.14.1
 
 **Go dependencies** (key packages):
 - `github.com/urfave/cli/v2` - CLI framework

From 46481830a6c215d2fe2ea0b9beffa113024c42ec Mon Sep 17 00:00:00 2001
From: bussyjd
Date: Fri, 13 Feb 2026 18:01:07 +0400
Subject: [PATCH 5/5] fix(stack): prevent process group kill from crashing desktop session

The k3s Down() method was using kill -TERM with a negative PID
(process group kill), which could kill unrelated system processes
like systemd-logind sharing the same process group as the sudo
wrapper. This caused the entire desktop session to crash.

Changes:
- Kill only the specific sudo/k3s process, not the process group
- Remove unused Setpgid/syscall since we no longer use process groups
- Add containerd-shim cleanup fallback for binary-only k3s installs
- Add 600s helm timeout for kube-prometheus-stack deployment
- Disable admission webhook pre-install hooks that time out on fresh k3s
- Fix flaky test: replace fixed sleep with polling loop for API shutdown
---
 .../skills/test-backend/scripts/test-k3s.sh        | 11 ++++++--
 .../embed/infrastructure/helmfile.yaml.gotmpl      |  1 +
 .../values/monitoring.yaml.gotmpl                  |  4 +++
 internal/stack/backend_k3s.go                      | 28 +++++++++++++------
 4 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/.agents/skills/test-backend/scripts/test-k3s.sh b/.agents/skills/test-backend/scripts/test-k3s.sh
index 1dcaac4..03e3bca 100755
--- a/.agents/skills/test-backend/scripts/test-k3s.sh
+++ b/.agents/skills/test-backend/scripts/test-k3s.sh
@@ -97,8 +97,15 @@ check "stack down" $OBOL stack down
 check "PID file cleaned up" test ! -f .workspace/config/.k3s.pid
 check "config preserved after down" test -f .workspace/config/.stack-id
 log "  Waiting for API server to become unreachable..."
-sleep 5
-check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers
+API_DOWN=false
+for i in $(seq 1 15); do
+  if ! 
$OBOL kubectl get nodes --no-headers 2>/dev/null; then + API_DOWN=true + break + fi + sleep 2 +done +check "kubectl unreachable after down" test "$API_DOWN" = "true" # --- TEST 8: stack down again (already stopped) --- log "" diff --git a/internal/embed/infrastructure/helmfile.yaml.gotmpl b/internal/embed/infrastructure/helmfile.yaml.gotmpl index d5b1d8a..1fd2e7e 100644 --- a/internal/embed/infrastructure/helmfile.yaml.gotmpl +++ b/internal/embed/infrastructure/helmfile.yaml.gotmpl @@ -35,6 +35,7 @@ releases: createNamespace: true chart: prometheus-community/kube-prometheus-stack version: 79.5.0 + timeout: 600 values: - ./values/monitoring.yaml.gotmpl diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl index d7a0dc1..a7a6095 100644 --- a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -20,6 +20,10 @@ prometheus: cpu: 500m memory: 1Gi +prometheusOperator: + admissionWebhooks: + enabled: false # Disable webhook pre-install hooks (avoids timeout on fresh k3s) + grafana: enabled: false # Enable when we want UI access diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go index 482d7e8..3325b13 100644 --- a/internal/stack/backend_k3s.go +++ b/internal/stack/backend_k3s.go @@ -8,7 +8,6 @@ import ( "runtime" "strconv" "strings" - "syscall" "time" "github.com/ObolNetwork/obol-stack/internal/config" @@ -131,8 +130,6 @@ func (b *K3sBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { ) cmd.Stdout = logFile cmd.Stderr = logFile - // Set process group so we can clean up child processes - cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} if err := cmd.Start(); err != nil { logFile.Close() @@ -207,14 +204,18 @@ func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { fmt.Printf("Stopping k3s (pid: %d)...\n", pid) - // Send SIGTERM to the process group for clean shutdown (negative PID = process group) - pgid := fmt.Sprintf("-%d", pid) - stopCmd := exec.Command("sudo", "kill", "-TERM", pgid) + // Send SIGTERM to the sudo/k3s process only (not the process group). + // Using negative PID (process group kill) is unsafe here because the saved PID + // is the sudo wrapper, whose process group can include unrelated system processes + // like systemd-logind — killing those crashes the desktop session. + // sudo forwards SIGTERM to k3s, which handles its own child process cleanup. + pidStr := strconv.Itoa(pid) + stopCmd := exec.Command("sudo", "kill", "-TERM", pidStr) stopCmd.Stdout = os.Stdout stopCmd.Stderr = os.Stderr if err := stopCmd.Run(); err != nil { - fmt.Printf("SIGTERM to process group failed, sending SIGKILL: %v\n", err) - exec.Command("sudo", "kill", "-9", pgid).Run() + fmt.Printf("SIGTERM failed, sending SIGKILL: %v\n", err) + exec.Command("sudo", "kill", "-9", pidStr).Run() } // Wait for process to exit (up to 30 seconds) @@ -226,7 +227,8 @@ func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { time.Sleep(1 * time.Second) } - // Run k3s-killall.sh if available (cleans up containerd/iptables) + // Clean up orphaned k3s child processes (containerd-shim, etc.) + // Use k3s-killall.sh if available, otherwise kill containerd shims directly. 
killallPath := "/usr/local/bin/k3s-killall.sh" if _, err := os.Stat(killallPath); err == nil { fmt.Println("Running k3s cleanup...") @@ -234,6 +236,14 @@ func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { cleanCmd.Stdout = os.Stdout cleanCmd.Stderr = os.Stderr cleanCmd.Run() + } else { + // k3s-killall.sh not installed (binary-only install via obolup). + // Kill orphaned containerd-shim processes that use the k3s socket. + fmt.Println("Cleaning up k3s child processes...") + exec.Command("sudo", "pkill", "-TERM", "-f", "containerd-shim.*k3s").Run() + time.Sleep(2 * time.Second) + // Force-kill any that survived SIGTERM + exec.Command("sudo", "pkill", "-KILL", "-f", "containerd-shim.*k3s").Run() } b.removePidFile(cfg)