diff --git a/.github/workflows/ecr.yml b/.github/workflows/ecr.yml index ece11d73..5192a748 100644 --- a/.github/workflows/ecr.yml +++ b/.github/workflows/ecr.yml @@ -40,24 +40,8 @@ jobs: cache-from: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared cache-to: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared,mode=max - # Monolithic Workflow-Task primitive binary (keygen, provision-snd, - # runner, ...) per https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/test-harness/test-harness-lld.md. Published to - # sei/seitask-runner so scenarios keep the image-name muscle memory; - # the runner capability moves to args: ["runner", ...]. - - name: Build and push seitask image - uses: docker/build-push-action@v6 - with: - context: . - file: cmd/seitask/Dockerfile - push: true - platforms: linux/amd64 - tags: ${{ steps.ecr-login.outputs.registry }}/sei/seitask-runner:${{ inputs.tag || github.sha }} - cache-from: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared - cache-to: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared,mode=max - - # The Go-native integration harness (go test -c -tags integration), run by - # one CronJob per target (-test.run TestX). Replaces seitask-runner + the - # Chaos-Mesh Workflow scenarios once the nightly CronJobs cut over. + # The integration test suite compiled to an image (go test -c -tags + # integration), run by one nightly CronJob per target (-test.run TestX). - name: Build and push integration-harness image uses: docker/build-push-action@v6 with: diff --git a/CLAUDE.md b/CLAUDE.md index cd3855d5..d4edb250 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -64,7 +64,7 @@ setCondition(obj, ConditionNetworkingReady, metav1.ConditionFalse, The narrow exceptions to the always-present rule: - **`*Needed`-style conditions** where `True` is the exception and `False` would be tautological with the absence of the feature. No current instances in this codebase; the exception is retained for future conditions where it genuinely fits. -- **`kubectl wait` consumer conditions** where present-vs-absent semantics are explicitly load-bearing. `SeiNodeTask.Status.Conditions[Ready|Failed]` is documented as latch-on-terminal-state because the seitask-runner depends on `kubectl wait --for=condition=Ready=true` (which matches `True` only) and `--for=condition=Failed=true` as the dual exit signal. The Ready+Failed pair is the documented exception to the "no mixed polarities for the same subject" rule below — both latch independently on terminal state. +- **`kubectl wait` consumer conditions** where present-vs-absent semantics are explicitly load-bearing. `SeiNodeTask.Status.Conditions[Ready|Failed]` is documented as latch-on-terminal-state because consumers wait on `kubectl wait --for=condition=Ready=true` (which matches `True` only) and `--for=condition=Failed=true` as the dual exit signal. The Ready+Failed pair is the documented exception to the "no mixed polarities for the same subject" rule below — both latch independently on terminal state. Any new condition that doesn't fit one of these exceptions defaults to always-present. diff --git a/Makefile b/Makefile index 0ce83ae5..320f7dd7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,4 @@ IMG ?= sei-k8s-controller:latest -RUNNER_IMG ?= seitask-runner:latest GOLANGCI_LINT ?= $(shell which golangci-lint 2>/dev/null || echo $(HOME)/go/bin/golangci-lint) # Pinned tool versions. Bump together: setup-envtest's release branch tracks @@ -18,14 +17,11 @@ CONTROLLER_GEN_VERSION ?= v0.20.1 LOCALBIN ?= $(CURDIR)/bin SETUP_ENVTEST ?= $(LOCALBIN)/setup-envtest -.PHONY: build runner test test-integration test-all lint manifests generate verify-generated setup-envtest ci docker-build docker-push runner-image runner-push +.PHONY: build test test-integration test-all lint manifests generate verify-generated setup-envtest ci docker-build docker-push build: ## Build manager binary. go build -o bin/manager ./cmd/ -runner: ## Build seitask-runner binary. - go build -o bin/seitask-runner ./cmd/runner/ - test: ## Run tests. go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out @@ -69,9 +65,3 @@ docker-build: ## Build docker image. docker-push: ## Push docker image. docker push ${IMG} - -runner-image: ## Build seitask-runner container image. - docker build --platform linux/amd64 -t ${RUNNER_IMG} -f runner/Dockerfile . - -runner-push: ## Push seitask-runner container image. - docker push ${RUNNER_IMG} diff --git a/api/v1alpha1/seinodetask_types.go b/api/v1alpha1/seinodetask_types.go index 87f8b926..1f611ce9 100644 --- a/api/v1alpha1/seinodetask_types.go +++ b/api/v1alpha1/seinodetask_types.go @@ -85,8 +85,7 @@ const ( const ( // ConditionSeiNodeTaskReady reflects whether the task has reached a // terminal successful state. True only when status.phase == Complete. - // Load-bearing for `kubectl wait --for=condition=Ready=true` in the - // seitask-runner. + // Load-bearing for `kubectl wait --for=condition=Ready=true`. ConditionSeiNodeTaskReady = "Ready" // ConditionSeiNodeTaskFailed reflects whether the task has reached a @@ -131,7 +130,7 @@ type SeiNodeTaskSpec struct { // Target identifies the single SeiNode this task operates on. Fan-out // targeting (label selectors) is intentionally out of scope at the CRD - // layer — express fan-out at the seitask-runner / Chaos Workflow layer. + // layer — express fan-out in the orchestrating caller (one task per node). Target SeiNodeTaskTarget `json:"target"` // TimeoutSeconds bounds execution time, measured from @@ -179,8 +178,8 @@ type SeiNodeTaskSpec struct { } // SeiNodeTaskTarget identifies the single SeiNode this task operates on. -// Selector-based fan-out is intentionally out of scope for MVP — express -// multi-node operations at the seitask-runner / Chaos Workflow layer. +// Selector-based fan-out is intentionally out of scope — express multi-node +// operations in the orchestrating caller (one task per node). type SeiNodeTaskTarget struct { // NodeRef is a same-namespace reference to a SeiNode. NodeRef SeiNodeTaskNodeRef `json:"nodeRef"` diff --git a/cmd/seitask/Dockerfile b/cmd/seitask/Dockerfile deleted file mode 100644 index 46a72dd8..00000000 --- a/cmd/seitask/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM golang:1.26 AS builder -ARG TARGETOS -ARG TARGETARCH - -WORKDIR /workspace -COPY go.mod go.mod -COPY go.sum go.sum -RUN go mod download - -COPY . . - -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ - go build -a -ldflags="-s -w" -o seitask ./cmd/seitask/ - -FROM gcr.io/distroless/static-debian12:nonroot -WORKDIR / -COPY --from=builder /workspace/seitask /seitask -# Runner templates (SeiNodeTask CRs) at the conventional mount path. -COPY --from=builder /workspace/runner/templates /templates -# Per-scenario SeiNetwork templates (consumed by provision-snd's --template). -# Add a COPY line per scenario; Workflow YAMLs reference paths under -# /scenarios//. -COPY --from=builder /workspace/scenarios/release-test /scenarios/release-test -COPY --from=builder /workspace/scenarios/load-test /scenarios/load-test -COPY --from=builder /workspace/scenarios/major-upgrade /scenarios/major-upgrade -USER 65532:65532 - -ENTRYPOINT ["/seitask"] diff --git a/cmd/seitask/keygen.go b/cmd/seitask/keygen.go deleted file mode 100644 index 6fcd5dfb..00000000 --- a/cmd/seitask/keygen.go +++ /dev/null @@ -1,54 +0,0 @@ -package main - -import ( - "context" - "log" - - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/keygen" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newKeygenCommand() *cli.Command { - return &cli.Command{ - Name: "keygen", - Usage: "Generate a fresh BIP-39 mnemonic and cosmos secp256k1 keypair, write it to a " + - "per-run Secret, and stamp the address into workflow-vars", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "key-name", - Aliases: []string{"k"}, - Usage: "Logical identity name; Secret is named -", - Sources: cli.EnvVars("KEY_NAME"), - Value: "admin", - Required: false, - }, - }, - Action: runKeygen, - } -} - -func runKeygen(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - res, err := keygen.Run(ctx, c, keygen.Params{ - KeyName: cmd.String("key-name"), - Workflow: wf, - }) - if err != nil { - // Stamp EXIT_REASON so upload-report can recover the failure class. - taskruntime.WriteExitReason(ctx, c, wf, err) - return err - } - taskruntime.WriteExitReason(ctx, c, wf, nil) - log.Printf("keygen: created Secret %q with address %s", res.SecretName, res.Address) - return nil -} diff --git a/cmd/seitask/main.go b/cmd/seitask/main.go deleted file mode 100644 index 6102c2c2..00000000 --- a/cmd/seitask/main.go +++ /dev/null @@ -1,74 +0,0 @@ -// Command seitask is the monolithic Workflow-Task primitive binary: one -// binary, multiple urfave/cli subcommands (keygen, provision-snd, -// provision-node, …) that -// share the internal/taskruntime shared library. See -// https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/test-harness/test-harness-lld.md. -package main - -import ( - "context" - "fmt" - "log" - "os" - "os/signal" - "syscall" - - "github.com/urfave/cli/v3" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// taskScheme is the controller-runtime client scheme for every seitask -// subcommand: builtin K8s types + sei.io/v1alpha1 (SeiNetwork, -// SeiNodeTask, SeiNode) so typed Create/Get round-trips work. Chaos Mesh -// CRs are read via unstructured so they're not registered here. -var taskScheme = func() *runtime.Scheme { - s := runtime.NewScheme() - utilruntime.Must(clientgoscheme.AddToScheme(s)) - utilruntime.Must(seiv1alpha1.AddToScheme(s)) - return s -}() - -func main() { - ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) - defer stop() - - app := &cli.Command{ - Name: "seitask", - Usage: "Workflow Task primitives for the sei-k8s-controller test harness", - Commands: []*cli.Command{ - newKeygenCommand(), - newProvisionSNDCommand(), - newProvisionNodeCommand(), - newRunnerCommand(), - newUploadReportCommand(), - }, - } - - if err := app.Run(ctx, os.Args); err != nil { - // Subcommands wrap with taskruntime.Infra / taskruntime.Task so this - // mapping reaches the right 0/1/2 exit code. - log.Printf("seitask: %v", err) - os.Exit(taskruntime.ExitCodeFor(err)) - } -} - -// kubeClientFromEnv uses standard ctrl.GetConfig discovery (in-cluster SA -// → $KUBECONFIG → ~/.kube/config). -func kubeClientFromEnv() (client.Client, error) { - cfg, err := ctrl.GetConfig() - if err != nil { - return nil, taskruntime.Infra(fmt.Errorf("loading kubeconfig: %w", err)) - } - c, err := client.New(cfg, client.Options{Scheme: taskScheme}) - if err != nil { - return nil, taskruntime.Infra(fmt.Errorf("building client: %w", err)) - } - return c, nil -} diff --git a/cmd/seitask/main_test.go b/cmd/seitask/main_test.go deleted file mode 100644 index 9505a5c7..00000000 --- a/cmd/seitask/main_test.go +++ /dev/null @@ -1,47 +0,0 @@ -package main - -import ( - "testing" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" -) - -const apiGroup = "sei.io" - -// TestTaskScheme_RoundTripsSeiNetwork would have caught the first manual fire's -// `no kind is registered for the type v1alpha1.SeiNetwork in scheme` -// regression at `go test`, not at first cluster fire. Asserts the -// package-level taskScheme has every type provision-snd / keygen / -// upload-report constructs via typed Get/Create. -func TestTaskScheme_RoundTripsSeiNetwork(t *testing.T) { - gvks, _, err := taskScheme.ObjectKinds(&seiv1alpha1.SeiNetwork{}) - if err != nil { - t.Fatalf("SeiNetwork not registered in taskScheme: %v", err) - } - if len(gvks) == 0 { - t.Fatalf("no GVKs returned for SeiNetwork") - } - if gvks[0].Group != apiGroup || gvks[0].Version != "v1alpha1" { - t.Fatalf("SeiNetwork GVK: %+v; want sei.io/v1alpha1", gvks[0]) - } -} - -func TestTaskScheme_RoundTripsSeiNode(t *testing.T) { - gvks, _, err := taskScheme.ObjectKinds(&seiv1alpha1.SeiNode{}) - if err != nil { - t.Fatalf("SeiNode not registered in taskScheme: %v", err) - } - if len(gvks) == 0 || gvks[0].Group != apiGroup || gvks[0].Version != "v1alpha1" { - t.Fatalf("SeiNode GVK wrong: %+v; want sei.io/v1alpha1", gvks) - } -} - -func TestTaskScheme_RoundTripsSeiNodeTask(t *testing.T) { - gvks, _, err := taskScheme.ObjectKinds(&seiv1alpha1.SeiNodeTask{}) - if err != nil { - t.Fatalf("SeiNodeTask not registered in taskScheme: %v", err) - } - if len(gvks) == 0 || gvks[0].Group != apiGroup { - t.Fatalf("SeiNodeTask GVK wrong: %+v", gvks) - } -} diff --git a/cmd/seitask/provision_node.go b/cmd/seitask/provision_node.go deleted file mode 100644 index 7edc1463..00000000 --- a/cmd/seitask/provision_node.go +++ /dev/null @@ -1,121 +0,0 @@ -package main - -import ( - "context" - "log" - "strings" - "time" - - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/provisionnode" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// Flag names shared across the template-rendering subcommands (provision-snd, -// provision-node, runner), declared once so goconst stays green. -const ( - flagTemplate = "template" - flagVar = "var" -) - -func newProvisionNodeCommand() *cli.Command { - return &cli.Command{ - Name: "provision-node", - Usage: "Fan out N standalone SeiNode followers from a template, wait for " + - "Running + per-node TM/EVM readiness, and publish role-scoped endpoints", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "role", - Usage: "Role tag for workflow-vars keys (e.g. rpc); uppercased to RPC_*", - Sources: cli.EnvVars("ROLE"), - Required: true, - }, - &cli.StringFlag{ - Name: "name", - Usage: "Base name; the N followers are -0..-(N-1) (defaults to -)", - Sources: cli.EnvVars("NODE_NAME"), - }, - &cli.StringFlag{ - Name: flagTemplate, - Usage: "Path to the Go text/template producing one kind: SeiNode YAML", - Sources: cli.EnvVars("NODE_TEMPLATE"), - Required: true, - }, - &cli.StringSliceFlag{ - Name: flagVar, - Usage: "KEY=VALUE substitution as .KEY (repeatable); .ORDINAL and .NODE_NAME are runtime-injected", - }, - &cli.IntFlag{ - Name: "replicas", - Usage: "N: number of follower SeiNode CRs to fan out", - Sources: cli.EnvVars("NODE_REPLICAS"), - Value: 1, - }, - &cli.StringFlag{ - Name: "network", - Usage: "Genesis SeiNetwork to follow; drives peer auto-wiring + the sei.io/seinetwork object label", - Sources: cli.EnvVars("NETWORK"), - }, - &cli.StringFlag{ - Name: "network-namespace", - Usage: "Namespace of the genesis SeiNetwork for the synthesized peer selector (defaults to the workflow namespace)", - }, - &cli.DurationFlag{ - Name: "running-timeout", - Usage: "Max wait for all N SeiNodes to reach status.phase=Running", - Value: 15 * time.Minute, - }, - &cli.DurationFlag{ - Name: "first-block-timeout", - Usage: "Per-node post-Running readiness budget (TM /status height>0 and EVM eth_blockNumber 200)", - Value: 5 * time.Minute, - }, - &cli.DurationFlag{ - Name: "poll-interval", - Usage: "Status + RPC poll cadence", - Value: 5 * time.Second, - }, - }, - Action: runProvisionNode, - } -} - -func runProvisionNode(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - vars, err := parseKVPairs(cmd.StringSlice(flagVar)) - if err != nil { - return err - } - - p := provisionnode.Params{ - Role: cmd.String("role"), - Name: cmd.String("name"), - TemplatePath: cmd.String(flagTemplate), - Vars: vars, - Replicas: cmd.Int("replicas"), - Network: cmd.String("network"), - NetworkNamespace: cmd.String("network-namespace"), - RunningTimeout: cmd.Duration("running-timeout"), - FirstBlockTimeout: cmd.Duration("first-block-timeout"), - PollInterval: cmd.Duration("poll-interval"), - Workflow: wf, - } - res, err := provisionnode.Run(ctx, c, p) - if err != nil { - taskruntime.WriteExitReason(ctx, c, wf, err) - return err - } - taskruntime.WriteExitReason(ctx, c, wf, nil) - log.Printf("provision-node: %d SeiNode(s) Running [%s], chainID=%s, EVM_RPC_LIST=%s", - len(res.Names), strings.Join(res.Names, ","), res.ChainID, res.EVMRPCList) - return nil -} diff --git a/cmd/seitask/provision_snd.go b/cmd/seitask/provision_snd.go deleted file mode 100644 index 794f75be..00000000 --- a/cmd/seitask/provision_snd.go +++ /dev/null @@ -1,105 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - "strings" - "time" - - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/provisionsnd" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newProvisionSNDCommand() *cli.Command { - return &cli.Command{ - Name: "provision-snd", - Usage: "Render a SeiNetwork template, apply it, wait for Ready + " + - "first block, and publish role-scoped endpoints to workflow-vars", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "role", - Usage: "Role tag for workflow-vars keys (e.g. validator, rpc)", - Sources: cli.EnvVars("ROLE"), - Required: true, - }, - &cli.StringFlag{ - Name: "name", - Usage: "SeiNetwork metadata.name (defaults to -)", - Sources: cli.EnvVars("SND_NAME"), - }, - &cli.StringFlag{ - Name: flagTemplate, - Usage: "Path to the Go text/template producing a SeiNetwork YAML", - Sources: cli.EnvVars("SND_TEMPLATE"), - Required: true, - }, - &cli.StringSliceFlag{ - Name: flagVar, - Usage: "KEY=VALUE substitution exposed to the template as .KEY (repeatable)", - }, - &cli.DurationFlag{ - Name: "ready-timeout", - Usage: "Max wait for status.phase=Ready", - Value: 15 * time.Minute, - }, - &cli.DurationFlag{ - Name: "first-block-timeout", - Usage: "Max post-Ready wait for the chain to produce its first block", - Value: 5 * time.Minute, - }, - }, - Action: runProvisionSND, - } -} - -func runProvisionSND(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - vars, err := parseKVPairs(cmd.StringSlice(flagVar)) - if err != nil { - return err - } - - p := provisionsnd.Params{ - Role: cmd.String("role"), - Name: cmd.String("name"), - TemplatePath: cmd.String(flagTemplate), - Vars: vars, - ReadyTimeout: cmd.Duration("ready-timeout"), - FirstBlockTimeout: cmd.Duration("first-block-timeout"), - Workflow: wf, - } - res, err := provisionsnd.Run(ctx, c, p) - if err != nil { - taskruntime.WriteExitReason(ctx, c, wf, err) - return err - } - taskruntime.WriteExitReason(ctx, c, wf, nil) - log.Printf("provision-snd: SeiNetwork %q Ready, chainID=%s, TM=%s", res.Name, res.ChainID, res.Endpoints.TendermintRpc) - return nil -} - -func parseKVPairs(pairs []string) (map[string]string, error) { - if len(pairs) == 0 { - return nil, nil - } - out := make(map[string]string, len(pairs)) - for _, kv := range pairs { - idx := strings.IndexByte(kv, '=') - if idx <= 0 { - return nil, fmt.Errorf("--var %q must be KEY=VALUE", kv) - } - out[kv[:idx]] = kv[idx+1:] - } - return out, nil -} diff --git a/cmd/seitask/runner.go b/cmd/seitask/runner.go deleted file mode 100644 index a550c436..00000000 --- a/cmd/seitask/runner.go +++ /dev/null @@ -1,180 +0,0 @@ -package main - -import ( - "context" - "fmt" - "os" - "strings" - "time" - - "github.com/urfave/cli/v3" - "k8s.io/client-go/dynamic" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - - "github.com/sei-protocol/sei-k8s-controller/internal/runner" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// newRunnerCommand wires the legacy seitask-runner CLI as a subcommand of -// the monolithic seitask binary. Flag names and semantics match the old -// standalone binary so scenario YAMLs only need to prepend "runner" to args. -// Implementation delegates to internal/runner unchanged. -func newRunnerCommand() *cli.Command { - return &cli.Command{ - Name: "runner", - Usage: "Apply a SeiNodeTask CR from a template and poll until terminal", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: flagTemplate, - Usage: "Path to the Go text/template producing a SeiNodeTask manifest (required)", - Required: true, - }, - &cli.StringSliceFlag{ - Name: flagVar, - Usage: "KEY=VALUE substitution exposed to the template as .KEY (repeatable)", - }, - &cli.StringSliceFlag{ - Name: "output-jsonpath", - Usage: "JSONPath=ENV_VAR extraction (repeatable)", - }, - &cli.StringFlag{ - Name: "output-env-file", - Usage: "File to append extracted KEY=value pairs to on Complete", - Value: "/workflow/vars/env.sh", - }, - &cli.StringFlag{ - Name: "env-file", - Usage: "Env file to source before render (defaults to /workflow/vars/env.sh when present)", - }, - &cli.DurationFlag{ - Name: "timeout", - Usage: "Total poll timeout per SeiNodeTask", - Value: 10 * time.Minute, - }, - &cli.DurationFlag{ - Name: "poll-interval", - Usage: "Cadence the runner re-reads status.phase", - Value: 5 * time.Second, - }, - &cli.StringFlag{ - Name: "namespace", - Usage: "Namespace to apply into (defaults to the SA's namespace)", - }, - &cli.StringFlag{ - Name: "per-node-selector", - Usage: "Label selector for fan-out over SeiNodes. Empty = single-node mode", - }, - &cli.StringFlag{ - Name: "fanout-mode", - Usage: "all-must-succeed | best-effort | quorum:N", - Value: "all-must-succeed", - }, - &cli.StringFlag{ - Name: "kubeconfig", - Usage: "Path to kubeconfig (defaults to in-cluster config)", - }, - }, - Action: runRunner, - } -} - -func runRunner(ctx context.Context, cmd *cli.Command) error { - varMap, err := parseKVSlice(cmd.StringSlice(flagVar)) - if err != nil { - return err - } - if cmd.String("per-node-selector") != "" { - if _, ok := varMap["NODE"]; ok { - return fmt.Errorf("--per-node-selector is incompatible with --var NODE=...; the runner sets .NODE per match") - } - } - ns, err := resolveRunnerNamespace(cmd.String("namespace")) - if err != nil { - return err - } - cfg, err := loadRunnerKubeConfig(cmd.String("kubeconfig")) - if err != nil { - return fmt.Errorf("load kube config: %w", err) - } - dyn, err := dynamic.NewForConfig(cfg) - if err != nil { - return fmt.Errorf("create dynamic client: %w", err) - } - - // Load the parent Workflow's identity so applied SeiNodeTask CRs - // carry an ownerRef to it — deleting the Workflow then cascades the - // per-step SeiNodeTasks. Matches the keygen / provision-snd pattern. - cliClient, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, cliClient) - if err != nil { - return err - } - ownerRef := wf.OwnerRef() - - r := &runner.Run{ - Opts: runner.Options{ - TemplatePath: cmd.String(flagTemplate), - Vars: varMap, - OutputJSONPaths: cmd.StringSlice("output-jsonpath"), - OutputEnvFile: cmd.String("output-env-file"), - EnvFile: cmd.String("env-file"), - Timeout: cmd.Duration("timeout"), - PollInterval: cmd.Duration("poll-interval"), - Namespace: ns, - PerNodeSelector: cmd.String("per-node-selector"), - FanoutMode: cmd.String("fanout-mode"), - }, - Stdout: os.Stdout, - Stderr: os.Stderr, - Renderer: runner.DefaultRenderer{OwnerRef: &ownerRef}, - Applier: runner.DynamicApplier{Client: dyn}, - Poller: runner.DynamicPoller{Client: dyn}, - Lister: runner.DynamicNodeLister{Client: dyn}, - Sourcer: runner.FileEnvSourcer{}, - Writer: runner.FileEnvWriter{}, - } - return r.Execute(ctx) -} - -func parseKVSlice(in []string) (map[string]string, error) { - out := map[string]string{} - for _, v := range in { - idx := strings.IndexByte(v, '=') - if idx <= 0 { - return nil, fmt.Errorf("--var %q must be KEY=VALUE", v) - } - out[v[:idx]] = v[idx+1:] - } - return out, nil -} - -// resolveRunnerNamespace falls back to the in-pod SA namespace file when -// --namespace is empty, matching kubectl's resolution order. -func resolveRunnerNamespace(flagNS string) (string, error) { - if flagNS != "" { - return flagNS, nil - } - const saNS = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" - b, err := os.ReadFile(saNS) - if err == nil { - return strings.TrimSpace(string(b)), nil - } - return "", fmt.Errorf("--namespace not provided and SA namespace file unreadable: %w", err) -} - -func loadRunnerKubeConfig(kubeconfig string) (*rest.Config, error) { - if kubeconfig != "" { - return clientcmd.BuildConfigFromFlags("", kubeconfig) - } - if cfg, err := rest.InClusterConfig(); err == nil { - return cfg, nil - } - return clientcmd.NewNonInteractiveDeferredLoadingClientConfig( - clientcmd.NewDefaultClientConfigLoadingRules(), - &clientcmd.ConfigOverrides{}, - ).ClientConfig() -} diff --git a/cmd/seitask/upload_report.go b/cmd/seitask/upload_report.go deleted file mode 100644 index f4c4760d..00000000 --- a/cmd/seitask/upload_report.go +++ /dev/null @@ -1,86 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - - "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/uploadreport" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newUploadReportCommand() *cli.Command { - return &cli.Command{ - Name: "upload-report", - Usage: "Upload Workflow resource snapshot (workflow-vars, Workflow CR, WorkflowNode tree) " + - "to S3; exit code mirrors the EXIT_REASON workflow-vars key. Pod logs are not uploaded; " + - "Loki already ingests them.", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "bucket", - Usage: "S3 bucket to upload to", - Sources: cli.EnvVars("S3_BUCKET"), - Required: true, - }, - &cli.StringFlag{ - Name: "prefix", - Usage: "S3 key prefix (typically ${NAMESPACE}/${SCENARIO}/${RUN_ID})", - Sources: cli.EnvVars("S3_PREFIX"), - Required: true, - }, - &cli.StringFlag{ - Name: "region", - Usage: "AWS region", - Sources: cli.EnvVars("AWS_REGION"), - Value: "eu-central-1", - }, - }, - Action: runUploadReport, - } -} - -func runUploadReport(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - awsCfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(cmd.String("region"))) - if err != nil { - return taskruntime.Infra(fmt.Errorf("loading AWS config: %w", err)) - } - s3client := s3.NewFromConfig(awsCfg) - - // upload-report is the terminal observer — never writes EXIT_REASON - // itself. An infra-fail in the upload would otherwise overwrite a - // genuine upstream task-fail and lose the underlying classification. - res, err := uploadreport.Run(ctx, c, uploadreport.Params{ - Bucket: cmd.String("bucket"), - Prefix: cmd.String("prefix"), - Workflow: wf, - S3: uploadreport.NewS3Uploader(s3client), - }) - if err != nil { - return err - } - log.Printf("upload-report: uploaded %d artifacts; upstream exit-reason=%s", - len(res.UploadedKeys), res.ExitReason) - - // Mirror upstream verdict so the Workflow's terminal phase reflects - // scenario outcome rather than upload-step success. - switch res.ExitReason { - case taskruntime.ExitReasonInfraFail: - return taskruntime.Infra(fmt.Errorf("upstream task reported infra-fail")) - case taskruntime.ExitReasonTaskFail: - return taskruntime.Task(fmt.Errorf("upstream task reported task-fail")) - } - return nil -} diff --git a/config/crd/sei.io_seinodetasks.yaml b/config/crd/sei.io_seinodetasks.yaml index 351caca6..4f595fc6 100644 --- a/config/crd/sei.io_seinodetasks.yaml +++ b/config/crd/sei.io_seinodetasks.yaml @@ -349,7 +349,7 @@ spec: description: |- Target identifies the single SeiNode this task operates on. Fan-out targeting (label selectors) is intentionally out of scope at the CRD - layer — express fan-out at the seitask-runner / Chaos Workflow layer. + layer — express fan-out in the orchestrating caller (one task per node). properties: nodeRef: description: NodeRef is a same-namespace reference to a SeiNode. diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9dccc620..76cd7fd3 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -67,14 +67,6 @@ rules: - patch - update - watch -- apiGroups: - - chaos-mesh.org - resources: - - workflownodes - - workflows - verbs: - - get - - list - apiGroups: - sei.io resources: diff --git a/internal/keygen/keygen.go b/internal/keygen/keygen.go index a9c9e250..1c715286 100644 --- a/internal/keygen/keygen.go +++ b/internal/keygen/keygen.go @@ -6,8 +6,8 @@ // keyring. // // This is the general, k8s-free derivation primitive. Callers that need to stamp -// the result into a Secret / workflow-vars layer sit on top of it — see -// internal/seitask/keygen for the seitask-runner's Secret writer. +// the result into a Secret layer it on top (the integration harness writes a +// per-run Secret the release-test pod reads via secretKeyRef). package keygen import ( diff --git a/internal/runner/apply.go b/internal/runner/apply.go deleted file mode 100644 index 417fcd80..00000000 --- a/internal/runner/apply.go +++ /dev/null @@ -1,230 +0,0 @@ -package runner - -import ( - "bytes" - "context" - "crypto/sha256" - "encoding/hex" - "fmt" - "os" - "sort" - "strings" - "text/template" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/dynamic" - "sigs.k8s.io/yaml" -) - -const ( - // FieldOwner is the field-manager string used for server-side apply. - // Distinct from "seinode-task-controller" so apply diffs are attributable - // to the runner versus the reconciler. - FieldOwner = "seitask-runner" - - // shortHashLen is the number of hex chars taken from the SHA-256 of the - // (kind|vars|node) tuple to form the metadata.name suffix. - shortHashLen = 10 -) - -// DefaultRenderer renders Go text/template files. The resulting manifest is -// parsed back to assert it is a SeiNodeTask, and metadata.name is -// rewritten to a deterministic value derived from (kind, vars, NODE) so -// re-applies hit the same CR. When OwnerRef is non-nil, it replaces (not -// merges) ownerReferences so the rendered SeiNodeTask cascades on parent -// Workflow deletion. -type DefaultRenderer struct { - // OwnerRef, when non-nil, is stamped onto the rendered manifest as - // the sole entry of metadata.ownerReferences. The runner subcommand - // populates it from taskruntime.LoadWorkflowIdentity at startup. - OwnerRef *metav1.OwnerReference -} - -// Render parses templatePath as a Go text/template and executes it against -// vars. The template author can use {{ .NODE }}, {{ .PROPOSAL_ID }}, etc. -// All keys from vars are exposed as top-level fields with .KEY. -// -// After rendering, the metadata.name is replaced with -// "--" (NODE omitted if empty), where the hash -// covers the template content + sorted vars. This guarantees re-applies -// with identical inputs target the same CR (Workflow restart idempotency). -func (r DefaultRenderer) Render(templatePath string, vars map[string]string) ([]byte, string, error) { - raw, err := os.ReadFile(templatePath) //nolint:gosec // path is operator-controlled CLI arg - if err != nil { - return nil, "", fmt.Errorf("read template: %w", err) - } - return RenderBytes(templatePath, raw, vars, r.OwnerRef) -} - -// RenderBytes is the byte-input variant of Render, exposed for tests. -// When ownerRef is non-nil, it replaces (not merges) ownerReferences on -// the rendered manifest. -func RenderBytes(name string, raw []byte, vars map[string]string, ownerRef *metav1.OwnerReference) ([]byte, string, error) { - tmpl, err := template.New(name). - Option("missingkey=error"). - Parse(string(raw)) - if err != nil { - return nil, "", fmt.Errorf("parse template: %w", err) - } - var buf bytes.Buffer - if err := tmpl.Execute(&buf, vars); err != nil { - return nil, "", fmt.Errorf("execute template: %w", err) - } - - obj := &unstructured.Unstructured{} - if err := yaml.Unmarshal(buf.Bytes(), &obj.Object); err != nil { - return nil, "", fmt.Errorf("parse rendered manifest: %w", err) - } - if obj.GetKind() != "SeiNodeTask" { - return nil, "", fmt.Errorf("rendered manifest is %s, want SeiNodeTask", obj.GetKind()) - } - - // Discover the spec.kind so the deterministic name carries the - // per-kind prefix. - specKind, _, err := unstructured.NestedString(obj.Object, "spec", "kind") - if err != nil || specKind == "" { - return nil, "", fmt.Errorf("spec.kind missing on rendered manifest") - } - - deterministic := DeterministicName(specKind, vars, raw) - obj.SetName(deterministic) - - // Replace (not merge) ownerReferences so a template that smuggles a - // bogus ref can't leak through. Mirrors provisionsnd.stampMetadata. - if ownerRef != nil { - obj.SetOwnerReferences([]metav1.OwnerReference{*ownerRef}) - } - - out, err := yaml.Marshal(obj.Object) - if err != nil { - return nil, "", fmt.Errorf("re-marshal manifest: %w", err) - } - return out, deterministic, nil -} - -// DeterministicName produces a stable metadata.name from -// (spec.kind, vars, template-content). Format: -// -// [-]-<10-hex> -// -// NODE is included as a human-readable infix when present in vars; the hash -// alone already provides uniqueness (template content + sorted vars), so the -// infix is purely operator ergonomics for `kubectl get snt`. -func DeterministicName(specKind string, vars map[string]string, templateContent []byte) string { - keys := make([]string, 0, len(vars)) - for k := range vars { - keys = append(keys, k) - } - sort.Strings(keys) - - h := sha256.New() - h.Write(templateContent) - for _, k := range keys { - h.Write([]byte{0}) - h.Write([]byte(k)) - h.Write([]byte{'='}) - h.Write([]byte(vars[k])) - } - sum := hex.EncodeToString(h.Sum(nil))[:shortHashLen] - - prefix := kebab(specKind) - parts := []string{prefix} - if node := vars["NODE"]; node != "" { - parts = append(parts, sanitizeForDNS(node)) - } - parts = append(parts, sum) - return strings.Join(parts, "-") -} - -// kebab converts CamelCase to kebab-case (GovSoftwareUpgrade -> -// gov-software-upgrade). Plain ASCII only. -func kebab(s string) string { - var b strings.Builder - for i, r := range s { - switch { - case r >= 'A' && r <= 'Z': - if i > 0 { - b.WriteByte('-') - } - b.WriteRune(r + ('a' - 'A')) - default: - b.WriteRune(r) - } - } - return b.String() -} - -// sanitizeForDNS replaces characters that aren't DNS-1123 label safe with -// '-'. Truncates to 40 chars to keep the final name under K8s' 253-byte -// resource name limit even with a long kind prefix. -func sanitizeForDNS(s string) string { - const maxLen = 40 - b := make([]byte, 0, len(s)) - for i := 0; i < len(s); i++ { - c := s[i] - switch { - case c >= 'a' && c <= 'z', c >= '0' && c <= '9', c == '-': - b = append(b, c) - case c >= 'A' && c <= 'Z': - b = append(b, c+('a'-'A')) - default: - b = append(b, '-') - } - } - if len(b) > maxLen { - b = b[:maxLen] - } - // Trim trailing '-' so the joined name doesn't end with "--". - for len(b) > 0 && b[len(b)-1] == '-' { - b = b[:len(b)-1] - } - return string(b) -} - -// DynamicApplier implements Applier against the K8s dynamic client. -// Server-side apply is used so re-applies are no-ops at the apiserver level. -type DynamicApplier struct { - Client dynamic.Interface -} - -// SeiNodeTaskGVR is the GroupVersionResource for SeiNodeTask. -var SeiNodeTaskGVR = schema.GroupVersionResource{ - Group: "sei.io", - Version: "v1alpha1", - Resource: "seinodetasks", -} - -// Apply performs server-side apply of the rendered manifest. The manifest -// must already carry metadata.name; the namespace is taken from the runner's -// pod namespace. -func (a DynamicApplier) Apply(ctx context.Context, namespace string, manifest []byte) error { - obj := &unstructured.Unstructured{} - if err := yaml.Unmarshal(manifest, &obj.Object); err != nil { - return fmt.Errorf("parse manifest for apply: %w", err) - } - obj.SetNamespace(namespace) - - data, err := obj.MarshalJSON() - if err != nil { - return fmt.Errorf("marshal manifest for apply: %w", err) - } - force := true - _, err = a.Client. - Resource(SeiNodeTaskGVR). - Namespace(namespace). - Patch(ctx, obj.GetName(), apiTypesApplyPatch, data, metav1.PatchOptions{ - FieldManager: FieldOwner, - Force: &force, - }) - if err != nil && !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("apply SeiNodeTask %q: %w", obj.GetName(), err) - } - return nil -} - -// apiTypesApplyPatch is a local alias so we don't pull k8s.io/apimachinery/pkg/types -// in the type signature of Apply (kept on a separate line for grep-ability). -var apiTypesApplyPatch = applyPatchTypeMarker() diff --git a/internal/runner/fanout.go b/internal/runner/fanout.go deleted file mode 100644 index 22aa8ecd..00000000 --- a/internal/runner/fanout.go +++ /dev/null @@ -1,73 +0,0 @@ -package runner - -import ( - "context" - "fmt" - "maps" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/dynamic" -) - -// SeiNodeGVR is the GroupVersionResource for SeiNode. -var SeiNodeGVR = schema.GroupVersionResource{ - Group: "sei.io", - Version: "v1alpha1", - Resource: "seinodes", -} - -// DynamicNodeLister lists SeiNodes by label selector. -type DynamicNodeLister struct { - Client dynamic.Interface -} - -// List returns the names of all SeiNodes in namespace matching selector. -// Empty selector means list-all. Names are returned in apiserver order -// (typically by creation timestamp); the runner does not stably sort because -// the fanout policies are insensitive to ordering. -func (l DynamicNodeLister) List(ctx context.Context, namespace, selector string) ([]string, error) { - opts := metav1.ListOptions{} - if selector != "" { - opts.LabelSelector = selector - } - list, err := l.Client.Resource(SeiNodeGVR).Namespace(namespace).List(ctx, opts) - if err != nil { - return nil, fmt.Errorf("list SeiNodes (selector=%q): %w", selector, err) - } - names := make([]string, 0, len(list.Items)) - for _, item := range list.Items { - names = append(names, item.GetName()) - } - return names, nil -} - -// FanoutTarget is one rendered SeiNodeTask in a fan-out batch. -type FanoutTarget struct { - // Node is the SeiNode name that .NODE was set to during render. - Node string - // Name is the deterministic metadata.name the runner applied. - Name string - // Manifest is the rendered SeiNodeTask manifest. - Manifest []byte -} - -// RenderFanout produces one FanoutTarget per node in nodes, varying only -// the .NODE template variable. baseVars is shared across all renders. -func RenderFanout(r Renderer, templatePath string, baseVars map[string]string, nodes []string) ([]FanoutTarget, error) { - if len(nodes) == 0 { - return nil, fmt.Errorf("fanout: selector matched zero SeiNodes") - } - out := make([]FanoutTarget, 0, len(nodes)) - for _, n := range nodes { - vars := make(map[string]string, len(baseVars)+1) - maps.Copy(vars, baseVars) - vars["NODE"] = n - manifest, name, err := r.Render(templatePath, vars) - if err != nil { - return nil, fmt.Errorf("render for node %q: %w", n, err) - } - out = append(out, FanoutTarget{Node: n, Name: name, Manifest: manifest}) - } - return out, nil -} diff --git a/internal/runner/orchestrate.go b/internal/runner/orchestrate.go deleted file mode 100644 index a02ab0d6..00000000 --- a/internal/runner/orchestrate.go +++ /dev/null @@ -1,256 +0,0 @@ -package runner - -import ( - "context" - "errors" - "fmt" - "sync" - "time" -) - -// Execute runs the full single-or-fanout orchestration for r and returns -// nil on success or an error suitable for printing to stderr + exit 1. -// -//nolint:gocyclo // single linear orchestration; splitting hurts readability -func (r *Run) Execute(ctx context.Context) error { - if r.Now == nil { - r.Now = time.Now - } - - if err := r.sourceEnv(); err != nil { - return fmt.Errorf("source env file: %w", err) - } - - policy, err := ParseFanoutMode(r.Opts.FanoutMode) - if err != nil { - return err - } - - targets, err := r.buildTargets(ctx) - if err != nil { - return err - } - - // Apply all targets up-front so they reconcile in parallel. - for _, t := range targets { - if err := r.Applier.Apply(ctx, r.Opts.Namespace, t.Manifest); err != nil { - return fmt.Errorf("apply %s (node=%s): %w", t.Name, t.Node, err) - } - _, _ = fmt.Fprintf(r.Stdout, "applied SeiNodeTask %s (node=%s)\n", t.Name, t.Node) - } - - // Single-node short-circuit avoids the goroutine-per-target overhead and - // preserves the natural exit-code semantics (failureReason on the single - // task is the runner's exit error). - if len(targets) == 1 { - return r.pollSingle(ctx, targets[0]) - } - return r.pollFanout(ctx, targets, policy) -} - -func (r *Run) sourceEnv() error { - path := r.Opts.EnvFile - if path == "" { - path = "/workflow/vars/env.sh" - } - pairs, err := r.Sourcer.Source(path) - if err != nil { - return err - } - // Sourced values are merged into Vars only if not already set on the CLI; - // CLI --var takes precedence so an explicit override at the Workflow step - // level wins over a stale env-file value. - if r.Opts.Vars == nil { - r.Opts.Vars = map[string]string{} - } - for k, v := range pairs { - if _, set := r.Opts.Vars[k]; !set { - r.Opts.Vars[k] = v - } - } - return nil -} - -func (r *Run) buildTargets(ctx context.Context) ([]FanoutTarget, error) { - // Single-node mode: render once with whatever NODE the operator passed via --var. - if r.Opts.PerNodeSelector == "" { - manifest, name, err := r.Renderer.Render(r.Opts.TemplatePath, r.Opts.Vars) - if err != nil { - return nil, fmt.Errorf("render: %w", err) - } - node := r.Opts.Vars["NODE"] - return []FanoutTarget{{Node: node, Name: name, Manifest: manifest}}, nil - } - // Fan-out: discover SeiNodes by selector, render one CR per match. - nodes, err := r.Lister.List(ctx, r.Opts.Namespace, r.Opts.PerNodeSelector) - if err != nil { - return nil, err - } - return RenderFanout(r.Renderer, r.Opts.TemplatePath, r.Opts.Vars, nodes) -} - -func (r *Run) pollSingle(ctx context.Context, t FanoutTarget) error { - pollCtx, cancel := context.WithTimeout(ctx, r.Opts.Timeout) - defer cancel() - phase, obj, reason, err := r.Poller.Poll(pollCtx, r.Opts.Namespace, t.Name, r.Opts.PollInterval) - if err != nil { - return err - } - if phase == PhaseFailed { - return fmt.Errorf("SeiNodeTask %s failed: %s", t.Name, reason) - } - return r.writeOutputs(obj) -} - -func (r *Run) pollFanout(ctx context.Context, targets []FanoutTarget, policy FanoutPolicy) error { - pollCtx, cancel := context.WithTimeout(ctx, r.Opts.Timeout) - defer cancel() - - type result struct { - idx int - outcome Outcome - obj map[string]any - reason string - err error - } - - // Goroutine-per-target: each loops on Poll until its task is terminal or - // the shared deadline cancels. We aggregate decisions as results arrive. - resultCh := make(chan result, len(targets)) - var wg sync.WaitGroup - for i, t := range targets { - wg.Add(1) - go func(idx int, target FanoutTarget) { - defer wg.Done() - phase, obj, reason, err := r.Poller.Poll(pollCtx, r.Opts.Namespace, target.Name, r.Opts.PollInterval) - out := OutcomeUnknown - switch { - case err != nil: - // Treat poll error (incl. deadline) as failed outcome with the underlying error message. - out = OutcomeFailed - if reason == "" { - reason = err.Error() - } - case phase == PhaseComplete: - out = OutcomeComplete - case phase == PhaseFailed: - out = OutcomeFailed - } - resultCh <- result{idx: idx, outcome: out, obj: obj, reason: reason, err: err} - }(i, t) - } - go func() { wg.Wait(); close(resultCh) }() - - outcomes := make([]Outcome, 0, len(targets)) - objects := make([]map[string]any, len(targets)) - var firstFailure string - - for res := range resultCh { - outcomes = append(outcomes, res.outcome) - if res.obj != nil { - objects[res.idx] = res.obj - } - if res.outcome == OutcomeFailed && firstFailure == "" { - firstFailure = fmt.Sprintf("%s: %s", targets[res.idx].Name, res.reason) - } - _, _ = fmt.Fprintf(r.Stdout, "fanout %s (node=%s): %s\n", targets[res.idx].Name, targets[res.idx].Node, outcomeLabel(res.outcome)) - - done, ok := policy.Decide(outcomes, len(targets)) - if done { - // Early-exit: cancel in-flight pollers; they'll surface as failed/unknown - // and be discarded (we already have a verdict). - cancel() - //nolint:revive // intentionally drain the channel to let the WaitGroup complete cleanly - for range resultCh { - } - if !ok { - if firstFailure == "" { - firstFailure = "fanout policy not satisfied" - } - return errors.New(firstFailure) - } - return r.writeFanoutOutputs(objects) - } - } - - // Channel drained without an early verdict (all outcomes in but Decide - // kept saying "not done"). That is a logic bug — Decide must terminate - // once total entries are seen. - done, ok := policy.Decide(outcomes, len(targets)) - if !done { - return fmt.Errorf("fanout policy %q failed to terminate with %d/%d outcomes (internal bug)", r.Opts.FanoutMode, len(outcomes), len(targets)) - } - if !ok { - if firstFailure == "" { - firstFailure = "fanout policy not satisfied" - } - return errors.New(firstFailure) - } - return r.writeFanoutOutputs(objects) -} - -func outcomeLabel(o Outcome) string { - switch o { - case OutcomeComplete: - return "Complete" - case OutcomeFailed: - return "Failed" - default: - return "Pending" - } -} - -func (r *Run) writeOutputs(obj map[string]any) error { - if obj == nil || len(r.Opts.OutputJSONPaths) == 0 { - return nil - } - kvs, err := ExtractOutputs(r.Opts.OutputJSONPaths, obj) - if err != nil { - return fmt.Errorf("extract outputs: %w", err) - } - if r.Opts.OutputEnvFile == "" { - // Nowhere to write — emit to stdout as a fallback so the runner is - // still useful when invoked standalone. - for _, kv := range kvs { - _, _ = fmt.Fprintf(r.Stdout, "%s=%s\n", kv.Key, kv.Value) - } - return nil - } - return r.Writer.Append(r.Opts.OutputEnvFile, kvs) -} - -// writeFanoutOutputs writes outputs from the *first* Complete object only. -// Fan-out jsonpath extraction across N nodes is undefined in the LLD — the -// canonical use case (UpdateNodeImage fanout) yields the same appliedImage -// per node, so first-Complete is a reasonable convention. -// -// If operators need per-node aggregation later, the right primitive is a -// separate --output-jsonpath-fanout flag with an explicit aggregation policy -// (join, json-array, max). Punted until a scenario demands it. -func (r *Run) writeFanoutOutputs(objects []map[string]any) error { - for _, obj := range objects { - if obj == nil { - continue - } - if nestedString(obj, "status", "phase") == PhaseComplete { - return r.writeOutputs(obj) - } - } - return nil -} - -// nestedString is a small map walker that returns the string value at the -// given key path, or "" if any intermediate node is missing or non-string. -// Kept inline so this file doesn't pull apimachinery just for one helper. -func nestedString(obj map[string]any, fields ...string) string { - var cur any = obj - for _, f := range fields { - m, ok := cur.(map[string]any) - if !ok { - return "" - } - cur = m[f] - } - s, _ := cur.(string) - return s -} diff --git a/internal/runner/output.go b/internal/runner/output.go deleted file mode 100644 index a1bdf448..00000000 --- a/internal/runner/output.go +++ /dev/null @@ -1,133 +0,0 @@ -package runner - -import ( - "bufio" - "bytes" - "fmt" - "os" - "strings" - - "k8s.io/client-go/util/jsonpath" -) - -// ExtractOutputs evaluates each "=" spec against obj and -// returns the env-var assignments. JSONPath syntax is the standard -// k8s.io/client-go/util/jsonpath (the same used by `kubectl get -o jsonpath`). -// -// Missing fields are not errors — the resulting KV is omitted. This is load- -// bearing for sidecar-backed kinds whose status.outputs. is empty in MVP -// (see LLD: "No structured outputs from sidecar-backed kinds"); a runner step -// that targets such an output should not fail just because the field is -// missing — the Workflow author can decide whether downstream steps need it. -func ExtractOutputs(specs []string, obj map[string]any) ([]KV, error) { - if len(specs) == 0 { - return nil, nil - } - out := make([]KV, 0, len(specs)) - for _, spec := range specs { - pathRaw, envVarRaw, ok := strings.Cut(spec, "=") - if !ok { - return nil, fmt.Errorf("output-jsonpath %q missing '=ENV_VAR' suffix", spec) - } - path := strings.TrimSpace(pathRaw) - envVar := strings.TrimSpace(envVarRaw) - if path == "" || envVar == "" { - return nil, fmt.Errorf("output-jsonpath %q has empty path or env var", spec) - } - val, ok, err := evalJSONPath(path, obj) - if err != nil { - return nil, fmt.Errorf("evaluate %q: %w", path, err) - } - if !ok { - continue - } - out = append(out, KV{Key: envVar, Value: val}) - } - return out, nil -} - -func evalJSONPath(path string, obj map[string]any) (string, bool, error) { - jp := jsonpath.New("output").AllowMissingKeys(true) - // kubectl's syntax accepts the leading dot (".status.phase"); the jsonpath - // lib wants "{.status.phase}". Normalize. - expr := path - if !strings.HasPrefix(expr, "{") { - expr = "{" + expr + "}" - } - if err := jp.Parse(expr); err != nil { - return "", false, err - } - var buf bytes.Buffer - if err := jp.Execute(&buf, obj); err != nil { - return "", false, err - } - s := strings.TrimSpace(buf.String()) - if s == "" { - return "", false, nil - } - return s, true, nil -} - -// FileEnvSourcer loads KEY=VALUE pairs from a shell-style env file. Lines -// starting with '#' and blank lines are skipped. Quoted values are not -// unquoted — the runner's writes are unquoted by construction, so any quoting -// the operator authored is preserved verbatim. -type FileEnvSourcer struct{} - -// Source reads KEY=VALUE pairs from path. Returns (nil, nil) when path -// doesn't exist — missing env files are not an error (first Workflow step -// has nothing to source). -func (FileEnvSourcer) Source(path string) (map[string]string, error) { - f, err := os.Open(path) //nolint:gosec // path is operator-controlled CLI arg - if err != nil { - if os.IsNotExist(err) { - return nil, nil - } - return nil, err - } - defer func() { _ = f.Close() }() - - out := map[string]string{} - scanner := bufio.NewScanner(f) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if line == "" || strings.HasPrefix(line, "#") { - continue - } - // Strip a leading "export " so files written by other tools are accepted. - line = strings.TrimPrefix(line, "export ") - idx := strings.IndexByte(line, '=') - if idx <= 0 { - continue - } - out[line[:idx]] = line[idx+1:] - } - return out, scanner.Err() -} - -// FileEnvWriter appends KEY=value lines to an env file. Parent directories -// must already exist (Chaos Mesh Workflow mounts the emptyDir for us). -type FileEnvWriter struct{} - -// Append appends KEY=value lines to path. The file is opened in append mode -// and flushed on close. Values are not quoted; values containing whitespace -// or shell metacharacters survive the runner's `source` because the runner -// reads with a line parser, not a shell. Operators wiring to a real -// `source` should keep values free of shell metacharacters — by convention, -// SeiNodeTask outputs (txHash, height, image) are. -func (FileEnvWriter) Append(path string, kv []KV) error { - if len(kv) == 0 { - return nil - } - f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) //nolint:gosec // path is operator-controlled CLI arg - if err != nil { - return err - } - defer func() { _ = f.Close() }() - for _, p := range kv { - if _, err := fmt.Fprintf(f, "%s=%s\n", p.Key, p.Value); err != nil { - return err - } - } - return nil -} diff --git a/internal/runner/patchtype.go b/internal/runner/patchtype.go deleted file mode 100644 index 464c87d9..00000000 --- a/internal/runner/patchtype.go +++ /dev/null @@ -1,7 +0,0 @@ -package runner - -import "k8s.io/apimachinery/pkg/types" - -// applyPatchTypeMarker returns the patch type used for server-side apply. -// Split out so apply.go's imports stay tight. -func applyPatchTypeMarker() types.PatchType { return types.ApplyPatchType } diff --git a/internal/runner/poll.go b/internal/runner/poll.go deleted file mode 100644 index 04ea0413..00000000 --- a/internal/runner/poll.go +++ /dev/null @@ -1,56 +0,0 @@ -package runner - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/client-go/dynamic" -) - -// DynamicPoller polls SeiNodeTask.status.phase via the dynamic client. -type DynamicPoller struct { - Client dynamic.Interface -} - -// Poll re-reads the SeiNodeTask until phase is Complete or Failed, or the -// context is cancelled. The returned obj is the most recent observation, -// suitable for jsonpath extraction. failureReason is populated when phase=Failed -// (from .status.task.err) so callers can surface it on exit-1. -func (p DynamicPoller) Poll(ctx context.Context, namespace, name string, interval time.Duration) (string, map[string]any, string, error) { - ticker := time.NewTicker(interval) - defer ticker.Stop() - // Read once before sleeping so a fast-completing task isn't blocked on the - // first interval. - for { - obj, err := p.Client.Resource(SeiNodeTaskGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - if !apierrors.IsNotFound(err) { - return "", nil, "", fmt.Errorf("get SeiNodeTask %s/%s: %w", namespace, name, err) - } - // Not-found is expected briefly after apply on slow caches; keep polling. - } else { - phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") - switch phase { - case PhaseComplete, PhaseFailed: - reason := "" - if phase == PhaseFailed { - reason, _, _ = unstructured.NestedString(obj.Object, "status", "task", "err") - if reason == "" { - reason = "task reached Failed phase (no error message)" - } - } - return phase, obj.Object, reason, nil - } - } - - select { - case <-ctx.Done(): - return "", nil, "", fmt.Errorf("timeout waiting for %s/%s to reach terminal phase: %w", namespace, name, ctx.Err()) - case <-ticker.C: - } - } -} diff --git a/internal/runner/runner.go b/internal/runner/runner.go deleted file mode 100644 index 99327b42..00000000 --- a/internal/runner/runner.go +++ /dev/null @@ -1,257 +0,0 @@ -// Package runner implements the seitask-runner orchestration container that -// Chaos Mesh Workflow Task steps use to apply SeiNodeTask CRs and wait for -// completion. The runner is intentionally generic — the per-kind shape comes -// from text/template files mounted at /templates, not from CLI subcommands. -// -// Behavior contract (LLD: https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/seinode-task/seinode-task-lld.md "Runner container"): -// -// 1. Source /workflow/vars/env.sh if present (env-file bridge between Workflow -// steps). -// 2. Render the template with --var KEY=VALUE substitutions. Single-node -// mode renders once with .NODE set from --var NODE=; fan-out mode -// lists SeiNodes by selector and renders once per match with .NODE set -// to each SeiNode's name. -// 3. Server-side apply each rendered SeiNodeTask (fieldOwner=seitask-runner). -// 4. Poll .status.phase until terminal (Complete | Failed) or --timeout. -// 5. On Complete, run --output-jsonpath extractions and append KEY=value -// lines to --output-env-file. Exit 0. -// 6. On Failed or timeout, exit 1. -// -// The runner talks to the K8s API only (no pods/exec, no kubectl binary). -package runner - -import ( - "context" - "fmt" - "io" - "strings" - "time" -) - -// Options is the parsed CLI invocation. -type Options struct { - // TemplatePath is the path to the Go text/template file producing a - // SeiNodeTask manifest. - TemplatePath string - - // Vars are KEY=VALUE substitutions exposed to the template as - // .KEY accessors. - Vars map[string]string - - // OutputJSONPaths are extraction expressions of the form - // '.status.outputs.govVote.txHash=TX_HASH'. The left side is a JSONPath - // against the SeiNodeTask object; the right side is the env var name - // written to OutputEnvFile. - OutputJSONPaths []string - - // OutputEnvFile is the path the runner appends KEY=value lines to on - // Complete. Conventionally /workflow/vars/env.sh. - OutputEnvFile string - - // Timeout bounds the total poll duration per apply. - Timeout time.Duration - - // PollInterval is the cadence the runner re-reads .status.phase. - PollInterval time.Duration - - // Namespace overrides the in-cluster namespace (defaults to the SA - // namespace mounted at /var/run/secrets/kubernetes.io/serviceaccount/namespace). - Namespace string - - // PerNodeSelector enables fan-out mode. Empty means single-node. - // Value is a Kubernetes label selector (e.g. "role=validator"). - PerNodeSelector string - - // FanoutMode selects the success policy: all-must-succeed (default), - // best-effort, or quorum:N. - FanoutMode string - - // EnvFile is the env-file the runner sources at startup. When empty, - // /workflow/vars/env.sh is used if it exists. - EnvFile string -} - -// FanoutPolicy modes. -const ( - fanoutModeAll = "all" - fanoutModeBestEffort = "best-effort" - fanoutModeQuorum = "quorum" - - // PhaseComplete and PhaseFailed mirror the SeiNodeTask CRD phase values. - // Centralized here so the runner doesn't import the api types package - // (keeps the runner build closure small). - PhaseComplete = "Complete" - PhaseFailed = "Failed" -) - -// FanoutPolicy is the parsed --fanout-mode value. -type FanoutPolicy struct { - // Mode is one of "all", "best-effort", "quorum". - Mode string - // Quorum is the N from "quorum:N"; zero otherwise. - Quorum int -} - -// ParseFanoutMode parses a --fanout-mode value into a policy. The empty -// string maps to "all-must-succeed" (the default). -// -// Fail-fast semantics: under "all-must-succeed", the runner exits non-zero -// as soon as one target fails — remaining targets are left in-flight, and -// the SeiNodeTaskReconciler keeps reconciling them until they reach a -// terminal phase or are garbage-collected. The runner does NOT delete -// CRs on exit (post-mortem `kubectl describe` would lose context). For -// tx-emitting kinds the broadcast has already happened by the time the -// runner polls, so early-exit changes runner wall-clock but not chain state. -func ParseFanoutMode(s string) (FanoutPolicy, error) { - switch s { - case "", "all-must-succeed": - return FanoutPolicy{Mode: fanoutModeAll}, nil - case fanoutModeBestEffort: - return FanoutPolicy{Mode: fanoutModeBestEffort}, nil - } - if rest, ok := strings.CutPrefix(s, "quorum:"); ok { - n, err := parsePositiveInt(rest) - if err != nil { - return FanoutPolicy{}, fmt.Errorf("invalid quorum value: %w", err) - } - return FanoutPolicy{Mode: fanoutModeQuorum, Quorum: n}, nil - } - return FanoutPolicy{}, fmt.Errorf("unknown fanout-mode %q (want all-must-succeed | best-effort | quorum:N)", s) -} - -// Decide evaluates a set of per-target outcomes against the policy. -// -// - all: every outcome must be true; any false fails. Returns ok=true only -// once every entry is true; ok=false the moment any entry is false. -// - best-effort: ok=true if at least one outcome is true; ok=false only -// when every entry has terminated and zero were true. -// - quorum:N: ok=true once Quorum entries are true; ok=false when so many -// have failed that Quorum can no longer be reached. -// -// done indicates whether the policy can conclude given the outcomes so far. -// When done=false, the caller should keep polling. When done=true, ok is the -// final verdict. -func (p FanoutPolicy) Decide(outcomes []Outcome, total int) (done, ok bool) { - var completed, failed, pending int - for _, o := range outcomes { - switch o { - case OutcomeComplete: - completed++ - case OutcomeFailed: - failed++ - default: - pending++ - } - } - pending += total - len(outcomes) - - switch p.Mode { - case fanoutModeAll: - if failed > 0 { - return true, false - } - if completed == total { - return true, true - } - return false, false - case fanoutModeBestEffort: - if completed > 0 && pending == 0 { - return true, true - } - if pending == 0 { - return true, completed > 0 - } - return false, false - case fanoutModeQuorum: - if completed >= p.Quorum { - return true, true - } - // If remaining successes can no longer reach quorum, fail fast. - if completed+pending < p.Quorum { - return true, false - } - return false, false - } - return true, false -} - -// Outcome is the terminal status of one applied SeiNodeTask. -type Outcome int - -const ( - // OutcomeUnknown means the task has not reached a terminal phase. - OutcomeUnknown Outcome = iota - // OutcomeComplete means .status.phase=Complete. - OutcomeComplete - // OutcomeFailed means .status.phase=Failed. - OutcomeFailed -) - -// Run is the top-level entrypoint. It is split out of main so it can be -// unit tested with stubbed K8s and filesystem dependencies. -type Run struct { - Opts Options - Stdout io.Writer - Stderr io.Writer - Now func() time.Time - Renderer Renderer - Applier Applier - Poller Poller - Lister NodeLister - Sourcer EnvSourcer - Writer EnvWriter -} - -// Renderer renders a template file with vars to a SeiNodeTask manifest. -type Renderer interface { - Render(templatePath string, vars map[string]string) (rendered []byte, name string, err error) -} - -// Applier applies a rendered SeiNodeTask manifest to the cluster. -type Applier interface { - Apply(ctx context.Context, namespace string, manifest []byte) error -} - -// Poller polls a SeiNodeTask's status until terminal or context is done. -// Returns the final phase plus the raw object (for jsonpath extraction). -type Poller interface { - Poll(ctx context.Context, namespace, name string, interval time.Duration) (phase string, obj map[string]any, failureReason string, err error) -} - -// NodeLister lists SeiNode names in a namespace matching a label selector. -type NodeLister interface { - List(ctx context.Context, namespace, selector string) (names []string, err error) -} - -// EnvSourcer reads a shell-style env file into a map. Lines that aren't -// KEY=VALUE are skipped silently (commenting/empty/etc). -type EnvSourcer interface { - Source(path string) (map[string]string, error) -} - -// EnvWriter appends KEY=value lines to an env file. -type EnvWriter interface { - Append(path string, kv []KV) error -} - -// KV is a single env-file pair. -type KV struct { - Key, Value string -} - -func parsePositiveInt(s string) (int, error) { - if s == "" { - return 0, fmt.Errorf("empty value") - } - n := 0 - for _, c := range s { - if c < '0' || c > '9' { - return 0, fmt.Errorf("not an integer: %q", s) - } - n = n*10 + int(c-'0') - } - if n == 0 { - return 0, fmt.Errorf("must be > 0") - } - return n, nil -} diff --git a/internal/runner/runner_test.go b/internal/runner/runner_test.go deleted file mode 100644 index 96b023d3..00000000 --- a/internal/runner/runner_test.go +++ /dev/null @@ -1,564 +0,0 @@ -package runner_test - -import ( - "context" - "errors" - "fmt" - "maps" - "os" - "path/filepath" - "strings" - "sync" - "testing" - "time" - - . "github.com/onsi/gomega" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/yaml" - - "github.com/sei-protocol/sei-k8s-controller/internal/runner" -) - -// Test fixtures kept as constants to satisfy goconst across the file. These -// are pure test inputs — refactoring callers to use them improves nothing -// semantically; the constants exist solely to keep the linter quiet on -// strings that recur >=3 times. -const ( - tChainID = "sei-localnet" - tChainIDKey = "CHAIN_ID" - tStatusKey = "status" - tPhaseKey = "phase" - tNodeKey = "NODE" - tComplete = "Complete" - tTxHashEnv = "TX_HASH" - tTxHashVal = "ABCD" - tStubName = "stub-name" - tImageV2 = "seid:v2" - tIgnored = "ignored" - tPropIDKey = "PROPOSAL_ID" - tValidator0 = "validator-0" -) - -// --------------------------------------------------------------------------- -// Apply / render -// --------------------------------------------------------------------------- - -func TestRenderBytes_DeterministicName(t *testing.T) { - g := NewWithT(t) - tmpl := []byte(`apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: GovVote - target: - nodeRef: - name: {{ .NODE }} - govVote: - chainId: {{ .CHAIN_ID }} - keyName: k - proposalId: {{ .PROPOSAL_ID }} - option: yes - fees: 2000usei - gas: 200000 -`) - vars := map[string]string{tNodeKey: tValidator0, tChainIDKey: tChainID, tPropIDKey: "47"} - - manifest1, name1, err := runner.RenderBytes("t.tmpl", tmpl, vars, nil) - g.Expect(err).NotTo(HaveOccurred()) - manifest2, name2, err := runner.RenderBytes("t.tmpl", tmpl, vars, nil) - g.Expect(err).NotTo(HaveOccurred()) - - g.Expect(name1).To(Equal(name2), "name must be deterministic for identical inputs") - g.Expect(string(manifest1)).To(Equal(string(manifest2))) - g.Expect(name1).To(HavePrefix("gov-vote-validator-0-"), "name should embed kind + NODE for operator ergonomics") - g.Expect(name1).To(MatchRegexp(`^gov-vote-validator-0-[0-9a-f]{10}$`)) - - // Re-render with a different var should change the hash. - vars[tPropIDKey] = "48" - _, name3, err := runner.RenderBytes("t.tmpl", tmpl, vars, nil) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(name3).NotTo(Equal(name1)) -} - -func TestRenderBytes_RejectsNonSeiNodeTask(t *testing.T) { - g := NewWithT(t) - tmpl := []byte("apiVersion: v1\nkind: ConfigMap\nmetadata: {name: PLACEHOLDER}\n") - _, _, err := runner.RenderBytes("t.tmpl", tmpl, nil, nil) - g.Expect(err).To(MatchError(ContainSubstring("rendered manifest is ConfigMap"))) -} - -func TestRenderBytes_StampsOwnerRef(t *testing.T) { - g := NewWithT(t) - tmpl := []byte(`apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER - ownerReferences: - - apiVersion: rogue.example.com/v1 - kind: Impostor - name: smuggled - uid: 00000000-0000-0000-0000-000000000000 -spec: - kind: GovVote - target: - nodeRef: - name: {{ .NODE }} - govVote: - chainId: c - keyName: k - proposalId: "1" - option: yes - fees: 0usei - gas: 0 -`) - ctrlF := false - blockF := false - ownerRef := &metav1.OwnerReference{ - APIVersion: "chaos-mesh.org/v1alpha1", - Kind: "Workflow", - Name: "release-test-20260521", - UID: types.UID("abcd-uid"), - Controller: &ctrlF, - BlockOwnerDeletion: &blockF, - } - - manifest, _, err := runner.RenderBytes("t.tmpl", tmpl, map[string]string{tNodeKey: tValidator0}, ownerRef) - g.Expect(err).NotTo(HaveOccurred()) - - obj := &unstructured.Unstructured{} - g.Expect(yaml.Unmarshal(manifest, &obj.Object)).To(Succeed()) - refs := obj.GetOwnerReferences() - g.Expect(refs).To(HaveLen(1), "render must REPLACE ownerReferences so a template-smuggled ref can't leak through") - g.Expect(refs[0].Kind).To(Equal("Workflow")) - g.Expect(refs[0].Name).To(Equal("release-test-20260521")) - g.Expect(refs[0].UID).To(Equal(types.UID("abcd-uid"))) - - // Nil ownerRef leaves template-declared refs alone (no-stamp path). - manifestNil, _, err := runner.RenderBytes("t.tmpl", tmpl, map[string]string{tNodeKey: tValidator0}, nil) - g.Expect(err).NotTo(HaveOccurred()) - objNil := &unstructured.Unstructured{} - g.Expect(yaml.Unmarshal(manifestNil, &objNil.Object)).To(Succeed()) - g.Expect(objNil.GetOwnerReferences()).To(HaveLen(1)) - g.Expect(objNil.GetOwnerReferences()[0].Kind).To(Equal("Impostor")) -} - -func TestRenderBytes_MissingKeyIsError(t *testing.T) { - g := NewWithT(t) - tmpl := []byte("apiVersion: sei.io/v1alpha1\nkind: SeiNodeTask\nmetadata: {name: PLACEHOLDER}\nspec:\n kind: GovVote\n target: {nodeRef: {name: {{ .NODE }}}}\n") - _, _, err := runner.RenderBytes("t.tmpl", tmpl, map[string]string{}, nil) - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("execute template")) -} - -// --------------------------------------------------------------------------- -// Fanout policy -// --------------------------------------------------------------------------- - -func TestParseFanoutMode(t *testing.T) { - g := NewWithT(t) - - p, err := runner.ParseFanoutMode("") - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(p.Mode).To(Equal("all")) - - p, err = runner.ParseFanoutMode("best-effort") - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(p.Mode).To(Equal("best-effort")) - - p, err = runner.ParseFanoutMode("quorum:3") - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(p.Mode).To(Equal("quorum")) - g.Expect(p.Quorum).To(Equal(3)) - - _, err = runner.ParseFanoutMode("quorum:0") - g.Expect(err).To(HaveOccurred()) - - _, err = runner.ParseFanoutMode("nope") - g.Expect(err).To(HaveOccurred()) -} - -func TestFanoutPolicy_Decide(t *testing.T) { - g := NewWithT(t) - - all := runner.FanoutPolicy{Mode: "all"} - // All complete -> ok. - done, ok := all.Decide([]runner.Outcome{runner.OutcomeComplete, runner.OutcomeComplete}, 2) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeTrue()) - // Any fail -> fail fast. - done, ok = all.Decide([]runner.Outcome{runner.OutcomeFailed}, 3) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeFalse()) - // Partial complete, none failed -> keep going. - done, _ = all.Decide([]runner.Outcome{runner.OutcomeComplete}, 3) - g.Expect(done).To(BeFalse()) - - best := runner.FanoutPolicy{Mode: "best-effort"} - // One complete, others still pending -> wait. - done, _ = best.Decide([]runner.Outcome{runner.OutcomeComplete}, 3) - g.Expect(done).To(BeFalse()) - // All terminated, >=1 complete -> ok. - done, ok = best.Decide([]runner.Outcome{runner.OutcomeComplete, runner.OutcomeFailed, runner.OutcomeFailed}, 3) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeTrue()) - // All failed -> not ok. - done, ok = best.Decide([]runner.Outcome{runner.OutcomeFailed, runner.OutcomeFailed}, 2) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeFalse()) - - quorum := runner.FanoutPolicy{Mode: "quorum", Quorum: 2} - // Hit quorum -> ok early. - done, ok = quorum.Decide([]runner.Outcome{runner.OutcomeComplete, runner.OutcomeComplete}, 4) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeTrue()) - // Too many failed to reach quorum -> fail early. - done, ok = quorum.Decide([]runner.Outcome{runner.OutcomeFailed, runner.OutcomeFailed, runner.OutcomeFailed}, 4) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeFalse()) -} - -// --------------------------------------------------------------------------- -// Output extraction -// --------------------------------------------------------------------------- - -func TestExtractOutputs(t *testing.T) { - g := NewWithT(t) - obj := map[string]any{ - tStatusKey: map[string]any{ - tPhaseKey: tComplete, - "outputs": map[string]any{ - "govVote": map[string]any{ - "txHash": tTxHashVal, - "height": int64(1234), - }, - }, - }, - } - - kvs, err := runner.ExtractOutputs( - []string{".status.outputs.govVote.txHash=TX_HASH", ".status.outputs.govVote.height=HEIGHT"}, - obj, - ) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(kvs).To(HaveLen(2)) - g.Expect(kvs[0]).To(Equal(runner.KV{Key: tTxHashEnv, Value: tTxHashVal})) - g.Expect(kvs[1].Key).To(Equal("HEIGHT")) - g.Expect(kvs[1].Value).To(Equal("1234")) -} - -func TestExtractOutputs_MissingFieldOmitted(t *testing.T) { - g := NewWithT(t) - // Sidecar-backed kinds (govVote/govSoftwareUpgrade in MVP) have empty - // status.outputs. The extractor must drop missing fields, not error. - obj := map[string]any{tStatusKey: map[string]any{tPhaseKey: tComplete}} - kvs, err := runner.ExtractOutputs([]string{".status.outputs.govVote.txHash=TX_HASH"}, obj) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(kvs).To(BeEmpty()) -} - -func TestExtractOutputs_MalformedSpec(t *testing.T) { - g := NewWithT(t) - _, err := runner.ExtractOutputs([]string{"no-equals-sign"}, map[string]any{}) - g.Expect(err).To(MatchError(ContainSubstring("missing '=ENV_VAR'"))) -} - -// --------------------------------------------------------------------------- -// Env sourcer / writer -// --------------------------------------------------------------------------- - -func TestFileEnvSourcer_RoundTrip(t *testing.T) { - g := NewWithT(t) - dir := t.TempDir() - path := filepath.Join(dir, "env.sh") - - w := runner.FileEnvWriter{} - g.Expect(w.Append(path, []runner.KV{{Key: tTxHashEnv, Value: tTxHashVal}, {Key: tPropIDKey, Value: "47"}})).To(Succeed()) - g.Expect(w.Append(path, []runner.KV{{Key: "HEIGHT", Value: "1234"}})).To(Succeed()) - - got, err := runner.FileEnvSourcer{}.Source(path) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(got).To(Equal(map[string]string{ - tTxHashEnv: tTxHashVal, - tPropIDKey: "47", - "HEIGHT": "1234", - })) -} - -func TestFileEnvSourcer_MissingFileIsNotAnError(t *testing.T) { - g := NewWithT(t) - got, err := runner.FileEnvSourcer{}.Source(filepath.Join(t.TempDir(), "absent.sh")) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(got).To(BeNil()) -} - -// --------------------------------------------------------------------------- -// Stub-driven Execute() integration -// --------------------------------------------------------------------------- - -type stubRenderer struct { - calls []map[string]string - name string -} - -func (s *stubRenderer) Render(_ string, vars map[string]string) ([]byte, string, error) { - clone := make(map[string]string, len(vars)) - maps.Copy(clone, vars) - s.calls = append(s.calls, clone) - manifest := fmt.Appendf(nil, "apiVersion: sei.io/v1alpha1\nkind: SeiNodeTask\nmetadata:\n name: %s\nspec:\n kind: GovVote\n", s.name) - return manifest, s.name, nil -} - -type stubApplier struct { - mu sync.Mutex - applied []string -} - -func (s *stubApplier) Apply(_ context.Context, _ string, manifest []byte) error { - s.mu.Lock() - defer s.mu.Unlock() - obj := map[string]any{} - _ = yaml.Unmarshal(manifest, &obj) - if md, ok := obj["metadata"].(map[string]any); ok { - if name, ok := md["name"].(string); ok { - s.applied = append(s.applied, name) - } - } - return nil -} - -type stubPoller struct { - phase string - obj map[string]any - reason string - err error -} - -func (s stubPoller) Poll(_ context.Context, _, _ string, _ time.Duration) (string, map[string]any, string, error) { - return s.phase, s.obj, s.reason, s.err -} - -type stubLister struct{ names []string } - -func (s stubLister) List(_ context.Context, _, _ string) ([]string, error) { return s.names, nil } - -type stubSourcer struct{ env map[string]string } - -func (s stubSourcer) Source(_ string) (map[string]string, error) { return s.env, nil } - -type stubWriter struct { - mu sync.Mutex - written []runner.KV -} - -func (w *stubWriter) Append(_ string, kv []runner.KV) error { - w.mu.Lock() - defer w.mu.Unlock() - w.written = append(w.written, kv...) - return nil -} - -func newRun(opts runner.Options, poller runner.Poller, applier runner.Applier, lister runner.NodeLister) (*runner.Run, *stubWriter) { - w := &stubWriter{} - return &runner.Run{ - Opts: opts, - Stdout: os.Stderr, - Stderr: os.Stderr, - Renderer: &stubRenderer{name: tStubName}, - Applier: applier, - Poller: poller, - Lister: lister, - Sourcer: stubSourcer{}, - Writer: w, - }, w -} - -func TestExecute_SingleNode_CompleteExtractsOutputs(t *testing.T) { - g := NewWithT(t) - app := &stubApplier{} - poller := stubPoller{ - phase: tComplete, - obj: map[string]any{ - tStatusKey: map[string]any{ - tPhaseKey: tComplete, - "outputs": map[string]any{"updateNodeImage": map[string]any{"appliedImage": tImageV2}}, - }, - }, - } - r, w := newRun(runner.Options{ - TemplatePath: tIgnored, - Vars: map[string]string{tNodeKey: tValidator0}, - OutputJSONPaths: []string{".status.outputs.updateNodeImage.appliedImage=APPLIED_IMAGE"}, - OutputEnvFile: filepath.Join(t.TempDir(), "env.sh"), - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, app, nil) - g.Expect(r.Execute(context.Background())).To(Succeed()) - g.Expect(app.applied).To(Equal([]string{tStubName})) - g.Expect(w.written).To(Equal([]runner.KV{{Key: "APPLIED_IMAGE", Value: tImageV2}})) -} - -func TestExecute_SingleNode_FailedReturnsReason(t *testing.T) { - g := NewWithT(t) - poller := stubPoller{phase: "Failed", reason: "deposit too small"} - r, _ := newRun(runner.Options{ - TemplatePath: tIgnored, - Vars: map[string]string{tNodeKey: tValidator0}, - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, &stubApplier{}, nil) - err := r.Execute(context.Background()) - g.Expect(err).To(MatchError(ContainSubstring("deposit too small"))) -} - -func TestExecute_Fanout_AllMustSucceed(t *testing.T) { - g := NewWithT(t) - app := &stubApplier{} - poller := stubPoller{phase: tComplete, obj: map[string]any{tStatusKey: map[string]any{tPhaseKey: tComplete}}} - r, _ := newRun(runner.Options{ - TemplatePath: tIgnored, - Vars: map[string]string{"IMAGE": tImageV2}, - PerNodeSelector: "role=validator", - FanoutMode: "all-must-succeed", - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, app, stubLister{names: []string{"v0", "v1", "v2"}}) - g.Expect(r.Execute(context.Background())).To(Succeed()) - g.Expect(app.applied).To(HaveLen(3)) -} - -func TestExecute_Fanout_BestEffortAllowsFailures(t *testing.T) { - g := NewWithT(t) - // Per-target poller that returns different verdicts based on the task name - // to simulate a partial-fail fan-out. - poller := poller2{ - results: map[string]stubPoller{ - tStubName: {phase: tComplete, obj: map[string]any{tStatusKey: map[string]any{tPhaseKey: tComplete}}}, - }, - def: stubPoller{phase: "Failed", reason: "boom"}, - } - app := &stubApplier{} - r, _ := newRun(runner.Options{ - TemplatePath: tIgnored, - PerNodeSelector: "role=validator", - FanoutMode: "best-effort", - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, app, stubLister{names: []string{"v0", "v1"}}) - // stubRenderer returns the same name for both renders, so the poller's - // stub-name map applies to both. We just need at least one Complete to succeed. - g.Expect(r.Execute(context.Background())).To(Succeed()) -} - -// poller2 is a per-name dispatcher used by best-effort fanout tests. -type poller2 struct { - results map[string]stubPoller - def stubPoller -} - -func (p poller2) Poll(_ context.Context, _, name string, _ time.Duration) (string, map[string]any, string, error) { - if s, ok := p.results[name]; ok { - return s.phase, s.obj, s.reason, s.err - } - return p.def.phase, p.def.obj, p.def.reason, p.def.err -} - -// --------------------------------------------------------------------------- -// Embedded templates: each renders against representative vars. -// --------------------------------------------------------------------------- - -func TestEmbeddedTemplates_Render(t *testing.T) { - g := NewWithT(t) - repoRoot := findRepoRoot(t) - dir := filepath.Join(repoRoot, "runner", "templates") - - cases := []struct { - file string - vars map[string]string - }{ - { - file: "gov-software-upgrade.yaml.tmpl", - vars: map[string]string{ - tNodeKey: tValidator0, tChainIDKey: tChainID, "KEY_NAME": "admin", - "TITLE": "Upgrade to v2.0.0", "DESCRIPTION": "rollout v2", - "UPGRADE_NAME": "v2.0.0", "UPGRADE_HEIGHT": "1500", - "INITIAL_DEPOSIT": "10000000usei", "FEES": "2000usei", "GAS": "500000", - }, - }, - { - file: "gov-vote.yaml.tmpl", - vars: map[string]string{ - tNodeKey: tValidator0, tChainIDKey: tChainID, "KEY_NAME": "admin", - tPropIDKey: "47", "OPTION": "yes", "FEES": "2000usei", "GAS": "200000", - }, - }, - { - file: "await-condition.yaml.tmpl", - vars: map[string]string{tNodeKey: tValidator0, "TARGET_HEIGHT": "1500"}, - }, - { - file: "update-node-image.yaml.tmpl", - vars: map[string]string{tNodeKey: tValidator0, "IMAGE": "ghcr.io/sei/seid:v2.0.0"}, - }, - { - file: "await-nodes-at-height.yaml.tmpl", - vars: map[string]string{tNodeKey: tValidator0, "TARGET_HEIGHT": "1500"}, - }, - } - - for _, c := range cases { - t.Run(c.file, func(t *testing.T) { - g := NewWithT(t) - raw, err := os.ReadFile(filepath.Join(dir, c.file)) - g.Expect(err).NotTo(HaveOccurred(), "read template") - manifest, name, err := runner.RenderBytes(c.file, raw, c.vars, nil) - g.Expect(err).NotTo(HaveOccurred(), "render template") - g.Expect(name).NotTo(BeEmpty()) - - obj := map[string]any{} - g.Expect(yaml.Unmarshal(manifest, &obj)).To(Succeed()) - g.Expect(obj["apiVersion"]).To(Equal("sei.io/v1alpha1")) - g.Expect(obj["kind"]).To(Equal("SeiNodeTask")) - spec, ok := obj["spec"].(map[string]any) - g.Expect(ok).To(BeTrue()) - g.Expect(spec).To(HaveKey("kind")) - g.Expect(spec).To(HaveKey("target")) - }) - } - _ = g -} - -func findRepoRoot(t *testing.T) string { - t.Helper() - wd, err := os.Getwd() - if err != nil { - t.Fatal(err) - } - for dir := wd; dir != "/" && dir != "."; dir = filepath.Dir(dir) { - if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { - return dir - } - } - t.Fatal("repo root not found") - return "" -} - -// --------------------------------------------------------------------------- -// Sanity -// --------------------------------------------------------------------------- - -func TestNoStrayErrors(t *testing.T) { - // Tripwire so refactors that accidentally remove the deterministic-name - // contract surface as a one-line test failure. - g := NewWithT(t) - name := runner.DeterministicName("GovVote", map[string]string{tNodeKey: "v0", "X": "1"}, []byte("body")) - g.Expect(name).To(HavePrefix("gov-vote-v0-")) - g.Expect(strings.Count(name, "-")).To(BeNumerically(">=", 3)) - g.Expect(errors.New("noop")).To(HaveOccurred()) -} diff --git a/internal/seitask/keygen/keygen.go b/internal/seitask/keygen/keygen.go deleted file mode 100644 index 23ddeb91..00000000 --- a/internal/seitask/keygen/keygen.go +++ /dev/null @@ -1,115 +0,0 @@ -// Package keygen implements `seitask keygen`: derive a Sei account via the -// general internal/keygen primitive, write the mnemonic to a per-run Secret named -// "-", and publish ADMIN_ADDRESS / ADMIN_SECRET_NAME -// to workflow-vars. All created resources carry an ownerRef to the parent -// Workflow CR for cascade GC. The key derivation itself lives in -// internal/keygen (k8s-free, reused by the test harness); this package is the -// seitask-runner's Secret/workflow-vars writer on top of it. -package keygen - -import ( - "context" - "fmt" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - - keyderive "github.com/sei-protocol/sei-k8s-controller/internal/keygen" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const fieldOwner client.FieldOwner = "seitask-keygen" - -// Params carries the typed inputs to Run. -type Params struct { - // KeyName is the logical identity (e.g. "admin"). Secret name is - // "-" to disambiguate concurrent runs. - KeyName string - Workflow taskruntime.WorkflowIdentity -} - -type Result struct { - SecretName string - Address string -} - -// Run generates the keypair, writes the Secret, and stamps workflow-vars. -// Idempotent: re-running on an existing Secret reuses the key. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if p.KeyName == "" { - return Result{}, fmt.Errorf("keygen: empty KeyName") - } - if p.Workflow.Name == "" || p.Workflow.Namespace == "" { - return Result{}, fmt.Errorf("keygen: workflow identity not loaded (downward-API env not projected)") - } - - secretName := p.KeyName + "-" + p.Workflow.Name - - // Check for an existing Secret first — re-running keygen on an already- - // initialized run should be a no-op so manual retries don't rotate the - // key out from under downstream steps. - existing := &corev1.Secret{} - err := c.Get(ctx, client.ObjectKey{Namespace: p.Workflow.Namespace, Name: secretName}, existing) - switch { - case err == nil: - // Re-stamp the workflow-vars CM in case it was cleared, then return. - addr, exists := existing.Data["address"] - if !exists { - return Result{}, taskruntime.Infra(fmt.Errorf("existing Secret %q is missing address data", secretName)) - } - if err := writeWorkflowVars(ctx, c, p.Workflow, string(addr), secretName); err != nil { - return Result{}, err - } - return Result{SecretName: secretName, Address: string(addr)}, nil - case !apierrors.IsNotFound(err): - return Result{}, taskruntime.Infra(fmt.Errorf("reading existing Secret %q: %w", secretName, err)) - } - - id, err := keyderive.Derive() - if err != nil { - return Result{}, taskruntime.Infra(fmt.Errorf("deriving identity: %w", err)) - } - - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: secretName, - Namespace: p.Workflow.Namespace, - OwnerReferences: []metav1.OwnerReference{p.Workflow.OwnerRef()}, - }, - Type: corev1.SecretTypeOpaque, - Data: map[string][]byte{ - keyderive.SecretMnemonicKey: []byte(id.Mnemonic), - // address is duplicated into the Secret so a re-run of keygen - // can reuse the existing identity without re-deriving from the - // mnemonic (the Secret is the source of truth for both). - "address": []byte(id.Address), - }, - } - if err := c.Create(ctx, secret, fieldOwner); err != nil { - // Race: another keygen Pod won. Re-read and fall through to - // idempotent path. - if apierrors.IsAlreadyExists(err) { - return Run(ctx, c, p) - } - return Result{}, taskruntime.Infra(fmt.Errorf("creating Secret %q: %w", secretName, err)) - } - - if err := writeWorkflowVars(ctx, c, p.Workflow, id.Address, secretName); err != nil { - return Result{}, err - } - return Result{SecretName: secretName, Address: id.Address}, nil -} - -func writeWorkflowVars(ctx context.Context, c client.Client, w taskruntime.WorkflowIdentity, address, secretName string) error { - if err := taskruntime.EnsureWorkflowVarsCM(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyRunID: w.Name, - }); err != nil { - return err - } - return taskruntime.SetVars(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyAdminAddress: address, - taskruntime.KeyAdminSecretName: secretName, - }) -} diff --git a/internal/seitask/keygen/keygen_test.go b/internal/seitask/keygen/keygen_test.go deleted file mode 100644 index 4dbe69bc..00000000 --- a/internal/seitask/keygen/keygen_test.go +++ /dev/null @@ -1,171 +0,0 @@ -package keygen - -import ( - "context" - "strings" - "testing" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - keyderive "github.com/sei-protocol/sei-k8s-controller/internal/keygen" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - if err := corev1.AddToScheme(s); err != nil { - t.Fatal(err) - } - return s -} - -const ( - testKeyName = "admin" - testNamespace = "nightly" - testWorkflowName = "wf-test" - testSecretName = "admin-wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" -) - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -// Sanity check: keygen produces a Secret with the right shape + a -// workflow-vars ConfigMap with the right keys + the address is a valid -// "sei1..." bech32. -func TestRun_CreatesSecretAndWorkflowVars(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testWorkflow() - - res, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.SecretName != testSecretName { - t.Fatalf("SecretName: got %q, want admin-wf-test", res.SecretName) - } - if !strings.HasPrefix(res.Address, "sei1") { - t.Fatalf("Address %q does not have sei1 prefix", res.Address) - } - - // Secret must carry the mnemonic + address. - secret := &corev1.Secret{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testSecretName}, secret); err != nil { - t.Fatalf("Get Secret: %v", err) - } - mnemonic, ok := secret.Data[keyderive.SecretMnemonicKey] - if !ok || len(mnemonic) == 0 { - t.Fatalf("mnemonic missing from Secret") - } - // 24 words separated by spaces. - if got := len(strings.Fields(string(mnemonic))); got != 24 { - t.Fatalf("mnemonic word count: got %d, want 24", got) - } - if got := string(secret.Data["address"]); got != res.Address { - t.Fatalf("Secret address %q != Result.Address %q", got, res.Address) - } - - // workflow-vars ConfigMap must carry ADMIN_ADDRESS + ADMIN_SECRET_NAME. - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("Get CM: %v", err) - } - if got := cm.Data[string(taskruntime.KeyAdminAddress)]; got != res.Address { - t.Fatalf("CM ADMIN_ADDRESS = %q, want %q", got, res.Address) - } - if got := cm.Data[string(taskruntime.KeyAdminSecretName)]; got != testSecretName { - t.Fatalf("CM ADMIN_SECRET_NAME = %q, want admin-wf-test", got) - } - if got := cm.Data[string(taskruntime.KeyRunID)]; got != testWorkflowName { - t.Fatalf("CM RUN_ID = %q, want wf-test", got) - } -} - -// Idempotency: re-running keygen with an existing Secret reuses the key -// rather than rotating it. Manual scenario retries shouldn't blow away the -// identity downstream Tasks already consumed. -func TestRun_Idempotent(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testWorkflow() - - first, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}) - if err != nil { - t.Fatalf("first Run: %v", err) - } - - // Drop the workflow-vars CM to simulate it being cleared somewhere. - cm := &corev1.ConfigMap{} - _ = c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm) - if err := c.Delete(context.Background(), cm); err != nil { - t.Fatalf("delete CM: %v", err) - } - - second, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}) - if err != nil { - t.Fatalf("second Run: %v", err) - } - if second.Address != first.Address { - t.Fatalf("identity rotated on second Run: %q -> %q", first.Address, second.Address) - } - // CM should be re-created. - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, &corev1.ConfigMap{}); err != nil { - t.Fatalf("CM not re-created on idempotent run: %v", err) - } -} - -// Reject missing inputs. -func TestRun_RejectsBadInputs(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - for _, tc := range []struct { - name string - p Params - }{ - {"empty key name", Params{KeyName: "", Workflow: testWorkflow()}}, - {"missing workflow name", Params{KeyName: testKeyName, Workflow: taskruntime.WorkflowIdentity{Namespace: "ns"}}}, - {"missing workflow namespace", Params{KeyName: testKeyName, Workflow: taskruntime.WorkflowIdentity{Name: "wf"}}}, - } { - t.Run(tc.name, func(t *testing.T) { - _, err := Run(context.Background(), c, tc.p) - if err == nil { - t.Fatalf("expected error") - } - }) - } -} - -// Secret + ConfigMap carry ownerReferences to the parent Workflow so -// Workflow deletion cascades. Bench-chain scenarios depend on this for -// cleanup (no trap-on-EXIT logic). -func TestRun_StampsOwnerReferences(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testWorkflow() - - if _, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}); err != nil { - t.Fatalf("Run: %v", err) - } - - for _, target := range []client.Object{ - &corev1.Secret{}, - &corev1.ConfigMap{}, - } { - var name string - if _, ok := target.(*corev1.Secret); ok { - name = testSecretName - } else { - name = testWorkflowVarsCM - } - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: name}, target); err != nil { - t.Fatalf("Get %s: %v", name, err) - } - refs := target.GetOwnerReferences() - if len(refs) != 1 || refs[0].Kind != "Workflow" || refs[0].Name != testWorkflowName || string(refs[0].UID) != "uid-test" { - t.Fatalf("%s ownerRefs = %+v", name, refs) - } - } -} diff --git a/internal/seitask/provisionnode/provision.go b/internal/seitask/provisionnode/provision.go deleted file mode 100644 index a7d8cfe1..00000000 --- a/internal/seitask/provisionnode/provision.go +++ /dev/null @@ -1,415 +0,0 @@ -// Package provisionnode implements `seitask provision-node`: fan out N -// standalone SeiNode follower CRs from one Go template, stamp an ownerRef -// to the parent Workflow, Create them, await PhaseRunning, run a two-stage -// per-node readiness probe (Tendermint /status height>0, then EVM -// eth_blockNumber 200), then publish role-scoped endpoints to workflow-vars -// (_EVM_RPC_LIST, _EVM_RPC, _TM_RPC, _REST, CHAIN_ID). -// -// Unlike provision-snd (genesis SeiNetwork, waits Ready, reads the fleet -// aggregate), provision-node provisions followers that join an existing chain. -// It assembles every workflow-vars key from the N per-node .status.endpoint -// scalars because a standalone SeiNode has no fleet ClusterIP to aggregate. -// -// The N CRs are named -0..-(N-1); the controller stamps -// sei.io/node= on each pod, preserving the chaos suite's pod -// selectors. provision-node also stamps sei.io/role=node (always) and -// sei.io/seinetwork= (when --network is set) on each CR's -// metadata.labels — the shared object-label producer contract with -// `seictl node apply`, which the follower-discovery query -// (node list -l sei.io/seinetwork=,sei.io/role=node) matches on. -package provisionnode - -import ( - "bytes" - "context" - "fmt" - "maps" - "net/http" - "os" - "reflect" - "strconv" - "strings" - "text/template" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/yaml" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" - "github.com/sei-protocol/sei-k8s-controller/sdk/sei" -) - -const fieldOwner client.FieldOwner = "seitask-provision-node" - -// Object-label producer contract (§2.2a) — MUST stay byte-identical to -// `seictl node apply`. The keys/values mirror the controller's canonical -// constants (noderesource: sei.io/role / "node"; seinetwork: sei.io/seinetwork), -// which are unexported, so we re-declare them. The contract test pins these -// literals so an accidental edit here fails; the controller side is independently -// pinned by noderesource_test.go. -const ( - labelRole = "sei.io/role" - roleValueNode = "node" - labelSeiNetwork = "sei.io/seinetwork" -) - -// Params carries the typed inputs to Run. -type Params struct { - // Role tags the workflow-vars keys this Task writes (e.g. "rpc"). - // Uppercased to compose RPC_EVM_RPC_LIST etc. Required. - Role string - - // Name is the BASE name; the N followers are -0..-(N-1). - // Defaults to "-" (or "-" when no - // CHAIN_ID var) so chaos sei.io/node selectors stay valid. - Name string - - // TemplatePath is the on-disk path to the Go text/template producing - // ONE kind: SeiNode YAML. Rendered once per replica with .ORDINAL and - // .NODE_NAME injected. Required. - TemplatePath string - - // Vars are the template's substitution context (the .KEY map). Missing - // keys referenced by the template fail rendering. The runtime injects - // .ORDINAL and .NODE_NAME per replica; a --var collision on either is - // rejected (mirrors the runner's --var NODE= guard). - Vars map[string]string - - // Replicas is N: the number of follower SeiNode CRs to fan out. >=1. - Replicas int - - // Network is the genesis SeiNetwork to follow. When set, the runtime - // (a) synthesizes a LabelPeerSource selecting sei.io/seinetwork= - // and (b) stamps the sei.io/seinetwork= object label. - Network string - - // NetworkNamespace is the namespace of the genesis SeiNetwork for the - // synthesized peer selector. Defaults to the Workflow namespace. - NetworkNamespace string - - // RunningTimeout bounds the wait for all N SeiNodes to reach PhaseRunning. - RunningTimeout time.Duration - - // FirstBlockTimeout bounds the post-Running readiness probe (the TM caught-up - // stage and the EVM eth_blockNumber stage), per node. - FirstBlockTimeout time.Duration - - // PollInterval is the interval between SeiNode status reads (waitForRunning). - // The readiness RPC probes are paced by the SDK's own cadence. - PollInterval time.Duration - - // HTTPClient overrides the RPC client; nil means http.DefaultClient. - HTTPClient *http.Client - - // Workflow is the parent Chaos Mesh Workflow identity (downward-API). - Workflow taskruntime.WorkflowIdentity -} - -// Result is the post-Run summary, returned so main can log it before exit. -type Result struct { - // Names are the N created SeiNode names, ordinal-ordered. - Names []string - // ChainID is the resolved chain ID published as CHAIN_ID. - ChainID string - // EVMRPCList is the assembled _EVM_RPC_LIST CSV. - EVMRPCList string -} - -// Run renders the template N times, creates N SeiNode followers with an -// ownerRef to the parent Workflow, waits for all to reach PhaseRunning, runs -// the per-node two-stage readiness probe, then publishes role-scoped endpoints. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if err := validateParams(p); err != nil { - return Result{}, err - } - p = withDefaults(p) - - names := make([]string, 0, p.Replicas) - for ordinal := 0; ordinal < p.Replicas; ordinal++ { - node, err := renderNode(p, ordinal) - if err != nil { - return Result{}, taskruntime.Task(fmt.Errorf("rendering template %s (ordinal %d): %w", p.TemplatePath, ordinal, err)) - } - stampMetadata(node, p, ordinal) - - if err := c.Create(ctx, node, fieldOwner); err != nil { - if !apierrors.IsAlreadyExists(err) { - return Result{}, taskruntime.Infra(fmt.Errorf("creating SeiNode %s/%s: %w", node.Namespace, node.Name, err)) - } - // Re-runs land here. Surface drift loudly so an operator who - // edited the template since the original Create knows the cluster - // is still at the original spec — we don't force-apply. - warnIfDrift(ctx, c, node) - } - names = append(names, node.Name) - } - - // Wait for all N to reach Running under one shared deadline. - if err := waitForRunning(ctx, c, p.Workflow.Namespace, names, p.RunningTimeout, p.PollInterval); err != nil { - return Result{}, err - } - - // Re-read each node post-Running for its .status.endpoint, then run the - // two-stage readiness probe before publishing. - nodes := make([]*seiv1alpha1.SeiNode, 0, len(names)) - httpClient := p.HTTPClient - if httpClient == nil { - httpClient = http.DefaultClient - } - for _, name := range names { - node := &seiv1alpha1.SeiNode{} - if err := c.Get(ctx, types.NamespacedName{Namespace: p.Workflow.Namespace, Name: name}, node); err != nil { - return Result{}, taskruntime.Infra(fmt.Errorf("re-reading SeiNode %s post-Running: %w", name, err)) - } - ep := node.Status.Endpoint - if ep == nil || ep.TendermintRpc == "" { - return Result{}, taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.tendermintRpc empty", name)) - } - // Stage 2 — TM readiness: the follower has joined consensus and is caught - // up (height>1 && catching_up==false), via the SDK's shared primitive. - if err := waitReady(ctx, httpClient, sei.WaitCaughtUp, ep.TendermintRpc, name, p.FirstBlockTimeout); err != nil { - return Result{}, err - } - // Stage 3 — EVM readiness: the JSON-RPC listener is bound before its URL - // enters RPC_EVM_RPC_LIST. A caught-up TM does NOT prove the EVM listener serves. - if ep.EvmJsonRpc == "" { - return Result{}, taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.evmJsonRpc empty", name)) - } - if err := waitReady(ctx, httpClient, sei.WaitEVMServing, ep.EvmJsonRpc, name, p.FirstBlockTimeout); err != nil { - return Result{}, err - } - nodes = append(nodes, node) - } - - chainID := p.Vars[string(taskruntime.KeyChainID)] - if chainID == "" && len(nodes) > 0 { - chainID = nodes[0].Spec.ChainID - } - - evmList, err := publishEndpoints(ctx, c, p.Workflow, p.Role, chainID, nodes) - if err != nil { - return Result{}, err - } - return Result{Names: names, ChainID: chainID, EVMRPCList: evmList}, nil -} - -func validateParams(p Params) error { - switch { - case p.Role == "": - return fmt.Errorf("provision-node: --role is required") - case p.TemplatePath == "": - return fmt.Errorf("provision-node: --template is required") - case p.Replicas < 1: - return fmt.Errorf("provision-node: --replicas must be >= 1, got %d", p.Replicas) - case p.Workflow.Name == "" || p.Workflow.Namespace == "": - return fmt.Errorf("provision-node: workflow identity not loaded") - } - // The runtime injects .ORDINAL and .NODE_NAME per replica; a --var on - // either would silently shadow them. Reject, mirroring the runner's - // --var NODE= guard under --per-node-selector. - if _, ok := p.Vars["ORDINAL"]; ok { - return fmt.Errorf("provision-node: --var ORDINAL=... collides with the runtime-injected .ORDINAL") - } - if _, ok := p.Vars["NODE_NAME"]; ok { - return fmt.Errorf("provision-node: --var NODE_NAME=... collides with the runtime-injected .NODE_NAME") - } - return nil -} - -func withDefaults(p Params) Params { - if p.Name == "" { - base := p.Workflow.Name - if cid := p.Vars[string(taskruntime.KeyChainID)]; cid != "" { - base = cid - } - p.Name = base + "-" + p.Role - } - if p.NetworkNamespace == "" { - p.NetworkNamespace = p.Workflow.Namespace - } - if p.RunningTimeout == 0 { - p.RunningTimeout = 15 * time.Minute - } - if p.FirstBlockTimeout == 0 { - p.FirstBlockTimeout = 5 * time.Minute - } - if p.PollInterval == 0 { - p.PollInterval = 5 * time.Second - } - return p -} - -// renderNode parses the template, executes it against the caller's vars plus -// the runtime-injected .ORDINAL and .NODE_NAME, then strict-unmarshals the -// rendered bytes into a SeiNode so field typos fail here, not at Create time. -func renderNode(p Params, ordinal int) (*seiv1alpha1.SeiNode, error) { - raw, err := os.ReadFile(p.TemplatePath) - if err != nil { - return nil, fmt.Errorf("read: %w", err) - } - tmpl, err := template.New(p.TemplatePath).Option("missingkey=error").Parse(string(raw)) - if err != nil { - return nil, fmt.Errorf("parse: %w", err) - } - ctxVars := make(map[string]string, len(p.Vars)+2) - maps.Copy(ctxVars, p.Vars) - ctxVars["ORDINAL"] = strconv.Itoa(ordinal) - ctxVars["NODE_NAME"] = nodeName(p.Name, ordinal) - - var buf bytes.Buffer - if err := tmpl.Execute(&buf, ctxVars); err != nil { - return nil, fmt.Errorf("execute: %w", err) - } - out := &seiv1alpha1.SeiNode{} - if err := yaml.UnmarshalStrict(buf.Bytes(), out); err != nil { - return nil, fmt.Errorf("unmarshal rendered yaml: %w", err) - } - return out, nil -} - -func nodeName(base string, ordinal int) string { - return base + "-" + strconv.Itoa(ordinal) -} - -// stampMetadata overwrites metadata fields the template MUST NOT control, -// stamps the shared object-label producer contract (§2.2a), and appends the -// synthesized peer source (§3). OwnerReferences are assigned (not appended) -// so a template that smuggles a bogus ref can't leak through. -func stampMetadata(node *seiv1alpha1.SeiNode, p Params, ordinal int) { - node.APIVersion = seiv1alpha1.GroupVersion.String() - node.Kind = "SeiNode" - node.Name = nodeName(p.Name, ordinal) - node.Namespace = p.Workflow.Namespace - node.OwnerReferences = []metav1.OwnerReference{p.Workflow.OwnerRef()} - - // Object-label producer contract — identical to `seictl node apply`. - if node.Labels == nil { - node.Labels = map[string]string{} - } - node.Labels[labelRole] = roleValueNode - if p.Network != "" { - node.Labels[labelSeiNetwork] = p.Network - } - - // Peer auto-wiring: synthesize the genesis-pool label source. Appended - // (not assigned) so a template's own static seed peers compose naturally. - if p.Network != "" { - node.Spec.Peers = append(node.Spec.Peers, seiv1alpha1.PeerSource{ - Label: &seiv1alpha1.LabelPeerSource{ - Selector: map[string]string{labelSeiNetwork: p.Network}, - Namespace: p.NetworkNamespace, - }, - }) - } -} - -// warnIfDrift logs when a re-run finds the on-cluster SeiNode.Spec different -// from the freshly-rendered one. Operators who edited the template since the -// original Create need to know the cluster still has the old spec. -func warnIfDrift(ctx context.Context, c client.Client, fresh *seiv1alpha1.SeiNode) { - existing := &seiv1alpha1.SeiNode{} - if err := c.Get(ctx, types.NamespacedName{Namespace: fresh.Namespace, Name: fresh.Name}, existing); err != nil { - return - } - if reflect.DeepEqual(existing.Spec, fresh.Spec) { - return - } - fmt.Fprintf(os.Stderr, "WARN: SeiNode %s/%s exists with spec different from rendered template; reusing on-cluster spec\n", fresh.Namespace, fresh.Name) -} - -// waitForRunning polls each of the named SeiNodes until .status.phase == -// PhaseRunning, failing fast on PhaseFailed. All N share one deadline. -func waitForRunning(ctx context.Context, c client.Client, ns string, names []string, timeout, interval time.Duration) error { - return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - for _, name := range names { - node := &seiv1alpha1.SeiNode{} - if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, node); err != nil { - if apierrors.IsNotFound(err) { - return false, nil - } - return false, taskruntime.Infra(fmt.Errorf("reading SeiNode %s: %w", name, err)) - } - switch node.Status.Phase { - case seiv1alpha1.PhaseRunning: - // this node done; check the rest - case seiv1alpha1.PhaseFailed: - return false, taskruntime.Task(fmt.Errorf("SeiNode %s reached Failed phase", name)) - default: - return false, nil - } - } - return true, nil - }) -} - -// waitReady runs an SDK readiness probe under a per-node deadline, mapping a -// failure into the workflow's Infra error class. The probe logic (TM caught-up, -// EVM serving) is the SDK's shared primitive — provisionnode no longer carries -// its own copy. -func waitReady(ctx context.Context, hc *http.Client, probe func(context.Context, *http.Client, string) error, url, node string, timeout time.Duration) error { - wctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - if err := probe(wctx, hc, url); err != nil { - return taskruntime.Infra(fmt.Errorf("SeiNode %s: %w", node, err)) - } - return nil -} - -// publishEndpoints assembles all five workflow-vars keys from the N per-node -// .status.endpoint scalars (a standalone SeiNode has no fleet aggregate) and -// writes them. Returns the assembled EVM CSV for the Result summary. -// -// Empty-guard (§6.4): every node's evmJsonRpc must be non-empty (a missing -// follower endpoint is a provisioning fault, not a filterable condition), and -// node-0's tendermintRpc must be non-empty before it feeds _TM_RPC / -// _REST (guards a future non-EVM role from emitting a garbage URL the -// chaos wait-for-caught-up probe would curl). -func publishEndpoints(ctx context.Context, c client.Client, w taskruntime.WorkflowIdentity, role, chainID string, nodes []*seiv1alpha1.SeiNode) (string, error) { - if len(nodes) == 0 { - return "", taskruntime.Infra(fmt.Errorf("provision-node: no SeiNodes to publish")) - } - - urls := make([]string, 0, len(nodes)) - for _, n := range nodes { // nodes ordered 0..N-1 - ep := n.Status.Endpoint - if ep == nil || ep.EvmJsonRpc == "" { - return "", taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.evmJsonRpc empty", n.Name)) - } - urls = append(urls, ep.EvmJsonRpc) - } - - node0 := nodes[0].Status.Endpoint - if node0.TendermintRpc == "" { - return "", taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.tendermintRpc empty", nodes[0].Name)) - } - if node0.TendermintRest == "" { - return "", taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.tendermintRest empty", nodes[0].Name)) - } - - evmList := strings.Join(urls, ",") - - if err := taskruntime.EnsureWorkflowVarsCM(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyRunID: w.Name, - }); err != nil { - return "", err - } - vars := map[taskruntime.VarKey]string{ - // CHAIN_ID lives in SetVars (merge), not the EnsureWorkflowVarsCM seed - // (no-op on AlreadyExists): the genesis provision step creates the CM - // first, so a CHAIN_ID seed here would be silently dropped. - taskruntime.KeyChainID: chainID, - taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPCList): evmList, - taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPC): node0.EvmJsonRpc, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintRPC): node0.TendermintRpc, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintREST): node0.TendermintRest, - } - if err := taskruntime.SetVars(ctx, c, w, vars); err != nil { - return "", err - } - return evmList, nil -} diff --git a/internal/seitask/provisionnode/provision_test.go b/internal/seitask/provisionnode/provision_test.go deleted file mode 100644 index d804bfe9..00000000 --- a/internal/seitask/provisionnode/provision_test.go +++ /dev/null @@ -1,618 +0,0 @@ -package provisionnode - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "os" - "path/filepath" - "sync" - "sync/atomic" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const ( - testNamespace = "nightly" - testWorkflowName = "wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" - testRole = "rpc" - testChainID = "bench-1" - testImage = "ghcr.io/sei/sei-chain:abc123" - testNetwork = "bench-1" - varKeyChainID = "CHAIN_ID" - varKeyImage = "IMAGE" - - testBase = testChainID + "-" + testRole // "bench-1-rpc" - testNode0 = testBase + "-0" // "bench-1-rpc-0" - testNode1 = testBase + "-1" // "bench-1-rpc-1" - - tmSyncInfoField = "sync_info" - tmHeightField = "latest_block_height" -) - -const fullNodeTmpl = `apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER -spec: - chainId: {{ .CHAIN_ID }} - image: {{ .IMAGE }} - fullNode: {} -` - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - for _, add := range []func(*runtime.Scheme) error{ - corev1.AddToScheme, - seiv1alpha1.AddToScheme, - } { - if err := add(s); err != nil { - t.Fatal(err) - } - } - return s -} - -func writeTmpl(t *testing.T, body string) string { - t.Helper() - dir := t.TempDir() - p := filepath.Join(dir, "node.yaml.tmpl") - if err := os.WriteFile(p, []byte(body), 0o600); err != nil { - t.Fatal(err) - } - return p -} - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -func baseParams() Params { - return Params{ - Role: testRole, - Name: testChainID + "-" + testRole, - TemplatePath: "x.yaml.tmpl", - Replicas: 2, - Workflow: testWorkflow(), - } -} - -// --- renderNode ----------------------------------------------------------- - -func TestRenderNode_SubstitutesVarsAndInjectsOrdinal(t *testing.T) { - path := writeTmpl(t, `apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: {{ .NODE_NAME }} -spec: - chainId: {{ .CHAIN_ID }} - image: {{ .IMAGE }} - fullNode: {} - overrides: - ordinal: "{{ .ORDINAL }}" -`) - p := baseParams() - p.Name = testBase - p.TemplatePath = path - p.Vars = map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - - node, err := renderNode(p, 1) - if err != nil { - t.Fatalf("renderNode: %v", err) - } - if node.Spec.ChainID != testChainID { - t.Errorf("ChainID = %q", node.Spec.ChainID) - } - if node.Spec.Image != testImage { - t.Errorf("Image = %q", node.Spec.Image) - } - // .NODE_NAME and .ORDINAL injected for ordinal 1. - if node.Name != testNode1 { - t.Errorf("NODE_NAME injection: name = %q, want bench-1-rpc-1", node.Name) - } - if got := node.Spec.Overrides["ordinal"]; got != "1" { - t.Errorf("ORDINAL injection: overrides[ordinal] = %q, want 1", got) - } -} - -func TestRenderNode_MissingVarFailsRender(t *testing.T) { - path := writeTmpl(t, fullNodeTmpl) - p := baseParams() - p.TemplatePath = path - p.Vars = map[string]string{varKeyChainID: testChainID} // IMAGE missing - if _, err := renderNode(p, 0); err == nil { - t.Fatalf("expected error: IMAGE not provided") - } -} - -func TestRenderNode_StrictUnmarshalCatchesTypos(t *testing.T) { - path := writeTmpl(t, `apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER -spec: - chainId: {{ .CHAIN_ID }} - imagge: {{ .IMAGE }} - fullNode: {} -`) - p := baseParams() - p.TemplatePath = path - p.Vars = map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - if _, err := renderNode(p, 0); err == nil { - t.Fatalf("expected strict-unmarshal error on `imagge` typo") - } -} - -// --- validateParams (collision guards) ------------------------------------ - -func TestValidateParams(t *testing.T) { - full := Params{ - Role: testRole, - TemplatePath: "x.yaml.tmpl", - Replicas: 2, - Workflow: testWorkflow(), - } - cases := []struct { - name string - mut func(*Params) - want bool - }{ - {"complete", func(*Params) {}, false}, - {"missing role", func(p *Params) { p.Role = "" }, true}, - {"missing template", func(p *Params) { p.TemplatePath = "" }, true}, - {"replicas zero", func(p *Params) { p.Replicas = 0 }, true}, - {"missing workflow.Name", func(p *Params) { p.Workflow.Name = "" }, true}, - {"ORDINAL collision", func(p *Params) { p.Vars = map[string]string{"ORDINAL": "x"} }, true}, - {"NODE_NAME collision", func(p *Params) { p.Vars = map[string]string{"NODE_NAME": "x"} }, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - p := full - tc.mut(&p) - err := validateParams(p) - if (err != nil) != tc.want { - t.Fatalf("validateParams err=%v wantErr=%v", err, tc.want) - } - }) - } -} - -// --- stampMetadata (object labels + peer wiring) -------------------------- - -func TestStampMetadata_NamingAndOwnerRef(t *testing.T) { - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{ - // Template author smuggling a bogus ownerRef: must be overwritten. - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "evil/v1", Kind: "Bogus", Name: "smuggled", UID: "bad", - }}, - }, - } - p := baseParams() - p.Name = testBase - stampMetadata(node, p, 1) - - if node.Name != testNode1 { - t.Fatalf("name = %q, want bench-1-rpc-1", node.Name) - } - if node.Namespace != testNamespace { - t.Fatalf("namespace = %q", node.Namespace) - } - if len(node.OwnerReferences) != 1 || node.OwnerReferences[0].Kind != "Workflow" { - t.Fatalf("ownerRef not replaced: %+v", node.OwnerReferences) - } -} - -func TestStampMetadata_ObjectLabels_WithNetwork(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = testNetwork - stampMetadata(node, p, 0) - - if got := node.Labels[labelRole]; got != roleValueNode { - t.Errorf("label %s = %q, want %q", labelRole, got, roleValueNode) - } - if got := node.Labels[labelSeiNetwork]; got != testNetwork { - t.Errorf("label %s = %q, want %q", labelSeiNetwork, got, testNetwork) - } - // Producer-contract literals — must match WS-A's seictl node apply. - if labelRole != "sei.io/role" || roleValueNode != "node" || labelSeiNetwork != "sei.io/seinetwork" { - t.Fatalf("object-label producer contract drifted: %s=%s, %s", labelRole, roleValueNode, labelSeiNetwork) - } -} - -func TestStampMetadata_ObjectLabels_NoNetwork_OmitsNetworkLabel(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = "" // no --network - stampMetadata(node, p, 0) - - if got := node.Labels[labelRole]; got != roleValueNode { - t.Errorf("label %s = %q, want %q (unconditional)", labelRole, got, roleValueNode) - } - if _, ok := node.Labels[labelSeiNetwork]; ok { - t.Errorf("label %s present without --network; must be OMITTED, not stamped empty", labelSeiNetwork) - } -} - -func TestStampMetadata_PeerWiring_WithNetwork(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = testNetwork - p.NetworkNamespace = "genesis-ns" - stampMetadata(node, p, 0) - - if len(node.Spec.Peers) != 1 { - t.Fatalf("peers = %d, want 1 synthesized", len(node.Spec.Peers)) - } - lbl := node.Spec.Peers[0].Label - if lbl == nil { - t.Fatalf("synthesized peer is not a LabelPeerSource: %+v", node.Spec.Peers[0]) - } - if got := lbl.Selector[labelSeiNetwork]; got != testNetwork { - t.Errorf("peer selector %s = %q, want %q", labelSeiNetwork, got, testNetwork) - } - if lbl.Namespace != "genesis-ns" { - t.Errorf("peer namespace = %q, want genesis-ns", lbl.Namespace) - } -} - -func TestStampMetadata_NoNetwork_NoSynthesizedPeer(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = "" - stampMetadata(node, p, 0) - if len(node.Spec.Peers) != 0 { - t.Fatalf("peers = %d, want 0 (no --network)", len(node.Spec.Peers)) - } -} - -func TestStampMetadata_PreservesTemplatePeer_Appends(t *testing.T) { - node := &seiv1alpha1.SeiNode{ - Spec: seiv1alpha1.SeiNodeSpec{ - Peers: []seiv1alpha1.PeerSource{{ - Static: &seiv1alpha1.StaticPeerSource{Addresses: []string{"id@1.2.3.4:26656"}}, - }}, - }, - } - p := baseParams() - p.Network = testNetwork - stampMetadata(node, p, 0) - - if len(node.Spec.Peers) != 2 { - t.Fatalf("peers = %d, want 2 (template static + synthesized label)", len(node.Spec.Peers)) - } - if node.Spec.Peers[0].Static == nil { - t.Errorf("template static peer not preserved as first element: %+v", node.Spec.Peers[0]) - } - if node.Spec.Peers[1].Label == nil { - t.Errorf("synthesized label peer not appended as second element: %+v", node.Spec.Peers[1]) - } -} - -// --- waitForRunning ------------------------------------------------------- - -func TestWaitForRunning(t *testing.T) { - cases := []struct { - name string - phase seiv1alpha1.SeiNodePhase - wantErr bool - taskErr bool // expect a task-class (terminal) error - }{ - {"running", seiv1alpha1.PhaseRunning, false, false}, - {"failed", seiv1alpha1.PhaseFailed, true, true}, - {"pending times out", seiv1alpha1.PhasePending, true, false}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: testNode0, Namespace: testNamespace}, - Status: seiv1alpha1.SeiNodeStatus{Phase: tc.phase}, - } - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(node). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - Build() - - err := waitForRunning(context.Background(), c, testNamespace, - []string{testNode0}, 200*time.Millisecond, 20*time.Millisecond) - if (err != nil) != tc.wantErr { - t.Fatalf("err=%v wantErr=%v", err, tc.wantErr) - } - if tc.taskErr && taskruntime.ExitCodeFor(err) != taskruntime.ExitTaskFailure { - t.Fatalf("Failed phase should yield a task-class error, got exit code %d", taskruntime.ExitCodeFor(err)) - } - }) - } -} - -// --- readiness probe (stage 2 TM, stage 3 EVM) ---------------------------- - -// Readiness probes (TM caught-up, EVM serving) moved to the SDK -// (sdk/sei/readiness_test.go); the Run-level tests below exercise them in situ. - -// TestRun_PublishBlockedWhileEVMDialFails is the finding-2 gate: TM reports -// height>0 but the EVM listener never binds, so publish must NOT proceed and -// no workflow-vars are written. -func TestRun_PublishBlockedWhileEVMDialFails(t *testing.T) { - w := testWorkflow() - tmplPath := writeTmpl(t, fullNodeTmpl) - vars := map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - - // TM /status answers height>0; EVM POST always 503 (never bound). - var tmHits atomic.Int32 - srv := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { - if r.Method == http.MethodGet { // TM /status - tmHits.Add(1) - _ = json.NewEncoder(rw).Encode(map[string]any{ - tmSyncInfoField: map[string]any{tmHeightField: "9"}, - }) - return - } - rw.WriteHeader(http.StatusServiceUnavailable) // EVM eth_blockNumber - })) - defer srv.Close() - - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: testChainID + "-" + testRole + "-0", Namespace: testNamespace}, - Spec: seiv1alpha1.SeiNodeSpec{ChainID: testChainID, FullNode: &seiv1alpha1.FullNodeSpec{}}, - Status: seiv1alpha1.SeiNodeStatus{ - Phase: seiv1alpha1.PhaseRunning, - Endpoint: &seiv1alpha1.NodeEndpointStatus{ - TendermintRpc: srv.URL, - TendermintRest: "http://rest.svc:1317", - EvmJsonRpc: srv.URL, // 503 path - }, - }, - } - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(node). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - Build() - - _, err := Run(context.Background(), c, Params{ - Role: testRole, - TemplatePath: tmplPath, - Vars: vars, - Replicas: 1, - RunningTimeout: time.Second, - FirstBlockTimeout: 150 * time.Millisecond, - PollInterval: 20 * time.Millisecond, - HTTPClient: srv.Client(), - Workflow: w, - }) - if err == nil { - t.Fatalf("Run should fail when EVM never binds even at TM height>0") - } - if tmHits.Load() == 0 { - t.Fatalf("TM stage was never reached") - } - // No workflow-vars CM must be written — publish was blocked. - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err == nil { - t.Fatalf("workflow-vars CM was written despite blocked publish: %+v", cm.Data) - } -} - -// --- publish assembly (the contract test) --------------------------------- - -func fakeNode(name, evm, tmRPC, tmREST string) *seiv1alpha1.SeiNode { - return &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: testNamespace}, - Status: seiv1alpha1.SeiNodeStatus{ - Phase: seiv1alpha1.PhaseRunning, - Endpoint: &seiv1alpha1.NodeEndpointStatus{ - EvmJsonRpc: evm, - TendermintRpc: tmRPC, - TendermintRest: tmREST, - }, - }, - } -} - -func TestPublishEndpoints_AssemblesAllFiveKeys(t *testing.T) { - w := testWorkflow() - nodes := []*seiv1alpha1.SeiNode{ - fakeNode("bench-1-rpc-0", "http://bench-1-rpc-0.nightly.svc:8545", "http://bench-1-rpc-0.nightly.svc:26657", "http://bench-1-rpc-0.nightly.svc:1317"), - fakeNode("bench-1-rpc-1", "http://bench-1-rpc-1.nightly.svc:8545", "http://bench-1-rpc-1.nightly.svc:26657", "http://bench-1-rpc-1.nightly.svc:1317"), - } - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - - evmList, err := publishEndpoints(context.Background(), c, w, testRole, testChainID, nodes) - if err != nil { - t.Fatalf("publishEndpoints: %v", err) - } - wantList := "http://bench-1-rpc-0.nightly.svc:8545,http://bench-1-rpc-1.nightly.svc:8545" - if evmList != wantList { - t.Fatalf("returned EVM list = %q, want %q", evmList, wantList) - } - - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("get CM: %v", err) - } - want := map[string]string{ - "CHAIN_ID": testChainID, - "RPC_EVM_RPC_LIST": wantList, - "RPC_EVM_RPC": "http://bench-1-rpc-0.nightly.svc:8545", // node-0 scalar - "RPC_TM_RPC": "http://bench-1-rpc-0.nightly.svc:26657", - "RPC_REST": "http://bench-1-rpc-0.nightly.svc:1317", - } - for k, v := range want { - if cm.Data[k] != v { - t.Errorf("CM[%s] = %q, want %q", k, cm.Data[k], v) - } - } -} - -func TestPublishEndpoints_EmptyGuards(t *testing.T) { - w := testWorkflow() - cases := []struct { - name string - nodes []*seiv1alpha1.SeiNode - }{ - { - "nil endpoint", - []*seiv1alpha1.SeiNode{{ObjectMeta: metav1.ObjectMeta{Name: "n0"}, Status: seiv1alpha1.SeiNodeStatus{Phase: seiv1alpha1.PhaseRunning}}}, - }, - { - "empty evmJsonRpc on node-1", - []*seiv1alpha1.SeiNode{ - fakeNode("n0", "http://n0:8545", "http://n0:26657", "http://n0:1317"), - fakeNode("n1", "", "http://n1:26657", "http://n1:1317"), - }, - }, - { - "empty tendermintRpc on node-0 (finding 6c)", - []*seiv1alpha1.SeiNode{fakeNode("n0", "http://n0:8545", "", "http://n0:1317")}, - }, - { - "empty tendermintRest on node-0", - []*seiv1alpha1.SeiNode{fakeNode("n0", "http://n0:8545", "http://n0:26657", "")}, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - _, err := publishEndpoints(context.Background(), c, w, testRole, testChainID, tc.nodes) - if err == nil { - t.Fatalf("expected infra-fail empty-guard error") - } - if taskruntime.ExitCodeFor(err) != taskruntime.ExitInfraError { - t.Fatalf("empty-guard should be infra-class, got exit code %d", taskruntime.ExitCodeFor(err)) - } - }) - } -} - -// --- Run end-to-end fan-out (naming + happy publish) ---------------------- - -func TestRun_FanOutNamingAndPublish(t *testing.T) { - w := testWorkflow() - tmplPath := writeTmpl(t, fullNodeTmpl) - vars := map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - - // Healthy TM + EVM for every node. - srv := healthyRPCServer(t) - defer srv.Close() - - // Pre-stage N=2 SeiNodes already Running with endpoints (the controller's - // job; we test seitask's wait+probe+publish, not reconcile). - objs := make([]*seiv1alpha1.SeiNode, 0, 2) - for i := range 2 { - n := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: nodeName(testChainID+"-"+testRole, i), Namespace: testNamespace}, - Spec: seiv1alpha1.SeiNodeSpec{ChainID: testChainID, FullNode: &seiv1alpha1.FullNodeSpec{}}, - Status: seiv1alpha1.SeiNodeStatus{ - Phase: seiv1alpha1.PhaseRunning, - Endpoint: &seiv1alpha1.NodeEndpointStatus{ - EvmJsonRpc: srv.URL, - TendermintRpc: srv.URL, - TendermintRest: "http://rest.svc:1317", - }, - }, - } - objs = append(objs, n) - } - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(objs[0], objs[1]). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - Build() - - res, err := Run(context.Background(), c, Params{ - Role: testRole, - Name: testChainID + "-" + testRole, - TemplatePath: tmplPath, - Vars: vars, - Replicas: 2, - Network: testNetwork, - RunningTimeout: time.Second, - FirstBlockTimeout: time.Second, - PollInterval: 10 * time.Millisecond, - HTTPClient: srv.Client(), - Workflow: w, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - wantNames := []string{testNode0, testNode1} - if len(res.Names) != 2 || res.Names[0] != wantNames[0] || res.Names[1] != wantNames[1] { - t.Fatalf("fan-out names = %v, want %v", res.Names, wantNames) - } - - // Pre-staged objects already exist (the AlreadyExists path), so the - // object-label producer contract is exercised by the stampMetadata unit - // tests; here we assert fan-out naming (above) and the publish CM (below). - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("get CM: %v", err) - } - if cm.Data["CHAIN_ID"] != testChainID { - t.Errorf("CHAIN_ID = %q", cm.Data["CHAIN_ID"]) - } - if cm.Data["RPC_EVM_RPC_LIST"] == "" { - t.Errorf("RPC_EVM_RPC_LIST empty") - } -} - -func healthyRPCServer(t *testing.T) *httptest.Server { - t.Helper() - var mu sync.Mutex - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - mu.Lock() - defer mu.Unlock() - if r.Method == http.MethodGet { // TM /status - _ = json.NewEncoder(w).Encode(map[string]any{ - tmSyncInfoField: map[string]any{tmHeightField: "12"}, - }) - return - } - _ = json.NewEncoder(w).Encode(map[string]any{"jsonrpc": "2.0", "id": 1, "result": "0x10"}) - })) - t.Cleanup(srv.Close) - return srv -} - -// TestBundledTemplates_RenderClean ensures every bundled SeiNode (rpc follower) -// scenario template renders + strict-unmarshals against a representative --var -// set. Guards against template-vs-schema drift after a CRD field rename. The -// SeiNetwork genesis/validator templates are validated in provisionsnd. -func TestBundledTemplates_RenderClean(t *testing.T) { - repoRoot, err := filepath.Abs("../../../") - if err != nil { - t.Fatal(err) - } - for _, scenario := range []string{"load-test", "release-test"} { - t.Run(scenario+"/rpc.yaml.tmpl", func(t *testing.T) { - p := Params{ - TemplatePath: filepath.Join(repoRoot, "scenarios", scenario, "rpc.yaml.tmpl"), - Vars: map[string]string{varKeyChainID: testChainID, varKeyImage: testImage}, - } - node, err := renderNode(p, 0) - if err != nil { - t.Fatalf("render: %v", err) - } - if node.Spec.ChainID != testChainID { - t.Fatalf("chainId = %q, want %q", node.Spec.ChainID, testChainID) - } - if node.Spec.FullNode == nil { - t.Fatalf("rpc template must render a fullNode SeiNode; spec = %+v", node.Spec) - } - }) - } -} diff --git a/internal/seitask/provisionsnd/provision.go b/internal/seitask/provisionsnd/provision.go deleted file mode 100644 index e1340180..00000000 --- a/internal/seitask/provisionsnd/provision.go +++ /dev/null @@ -1,308 +0,0 @@ -// Package provisionsnd implements `seitask provision-snd`: render a Go -// template to a SeiNetwork YAML, stamp an ownerRef to the parent -// Workflow, Create it, await Ready, poll the chain RPC for first block, -// then publish endpoints to workflow-vars under role-scoped keys -// (VALIDATOR_TM_RPC, RPC_EVM_RPC, etc.). -// -// Templates are scenario-intrinsic: the full SeiNetwork shape (mode, overrides, -// peers, genesis ceremony) lives in the template body as proper YAML. -// Per-run scalars (CHAIN_ID, IMAGE, ADMIN_ADDRESS, ...) flow in via --var -// and resolve at render time. Same `--template + --var` contract as the -// runner subcommand. -package provisionsnd - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "os" - "reflect" - "strings" - "text/template" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/yaml" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const fieldOwner client.FieldOwner = "seitask-provision-snd" - -// Params carries the typed inputs to Run. -type Params struct { - // Role tags the workflow-vars keys this Task writes (e.g. "validator", - // "rpc"). Required for scenarios with multiple provision-snd Tasks; - // values get uppercased to compose VALIDATOR_TM_RPC etc. - Role string - - // Name is the SeiNetwork metadata.name. Defaults to - // "-" when empty. - Name string - - // TemplatePath is the on-disk path to the Go text/template producing - // a SeiNetwork YAML. Required. - TemplatePath string - - // Vars are the template's substitution context (the .KEY map in - // template syntax). Missing keys referenced by the template fail - // rendering rather than silently expanding to empty strings. - Vars map[string]string - - // ReadyTimeout bounds the wait for status.phase=Ready. - ReadyTimeout time.Duration - - // FirstBlockTimeout bounds the post-Ready wait for the chain to produce - // its first block. - FirstBlockTimeout time.Duration - - // PollInterval is the interval between status reads and chain RPC reads. - PollInterval time.Duration - - // HTTPClient overrides the chain-RPC client; nil means http.DefaultClient. - // Tests use this seam. - HTTPClient *http.Client - - // Workflow is the parent Chaos Mesh Workflow identity (downward-API). - Workflow taskruntime.WorkflowIdentity -} - -// Result is the post-Run summary, returned so main can log it before exit. -type Result struct { - Name string - ChainID string - Endpoints seiv1alpha1.Endpoints -} - -// Run renders the template, creates the SeiNetwork with an ownerRef to the -// parent Workflow, waits for Ready, polls the chain RPC for first block, and -// writes role-scoped endpoints to workflow-vars. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if err := validateParams(p); err != nil { - return Result{}, err - } - p = withDefaults(p) - - snd, err := renderTemplate(p.TemplatePath, p.Vars) - if err != nil { - return Result{}, taskruntime.Task(fmt.Errorf("rendering template %s: %w", p.TemplatePath, err)) - } - stampMetadata(snd, p) - - if err := c.Create(ctx, snd, fieldOwner); err != nil { - if !apierrors.IsAlreadyExists(err) { - return Result{}, taskruntime.Infra(fmt.Errorf("creating SeiNetwork %s/%s: %w", snd.Namespace, snd.Name, err)) - } - // Re-runs land here. Surface drift loudly so an operator who edited - // the template since the original Create knows the cluster is still - // at the original spec — we don't force-apply to avoid clobbering - // hand-edits or in-flight reconciliation. - warnIfDrift(ctx, c, snd) - } - - if err := waitForReady(ctx, c, types.NamespacedName{Namespace: snd.Namespace, Name: snd.Name}, p.ReadyTimeout, p.PollInterval); err != nil { - return Result{}, err - } - - current := &seiv1alpha1.SeiNetwork{} - if err := c.Get(ctx, types.NamespacedName{Namespace: snd.Namespace, Name: snd.Name}, current); err != nil { - return Result{}, taskruntime.Infra(fmt.Errorf("re-reading SeiNetwork post-Ready: %w", err)) - } - if current.Status.Endpoints == nil || current.Status.Endpoints.TendermintRpc == "" { - return Result{}, taskruntime.Infra(fmt.Errorf("SeiNetwork %s reached Ready but .status.endpoints.tendermintRpc is empty", current.Name)) - } - endpoints := *current.Status.Endpoints - chainID := current.Spec.Genesis.ChainID - - httpClient := p.HTTPClient - if httpClient == nil { - httpClient = http.DefaultClient - } - if err := waitForFirstBlock(ctx, httpClient, endpoints.TendermintRpc, p.FirstBlockTimeout, p.PollInterval); err != nil { - return Result{}, err - } - - if err := publishEndpoints(ctx, c, p.Workflow, p.Role, chainID, endpoints); err != nil { - return Result{}, err - } - return Result{Name: snd.Name, ChainID: chainID, Endpoints: endpoints}, nil -} - -func validateParams(p Params) error { - switch { - case p.Role == "": - return fmt.Errorf("provision-snd: --role is required") - case p.TemplatePath == "": - return fmt.Errorf("provision-snd: --template is required") - case p.Workflow.Name == "" || p.Workflow.Namespace == "": - return fmt.Errorf("provision-snd: workflow identity not loaded") - } - return nil -} - -func withDefaults(p Params) Params { - if p.Name == "" { - p.Name = p.Workflow.Name + "-" + p.Role - } - if p.ReadyTimeout == 0 { - p.ReadyTimeout = 15 * time.Minute - } - if p.FirstBlockTimeout == 0 { - p.FirstBlockTimeout = 5 * time.Minute - } - if p.PollInterval == 0 { - p.PollInterval = 5 * time.Second - } - return p -} - -// renderTemplate parses the file at path as a Go text/template, executes it -// against vars (missing keys fail the render — `missingkey=error` option), -// then strict-unmarshals the rendered bytes into a SeiNetwork so -// typos in field names fail here, not at apiserver-Create time. -func renderTemplate(path string, vars map[string]string) (*seiv1alpha1.SeiNetwork, error) { - raw, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("read: %w", err) - } - tmpl, err := template.New(path).Option("missingkey=error").Parse(string(raw)) - if err != nil { - return nil, fmt.Errorf("parse: %w", err) - } - var buf bytes.Buffer - if err := tmpl.Execute(&buf, vars); err != nil { - return nil, fmt.Errorf("execute: %w", err) - } - out := &seiv1alpha1.SeiNetwork{} - if err := yaml.UnmarshalStrict(buf.Bytes(), out); err != nil { - return nil, fmt.Errorf("unmarshal rendered yaml: %w", err) - } - return out, nil -} - -// stampMetadata overwrites metadata fields the template MUST NOT control. -// OwnerReferences are assigned (not appended) so a template that smuggles -// a bogus ref can't leak through. -func stampMetadata(snd *seiv1alpha1.SeiNetwork, p Params) { - snd.APIVersion = seiv1alpha1.GroupVersion.String() - snd.Kind = "SeiNetwork" - snd.Name = p.Name - snd.Namespace = p.Workflow.Namespace - snd.OwnerReferences = []metav1.OwnerReference{p.Workflow.OwnerRef()} -} - -// warnIfDrift logs when a re-run finds the on-cluster SeiNetwork.Spec different -// from the freshly-rendered one. Operators who edited the template since -// the original Create need to know the cluster still has the old spec. -func warnIfDrift(ctx context.Context, c client.Client, fresh *seiv1alpha1.SeiNetwork) { - existing := &seiv1alpha1.SeiNetwork{} - if err := c.Get(ctx, types.NamespacedName{Namespace: fresh.Namespace, Name: fresh.Name}, existing); err != nil { - return - } - if reflect.DeepEqual(existing.Spec, fresh.Spec) { - return - } - fmt.Fprintf(os.Stderr, "WARN: SeiNetwork %s/%s exists with spec different from rendered template; reusing on-cluster spec\n", fresh.Namespace, fresh.Name) -} - -func waitForReady(ctx context.Context, c client.Client, key types.NamespacedName, timeout, interval time.Duration) error { - return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - snd := &seiv1alpha1.SeiNetwork{} - if err := c.Get(ctx, key, snd); err != nil { - if apierrors.IsNotFound(err) { - return false, nil - } - return false, taskruntime.Infra(fmt.Errorf("reading SeiNetwork %s: %w", key, err)) - } - switch snd.Status.Phase { - case seiv1alpha1.GroupPhaseReady: - return true, nil - case seiv1alpha1.GroupPhaseFailed: - return false, taskruntime.Task(fmt.Errorf("SeiNetwork %s reached Failed phase", key)) - } - return false, nil - }) -} - -// tendermintStatusResponse models the subset of Tendermint /status we need. -// Sei's CometBFT fork sometimes returns the body unwrapped (no JSON-RPC -// envelope), so we accept both shapes and fall back via Result/SyncInfo. -type tendermintStatusResponse struct { - Result *struct { - SyncInfo struct { - LatestBlockHeight string `json:"latest_block_height"` - } `json:"sync_info"` - } `json:"result,omitempty"` - SyncInfo struct { - LatestBlockHeight string `json:"latest_block_height"` - } `json:"sync_info"` -} - -func (r *tendermintStatusResponse) latestHeight() string { - if r.Result != nil && r.Result.SyncInfo.LatestBlockHeight != "" { - return r.Result.SyncInfo.LatestBlockHeight - } - return r.SyncInfo.LatestBlockHeight -} - -func waitForFirstBlock(ctx context.Context, hc *http.Client, tmRPC string, timeout, interval time.Duration) error { - return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, tmRPC+"/status", nil) - if err != nil { - return false, taskruntime.Infra(fmt.Errorf("status req: %w", err)) - } - resp, err := hc.Do(req) - if err != nil { - return false, nil - } - defer func() { _ = resp.Body.Close() }() - if resp.StatusCode != http.StatusOK { - return false, nil - } - var parsed tendermintStatusResponse - if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil { - return false, nil - } - h := parsed.latestHeight() - if h == "" || h == "0" { - return false, nil - } - return true, nil - }) -} - -// publishEndpoints assumes one chain-id per Workflow. CHAIN_ID is written via -// SetVars (a merge patch) — running provision-snd twice with the same -// --var=CHAIN_ID is idempotent; running it against two distinct chains -// silently overwrites and needs an explicit conflict check here. -func publishEndpoints(ctx context.Context, c client.Client, w taskruntime.WorkflowIdentity, role, chainID string, ep seiv1alpha1.Endpoints) error { - if err := taskruntime.EnsureWorkflowVarsCM(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyRunID: w.Name, - }); err != nil { - return err - } - vars := map[taskruntime.VarKey]string{ - // CHAIN_ID lives in SetVars (merge), not the EnsureWorkflowVarsCM seed - // (no-op on AlreadyExists): keygen-admin runs first and creates the - // CM, so a CHAIN_ID seed here would be silently dropped. - taskruntime.KeyChainID: chainID, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintRPC): ep.TendermintRpc, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintREST): ep.TendermintRest, - } - if len(ep.Nodes) > 0 { - vars[taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPC)] = ep.Nodes[0].EvmJsonRpc - urls := make([]string, 0, len(ep.Nodes)) - for _, n := range ep.Nodes { - urls = append(urls, n.EvmJsonRpc) - } - vars[taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPCList)] = strings.Join(urls, ",") - } - return taskruntime.SetVars(ctx, c, w, vars) -} diff --git a/internal/seitask/provisionsnd/provision_test.go b/internal/seitask/provisionsnd/provision_test.go deleted file mode 100644 index 6b97fde1..00000000 --- a/internal/seitask/provisionsnd/provision_test.go +++ /dev/null @@ -1,345 +0,0 @@ -package provisionsnd - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "os" - "path/filepath" - "sync" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const ( - testNamespace = "nightly" - testWorkflowName = "wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" - testRole = "validator" - testChainID = "bench-1" - testImage = "ghcr.io/sei/sei-chain:abc123" - testAdminAddress = "sei1admin" - varKeyChainID = "CHAIN_ID" - varKeyImage = "IMAGE" - varKeyAdminAddress = "ADMIN_ADDRESS" -) - -const validatorTmpl = `apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: {{ .IMAGE }} - replicas: 4 - genesis: - chainId: {{ .CHAIN_ID }} - accounts: - - address: {{ .ADMIN_ADDRESS }} - balance: 1000000000000usei -` - -const configOverridesTmpl = `apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: {{ .IMAGE }} - replicas: 2 - genesis: - chainId: {{ .CHAIN_ID }} - configOverrides: - evm.http_port: "8545" -` - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - for _, add := range []func(*runtime.Scheme) error{ - corev1.AddToScheme, - seiv1alpha1.AddToScheme, - } { - if err := add(s); err != nil { - t.Fatal(err) - } - } - return s -} - -func writeTmpl(t *testing.T, body string) string { - t.Helper() - dir := t.TempDir() - p := filepath.Join(dir, "snd.yaml.tmpl") - if err := os.WriteFile(p, []byte(body), 0o600); err != nil { - t.Fatal(err) - } - return p -} - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -func TestRenderTemplate_SubstitutesVars(t *testing.T) { - path := writeTmpl(t, validatorTmpl) - snd, err := renderTemplate(path, map[string]string{ - varKeyChainID: testChainID, - varKeyImage: testImage, - varKeyAdminAddress: testAdminAddress, - }) - if err != nil { - t.Fatalf("renderTemplate: %v", err) - } - if snd.Spec.Image != testImage { - t.Errorf("Image: %q", snd.Spec.Image) - } - if snd.Spec.Genesis.ChainID != testChainID { - t.Errorf("Genesis.ChainID: %q", snd.Spec.Genesis.ChainID) - } - if len(snd.Spec.Genesis.Accounts) != 1 || snd.Spec.Genesis.Accounts[0].Address != testAdminAddress { - t.Errorf("Genesis.Accounts: %+v", snd.Spec.Genesis.Accounts) - } -} - -func TestRenderTemplate_MissingVarFailsRender(t *testing.T) { - path := writeTmpl(t, validatorTmpl) - if _, err := renderTemplate(path, map[string]string{varKeyChainID: testChainID}); err == nil { - t.Fatalf("expected error: IMAGE and ADMIN_ADDRESS not provided") - } -} - -func TestRenderTemplate_StrictUnmarshalCatchesTypos(t *testing.T) { - tmpl := `apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - replcas: 4 - image: {{ .IMAGE }} - genesis: - chainId: {{ .CHAIN_ID }} -` - path := writeTmpl(t, tmpl) - if _, err := renderTemplate(path, map[string]string{varKeyChainID: testChainID, varKeyImage: testImage}); err == nil { - t.Fatalf("expected strict-unmarshal error on `replcas` typo") - } -} - -func TestRenderTemplate_ConfigOverridesSubstitution(t *testing.T) { - path := writeTmpl(t, configOverridesTmpl) - snd, err := renderTemplate(path, map[string]string{ - varKeyChainID: testChainID, - varKeyImage: testImage, - }) - if err != nil { - t.Fatalf("renderTemplate: %v", err) - } - if got := snd.Spec.ConfigOverrides["evm.http_port"]; got != "8545" { - t.Errorf("configOverrides[evm.http_port] = %q; want %q", got, "8545") - } - if snd.Spec.Genesis.ChainID != testChainID { - t.Errorf("Genesis.ChainID = %q; want %q", snd.Spec.Genesis.ChainID, testChainID) - } -} - -// TestBundledTemplates_RenderClean ensures every bundled SeiNetwork scenario -// template (genesis/validator) renders + strict-unmarshals against a -// representative --var set. Guards against template-vs-schema drift after a -// CRD field rename. The SeiNode rpc.yaml.tmpl is validated in provisionnode. -func TestBundledTemplates_RenderClean(t *testing.T) { - repoRoot, err := filepath.Abs("../../../") - if err != nil { - t.Fatal(err) - } - cases := []struct { - path string - vars map[string]string - }{ - { - path: filepath.Join(repoRoot, "scenarios", "release-test", "validator.yaml.tmpl"), - vars: map[string]string{varKeyChainID: "rel-test", varKeyImage: "img:1", varKeyAdminAddress: testAdminAddress}, - }, - } - for _, tc := range cases { - t.Run(filepath.Base(tc.path), func(t *testing.T) { - snd, err := renderTemplate(tc.path, tc.vars) - if err != nil { - t.Fatalf("render: %v", err) - } - if snd.Spec.Genesis.ChainID == "" { - t.Fatalf("genesis.chainId empty after render: %+v", snd.Spec) - } - }) - } -} - -func TestStampMetadata_AssignsOwnerRefsNotAppend(t *testing.T) { - snd := &seiv1alpha1.SeiNetwork{ - // Template author smuggling a bogus ownerRef: stampMetadata MUST - // overwrite, not append. - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "evil/v1", Kind: "Bogus", Name: "smuggled", UID: "bad", - }}, - }, - } - stampMetadata(snd, Params{Role: testRole, Name: testRole, Workflow: testWorkflow()}) - if len(snd.OwnerReferences) != 1 { - t.Fatalf("ownerReferences: want 1, got %d (%+v)", len(snd.OwnerReferences), snd.OwnerReferences) - } - if snd.OwnerReferences[0].Kind != "Workflow" { - t.Fatalf("ownerRef not replaced: %+v", snd.OwnerReferences[0]) - } -} - -func TestValidateParams(t *testing.T) { - full := Params{ - Role: testRole, - TemplatePath: "x.yaml.tmpl", - Workflow: testWorkflow(), - } - cases := []struct { - name string - mut func(*Params) - want bool - }{ - {"complete", func(*Params) {}, false}, - {"missing role", func(p *Params) { p.Role = "" }, true}, - {"missing template", func(p *Params) { p.TemplatePath = "" }, true}, - {"missing workflow.Name", func(p *Params) { p.Workflow.Name = "" }, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - p := full - tc.mut(&p) - err := validateParams(p) - if (err != nil) != tc.want { - t.Fatalf("validateParams err=%v wantErr=%v", err, tc.want) - } - }) - } -} - -func TestRun_EndToEnd_FakeClient(t *testing.T) { - w := testWorkflow() - tmplPath := writeTmpl(t, validatorTmpl) - vars := map[string]string{varKeyChainID: testChainID, varKeyImage: testImage, varKeyAdminAddress: testAdminAddress} - - prestaged, err := renderTemplate(tmplPath, vars) - if err != nil { - t.Fatal(err) - } - stampMetadata(prestaged, Params{Role: testRole, Name: testRole, Workflow: w}) - prestaged.Status.Phase = seiv1alpha1.GroupPhaseReady - prestaged.Status.Endpoints = &seiv1alpha1.Endpoints{ - TendermintRpc: "http://tm.svc:26657", - TendermintRest: "http://rest.svc:1317", - Nodes: []seiv1alpha1.NodeEndpoint{ - {Name: "validator-0", EvmJsonRpc: "http://evm-0.svc:8545"}, - {Name: "validator-1", EvmJsonRpc: "http://evm-1.svc:8545"}, - }, - } - - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(prestaged). - WithStatusSubresource(&seiv1alpha1.SeiNetwork{}). - Build() - - srv := fakeStatusServer(t, "42") - defer srv.Close() - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testRole}, prestaged); err != nil { - t.Fatal(err) - } - prestaged.Status.Endpoints.TendermintRpc = srv.URL - if err := c.Status().Update(context.Background(), prestaged); err != nil { - t.Fatal(err) - } - - res, err := Run(context.Background(), c, Params{ - Role: testRole, - Name: testRole, - TemplatePath: tmplPath, - Vars: vars, - ReadyTimeout: 2 * time.Second, - FirstBlockTimeout: 2 * time.Second, - PollInterval: 10 * time.Millisecond, - HTTPClient: srv.Client(), - Workflow: w, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.Name != testRole || res.ChainID != testChainID { - t.Fatalf("Result: %+v", res) - } - - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("get CM: %v", err) - } - if got := cm.Data["CHAIN_ID"]; got != testChainID { - t.Fatalf("CHAIN_ID = %q", got) - } - if cm.Data["VALIDATOR_TM_RPC"] == "" { - t.Fatalf("VALIDATOR_TM_RPC empty") - } - if cm.Data["VALIDATOR_EVM_RPC"] != "http://evm-0.svc:8545" { - t.Fatalf("VALIDATOR_EVM_RPC = %q (want pod-0 only)", cm.Data["VALIDATOR_EVM_RPC"]) - } - if cm.Data["VALIDATOR_EVM_RPC_LIST"] != "http://evm-0.svc:8545,http://evm-1.svc:8545" { - t.Fatalf("VALIDATOR_EVM_RPC_LIST = %q (want comma-separated all-pod URLs)", cm.Data["VALIDATOR_EVM_RPC_LIST"]) - } -} - -func fakeStatusServer(t *testing.T, height string) *httptest.Server { - t.Helper() - var ( - mu sync.Mutex - calls int - ) - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - mu.Lock() - calls++ - mu.Unlock() - _ = json.NewEncoder(w).Encode(map[string]any{ - "sync_info": map[string]any{ - "latest_block_height": height, - }, - }) - })) - t.Cleanup(srv.Close) - return srv -} - -func TestTendermintStatusResponse_LatestHeight(t *testing.T) { - cases := []struct { - name string - body string - want string - }{ - {"jsonrpc envelope", `{"result":{"sync_info":{"latest_block_height":"42"}}}`, "42"}, - {"bare", `{"sync_info":{"latest_block_height":"7"}}`, "7"}, - {"empty", `{}`, ""}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - var r tendermintStatusResponse - if err := json.Unmarshal([]byte(tc.body), &r); err != nil { - t.Fatal(err) - } - if got := r.latestHeight(); got != tc.want { - t.Fatalf("got %q, want %q", got, tc.want) - } - }) - } -} diff --git a/internal/seitask/uploadreport/upload.go b/internal/seitask/uploadreport/upload.go deleted file mode 100644 index d43e0c94..00000000 --- a/internal/seitask/uploadreport/upload.go +++ /dev/null @@ -1,183 +0,0 @@ -// Package uploadreport implements `seitask upload-report`: collect Workflow -// observability artifacts that Loki doesn't index — workflow-vars CM, the -// parent Workflow CR, the WorkflowNode tree — and upload them to S3 under -// a per-run prefix. -// -// Pod stdout/stderr is NOT uploaded; Alloy + Loki on the cluster already -// ingest every Task pod's logs indexed by chaos-mesh.org/workflow. -// upload-report's job is the K8s resource snapshot Loki can't give you: -// the structural record of what fired and how each step terminated. -// -// Runs as the final step of a scenario. The subcommand's exit code mirrors -// the EXIT_REASON workflow-vars value so the Workflow's terminal phase -// reflects scenario outcome rather than upload-step success. Does NOT -// write EXIT_REASON — upload-report is the terminal observer, not a -// producer of the upstream verdict. -// -// +kubebuilder:rbac:groups=chaos-mesh.org,resources=workflows;workflownodes,verbs=get;list -package uploadreport - -import ( - "bytes" - "context" - "fmt" - "strings" - - "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/service/s3" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/yaml" - - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// S3Uploader is the seam tests inject. Production wires *s3.Client. -type S3Uploader interface { - Put(ctx context.Context, bucket, key string, body []byte) error -} - -// Workflow + WorkflowNode are chaos-mesh.org/v1alpha1 CRs. unstructured so -// the binary doesn't depend on the chaos-mesh Go types for read-only -// artifact collection. -const ( - chaosMeshGroup = "chaos-mesh.org" - chaosMeshVersion = "v1alpha1" - chaosMeshWFLabel = chaosMeshGroup + "/workflow" -) - -var ( - workflowGVK = schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "Workflow"} - workflowNodeGVK = schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "WorkflowNode"} -) - -// Params carries the typed inputs to Run. -type Params struct { - Bucket string - Prefix string // S3 key prefix; leading/trailing slashes are trimmed. - Workflow taskruntime.WorkflowIdentity - - S3 S3Uploader -} - -type Result struct { - UploadedKeys []string - ExitReason taskruntime.ExitReason -} - -// Run uploads workflow-vars + Workflow CR + WorkflowNode tree under -// s3:////. Reads EXIT_REASON from workflow-vars so the -// caller can propagate it into the process's exit code. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if err := validate(p); err != nil { - return Result{}, err - } - prefix := strings.Trim(p.Prefix, "/") - res := Result{ExitReason: taskruntime.ExitReasonPass} - - if err := uploadWorkflowVars(ctx, c, p, prefix, &res); err != nil { - return res, err - } - if err := uploadWorkflowResources(ctx, c, p, prefix, &res); err != nil { - return res, err - } - return res, nil -} - -func validate(p Params) error { - switch { - case p.Bucket == "": - return fmt.Errorf("upload-report: --bucket is required") - case p.Prefix == "": - return fmt.Errorf("upload-report: --prefix is required") - case p.Workflow.Name == "" || p.Workflow.Namespace == "": - return fmt.Errorf("upload-report: workflow identity not loaded") - case p.S3 == nil: - return fmt.Errorf("upload-report: S3Uploader seam is required") - } - return nil -} - -func uploadWorkflowVars(ctx context.Context, c client.Client, p Params, prefix string, res *Result) error { - cm := &corev1.ConfigMap{} - err := c.Get(ctx, types.NamespacedName{Namespace: p.Workflow.Namespace, Name: taskruntime.WorkflowVarsName(p.Workflow.Name)}, cm) - switch { - case apierrors.IsNotFound(err): - // No prior Task initialized the CM — pass, nothing to upload. - return nil - case err != nil: - return taskruntime.Infra(fmt.Errorf("reading workflow-vars: %w", err)) - } - if reason := taskruntime.ExitReason(cm.Data[string(taskruntime.KeyExitReason)]); reason != "" { - res.ExitReason = reason - } - body, err := yaml.Marshal(cm.Data) - if err != nil { - return taskruntime.Infra(fmt.Errorf("marshal workflow-vars: %w", err)) - } - return putAt(ctx, p, prefix+"/workflow-vars.yaml", body, res) -} - -// uploadWorkflowResources serializes the parent Workflow CR + every -// WorkflowNode owned by it. The WorkflowNode tree is the canonical record -// of which Task fired, in what order, and how each terminated — the -// structural artifact Loki can't give you from log queries alone. -func uploadWorkflowResources(ctx context.Context, c client.Client, p Params, prefix string, res *Result) error { - wf := &unstructured.Unstructured{} - wf.SetGroupVersionKind(workflowGVK) - if err := c.Get(ctx, types.NamespacedName{Namespace: p.Workflow.Namespace, Name: p.Workflow.Name}, wf); err != nil { - if apierrors.IsNotFound(err) { - return nil // workflow was deleted before our turn; nothing to upload - } - return taskruntime.Infra(fmt.Errorf("reading Workflow CR: %w", err)) - } - body, err := yaml.Marshal(wf.Object) - if err != nil { - return taskruntime.Infra(fmt.Errorf("marshal Workflow CR: %w", err)) - } - if err := putAt(ctx, p, prefix+"/workflow.yaml", body, res); err != nil { - return err - } - - nodes := &unstructured.UnstructuredList{} - nodes.SetGroupVersionKind(workflowNodeGVK) - if err := c.List(ctx, nodes, - client.InNamespace(p.Workflow.Namespace), - client.MatchingLabels{chaosMeshWFLabel: p.Workflow.Name}, - ); err != nil { - return taskruntime.Infra(fmt.Errorf("listing WorkflowNodes: %w", err)) - } - body, err = yaml.Marshal(nodes.Object) - if err != nil { - return taskruntime.Infra(fmt.Errorf("marshal WorkflowNodes: %w", err)) - } - return putAt(ctx, p, prefix+"/workflownodes.yaml", body, res) -} - -func putAt(ctx context.Context, p Params, key string, body []byte, res *Result) error { - if err := p.S3.Put(ctx, p.Bucket, key, body); err != nil { - return taskruntime.Infra(fmt.Errorf("upload %s: %w", key, err)) - } - res.UploadedKeys = append(res.UploadedKeys, key) - return nil -} - -// NewS3Uploader wraps an *s3.Client as an S3Uploader. -func NewS3Uploader(s3c *s3.Client) S3Uploader { - return &s3Uploader{client: s3c} -} - -type s3Uploader struct{ client *s3.Client } - -func (u *s3Uploader) Put(ctx context.Context, bucket, key string, body []byte) error { - _, err := u.client.PutObject(ctx, &s3.PutObjectInput{ - Bucket: aws.String(bucket), - Key: aws.String(key), - Body: bytes.NewReader(body), - }) - return err -} diff --git a/internal/seitask/uploadreport/upload_test.go b/internal/seitask/uploadreport/upload_test.go deleted file mode 100644 index 1fb4f888..00000000 --- a/internal/seitask/uploadreport/upload_test.go +++ /dev/null @@ -1,229 +0,0 @@ -package uploadreport - -import ( - "context" - "errors" - "strings" - "sync" - "testing" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const ( - testBucket = "harbor-validation-results" - testPrefix = "nightly/release-test/wf-test" - testNamespace = "nightly" - testWorkflowName = "wf-test" -) - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - if err := corev1.AddToScheme(s); err != nil { - t.Fatal(err) - } - return s -} - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -// fakeS3 records every PutObject so tests can assert keys + bodies. -type fakeS3 struct { - mu sync.Mutex - objects map[string][]byte - failOn string // bucket+"/"+key prefix to fail on -} - -func newFakeS3() *fakeS3 { return &fakeS3{objects: map[string][]byte{}} } - -func (s *fakeS3) Put(_ context.Context, bucket, key string, body []byte) error { - s.mu.Lock() - defer s.mu.Unlock() - full := bucket + "/" + key - if s.failOn != "" && strings.HasPrefix(full, s.failOn) { - return errors.New("s3 simulated failure") - } - s.objects[full] = append([]byte(nil), body...) - return nil -} - -func workflowVarsCM(data map[string]string) *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: taskruntime.WorkflowVarsName(testWorkflowName), - Namespace: testNamespace, - }, - Data: data, - } -} - -func workflowCR() *unstructured.Unstructured { - u := &unstructured.Unstructured{} - u.SetGroupVersionKind(schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "Workflow"}) - u.SetName(testWorkflowName) - u.SetNamespace(testNamespace) - u.Object["status"] = map[string]any{"phase": "Succeed"} - return u -} - -func workflowNode(name string) *unstructured.Unstructured { - u := &unstructured.Unstructured{} - u.SetGroupVersionKind(schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "WorkflowNode"}) - u.SetName(name) - u.SetNamespace(testNamespace) - u.SetLabels(map[string]string{chaosMeshWFLabel: testWorkflowName}) - return u -} - -func TestRun_UploadsAllArtifacts(t *testing.T) { - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(workflowVarsCM(map[string]string{ - string(taskruntime.KeyRunID): testWorkflowName, - string(taskruntime.KeyExitReason): string(taskruntime.ExitReasonPass), - })). - WithObjects(workflowCR(), workflowNode("step-1"), workflowNode("step-2")). - Build() - s3 := newFakeS3() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: s3, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonPass { - t.Fatalf("ExitReason = %q", res.ExitReason) - } - for _, k := range []string{ - testBucket + "/" + testPrefix + "/workflow-vars.yaml", - testBucket + "/" + testPrefix + "/workflow.yaml", - testBucket + "/" + testPrefix + "/workflownodes.yaml", - } { - if _, ok := s3.objects[k]; !ok { - t.Fatalf("expected S3 key %q, got %v", k, keysOf(s3)) - } - } -} - -func TestRun_PropagatesEXITReasonInfraFail(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects( - workflowVarsCM(map[string]string{string(taskruntime.KeyExitReason): string(taskruntime.ExitReasonInfraFail)}), - ).Build() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3(), - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonInfraFail { - t.Fatalf("ExitReason = %q", res.ExitReason) - } -} - -func TestRun_PropagatesEXITReasonTaskFail(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects( - workflowVarsCM(map[string]string{string(taskruntime.KeyExitReason): string(taskruntime.ExitReasonTaskFail)}), - ).Build() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3(), - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonTaskFail { - t.Fatalf("ExitReason = %q", res.ExitReason) - } -} - -func TestRun_NoWorkflowVarsCMTreatsAsPass(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3(), - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonPass { - t.Fatalf("ExitReason = %q", res.ExitReason) - } -} - -func TestRun_NormalizesLeadingAndTrailingSlashInPrefix(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(workflowCR()).Build() - s3 := newFakeS3() - - _, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: "/" + testPrefix + "/", Workflow: testWorkflow(), S3: s3, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - want := testBucket + "/" + testPrefix + "/workflow.yaml" - if _, ok := s3.objects[want]; !ok { - t.Fatalf("expected normalized key %q, got %v", want, keysOf(s3)) - } -} - -func TestRun_S3UploadFailureIsInfraError(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(workflowCR()).Build() - s3 := newFakeS3() - s3.failOn = testBucket + "/" + testPrefix - - _, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: s3, - }) - if err == nil { - t.Fatalf("expected error") - } - var infra *taskruntime.InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T: %v", err, err) - } -} - -func TestValidate(t *testing.T) { - full := Params{Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3()} - cases := []struct { - name string - mut func(*Params) - want bool - }{ - {"complete", func(*Params) {}, false}, - {"missing bucket", func(p *Params) { p.Bucket = "" }, true}, - {"missing prefix", func(p *Params) { p.Prefix = "" }, true}, - {"missing workflow", func(p *Params) { p.Workflow.Name = "" }, true}, - {"missing s3 seam", func(p *Params) { p.S3 = nil }, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - p := full - tc.mut(&p) - err := validate(p) - if (err != nil) != tc.want { - t.Fatalf("validate err=%v wantErr=%v", err, tc.want) - } - }) - } -} - -func keysOf(s *fakeS3) []string { - out := make([]string, 0, len(s.objects)) - for k := range s.objects { - out = append(out, k) - } - return out -} diff --git a/internal/taskruntime/cm.go b/internal/taskruntime/cm.go deleted file mode 100644 index ee0d4678..00000000 --- a/internal/taskruntime/cm.go +++ /dev/null @@ -1,84 +0,0 @@ -package taskruntime - -import ( - "context" - "fmt" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// cmFieldOwner identifies this library to server-side-apply conflict detection. -const cmFieldOwner client.FieldOwner = "seitask" - -// EnsureWorkflowVarsCM creates the per-run workflow-vars ConfigMap with an -// ownerRef + optional seed entries. AlreadyExists is treated as success -// (idempotent). Called before any SetVar. -func EnsureWorkflowVarsCM(ctx context.Context, c client.Client, w WorkflowIdentity, seed map[VarKey]string) error { - cm := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: WorkflowVarsName(w.Name), - Namespace: w.Namespace, - OwnerReferences: []metav1.OwnerReference{w.OwnerRef()}, - }, - Data: stringifyKeys(seed), - } - err := c.Create(ctx, cm, cmFieldOwner) - if err == nil { - return nil - } - if apierrors.IsAlreadyExists(err) { - return nil - } - return Infra(fmt.Errorf("creating workflow-vars ConfigMap: %w", err)) -} - -// SetVar writes one key via SetVars. -func SetVar(ctx context.Context, c client.Client, w WorkflowIdentity, key VarKey, value string) error { - return SetVars(ctx, c, w, map[VarKey]string{key: value}) -} - -// SetVars merges multiple keys with MergeFromWithOptimisticLock (matches the -// status-patch discipline in CLAUDE.md). Caller retries on IsConflict if the -// flow is known-racy. -func SetVars(ctx context.Context, c client.Client, w WorkflowIdentity, kv map[VarKey]string) error { - if len(kv) == 0 { - return nil - } - current := &corev1.ConfigMap{} - if err := c.Get(ctx, types.NamespacedName{Namespace: w.Namespace, Name: WorkflowVarsName(w.Name)}, current); err != nil { - return Infra(fmt.Errorf("reading workflow-vars ConfigMap: %w", err)) - } - patch := client.MergeFromWithOptions(current.DeepCopy(), client.MergeFromWithOptimisticLock{}) - if current.Data == nil { - current.Data = map[string]string{} - } - for k, v := range kv { - current.Data[string(k)] = v - } - if err := c.Patch(ctx, current, patch, cmFieldOwner); err != nil { - return Infra(fmt.Errorf("patching workflow-vars ConfigMap: %w", err)) - } - return nil -} - -// WriteExitReason stamps EXIT_REASON from err's classification. Write errors -// are intentionally swallowed — the underlying failure already determined -// the exit code and shouldn't be masked by a CM-write failure. -func WriteExitReason(ctx context.Context, c client.Client, w WorkflowIdentity, err error) { - _ = SetVar(ctx, c, w, KeyExitReason, string(ExitReasonFor(err))) -} - -func stringifyKeys(in map[VarKey]string) map[string]string { - if len(in) == 0 { - return nil - } - out := make(map[string]string, len(in)) - for k, v := range in { - out[string(k)] = v - } - return out -} diff --git a/internal/taskruntime/cm_test.go b/internal/taskruntime/cm_test.go deleted file mode 100644 index a83c8478..00000000 --- a/internal/taskruntime/cm_test.go +++ /dev/null @@ -1,110 +0,0 @@ -package taskruntime - -import ( - "context" - "errors" - "testing" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - if err := corev1.AddToScheme(s); err != nil { - t.Fatal(err) - } - return s -} - -const ( - testNamespace = "nightly" - testWorkflowName = "wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" -) - -func testIdentity() WorkflowIdentity { - return WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -func TestEnsureWorkflowVarsCM_CreatesWithOwnerRef(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testIdentity() - - if err := EnsureWorkflowVarsCM(context.Background(), c, w, map[VarKey]string{KeyRunID: testWorkflowName}); err != nil { - t.Fatalf("Ensure: %v", err) - } - - got := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, got); err != nil { - t.Fatalf("Get: %v", err) - } - if got.Data[string(KeyRunID)] != testWorkflowName { - t.Fatalf("seed not written: %v", got.Data) - } - if len(got.OwnerReferences) != 1 || got.OwnerReferences[0].Kind != workflowKind { - t.Fatalf("ownerRef not stamped: %v", got.OwnerReferences) - } -} - -func TestEnsureWorkflowVarsCM_AlreadyExistsIsNoError(t *testing.T) { - existing := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: testWorkflowVarsCM, Namespace: testNamespace}, - Data: map[string]string{string(KeyRunID): testWorkflowName}, - } - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(existing).Build() - - if err := EnsureWorkflowVarsCM(context.Background(), c, testIdentity(), nil); err != nil { - t.Fatalf("Ensure on existing: %v", err) - } -} - -func TestSetVars_MergesIntoExisting(t *testing.T) { - existing := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: testWorkflowVarsCM, Namespace: testNamespace}, - Data: map[string]string{string(KeyRunID): testWorkflowName}, - } - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(existing).Build() - w := testIdentity() - - if err := SetVars(context.Background(), c, w, map[VarKey]string{ - KeyAdminAddress: "sei1abc", - KeyAdminSecretName: "admin-" + testWorkflowName, - }); err != nil { - t.Fatalf("SetVars: %v", err) - } - - got := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, got); err != nil { - t.Fatalf("Get: %v", err) - } - if got.Data[string(KeyRunID)] != testWorkflowName { - t.Fatalf("existing key clobbered: %v", got.Data) - } - if got.Data[string(KeyAdminAddress)] != "sei1abc" || got.Data[string(KeyAdminSecretName)] != "admin-wf-test" { - t.Fatalf("new keys not merged: %v", got.Data) - } -} - -func TestSetVar_ConfigMapMissingIsInfraError(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - err := SetVar(context.Background(), c, testIdentity(), KeyChainID, "bench-1") - if err == nil { - t.Fatalf("expected error when CM missing") - } - var infra *InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T", err) - } -} - -func TestSetVars_Empty(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - if err := SetVars(context.Background(), c, testIdentity(), nil); err != nil { - t.Fatalf("SetVars(nil): %v", err) - } -} diff --git a/internal/taskruntime/exit.go b/internal/taskruntime/exit.go deleted file mode 100644 index 0b3e3903..00000000 --- a/internal/taskruntime/exit.go +++ /dev/null @@ -1,60 +0,0 @@ -// Package taskruntime is the shared library for seitask subcommands: typed -// exit codes, ownerReference stamping, and workflow-vars ConfigMap helpers. -package taskruntime - -import ( - "errors" - "fmt" -) - -// Exit codes match qa-testing/release-test.ts: 0=pass, 1=task-fail (the work -// failed), 2=infra-fail (couldn't reach a verdict). Chaos Mesh collapses 1 -// and 2 to "Failed"; downstream readers use the EXIT_REASON workflow-vars -// key to recover the distinction. -const ( - ExitPass = 0 - ExitTaskFailure = 1 - ExitInfraError = 2 -) - -// InfraError marks the failure as non-deterministic (API unreachable, -// timeout, malformed input). Bare errors and TaskError map to exit 1. -type InfraError struct{ Err error } - -func (e *InfraError) Error() string { return fmt.Sprintf("infra: %v", e.Err) } -func (e *InfraError) Unwrap() error { return e.Err } - -// Infra wraps err as an InfraError. nil in → nil out. -func Infra(err error) error { - if err == nil { - return nil - } - return &InfraError{Err: err} -} - -// TaskError marks the failure as work-correctness. Wrapping is optional; -// bare errors map to exit 1 too. -type TaskError struct{ Err error } - -func (e *TaskError) Error() string { return fmt.Sprintf("task: %v", e.Err) } -func (e *TaskError) Unwrap() error { return e.Err } - -// Task wraps err as a TaskError. nil in → nil out. -func Task(err error) error { - if err == nil { - return nil - } - return &TaskError{Err: err} -} - -// ExitCodeFor: nil → 0, InfraError → 2, everything else → 1. -func ExitCodeFor(err error) int { - if err == nil { - return ExitPass - } - var infra *InfraError - if errors.As(err, &infra) { - return ExitInfraError - } - return ExitTaskFailure -} diff --git a/internal/taskruntime/exit_test.go b/internal/taskruntime/exit_test.go deleted file mode 100644 index f1e20454..00000000 --- a/internal/taskruntime/exit_test.go +++ /dev/null @@ -1,32 +0,0 @@ -package taskruntime - -import ( - "errors" - "fmt" - "testing" -) - -func TestExitCodeFor(t *testing.T) { - plain := errors.New("plain") - cases := []struct { - name string - err error - want int - }{ - {"nil", nil, ExitPass}, - {"plain", plain, ExitTaskFailure}, - {"task", Task(plain), ExitTaskFailure}, - {"infra", Infra(plain), ExitInfraError}, - {"infra wrapped twice", fmt.Errorf("outer: %w", Infra(plain)), ExitInfraError}, - {"task wrapped twice", fmt.Errorf("outer: %w", Task(plain)), ExitTaskFailure}, - {"Task(nil)", Task(nil), ExitPass}, - {"Infra(nil)", Infra(nil), ExitPass}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - if got := ExitCodeFor(tc.err); got != tc.want { - t.Fatalf("ExitCodeFor(%v) = %d, want %d", tc.err, got, tc.want) - } - }) - } -} diff --git a/internal/taskruntime/ownerref.go b/internal/taskruntime/ownerref.go deleted file mode 100644 index dff3b83a..00000000 --- a/internal/taskruntime/ownerref.go +++ /dev/null @@ -1,94 +0,0 @@ -package taskruntime - -import ( - "context" - "fmt" - "os" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// Identity env contract for Task pods (downward API, projected by the -// scenario YAML's container env block): -// -// SEI_WORKFLOW_NAME ← fieldRef metadata.labels['chaos-mesh.org/workflow'] -// SEI_NAMESPACE ← fieldRef metadata.namespace -// -// The Workflow CR's UID is NOT projectable via downward API — Chaos Mesh -// does not stamp it on Task pods — so we fetch it from the apiserver at -// subcommand startup using NAME + NAMESPACE. SEI_WORKFLOW_UID env, when -// set, short-circuits the lookup; tests use it. -const ( - EnvWorkflowName = "SEI_WORKFLOW_NAME" - EnvWorkflowUID = "SEI_WORKFLOW_UID" - EnvNamespace = "SEI_NAMESPACE" - - workflowAPIVersion = "chaos-mesh.org/v1alpha1" - workflowKind = "Workflow" -) - -var workflowGVK = schema.GroupVersionKind{Group: "chaos-mesh.org", Version: "v1alpha1", Kind: workflowKind} - -// WorkflowIdentity is the parent Workflow CR's identity, read once at -// subcommand startup. -type WorkflowIdentity struct { - Name string - UID types.UID - Namespace string -} - -// LoadWorkflowIdentity reads NAME + NAMESPACE from env (downward-API -// projected on each Task pod), then fetches the Workflow CR's UID from -// the apiserver. SEI_WORKFLOW_UID env short-circuits the round-trip. -func LoadWorkflowIdentity(ctx context.Context, c client.Client) (WorkflowIdentity, error) { - name := os.Getenv(EnvWorkflowName) - ns := os.Getenv(EnvNamespace) - missing := []string{} - if name == "" { - missing = append(missing, EnvWorkflowName) - } - if ns == "" { - missing = append(missing, EnvNamespace) - } - if len(missing) > 0 { - return WorkflowIdentity{}, Infra(fmt.Errorf("downward-API env not projected: %v", missing)) - } - if uid := os.Getenv(EnvWorkflowUID); uid != "" { - return WorkflowIdentity{Name: name, UID: types.UID(uid), Namespace: ns}, nil - } - wf := &unstructured.Unstructured{} - wf.SetGroupVersionKind(workflowGVK) - if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, wf); err != nil { - return WorkflowIdentity{}, Infra(fmt.Errorf("fetching Workflow %s/%s for UID: %w", ns, name, err)) - } - uid := wf.GetUID() - if uid == "" { - return WorkflowIdentity{}, Infra(fmt.Errorf("workflow %s/%s exists but has no UID", ns, name)) - } - return WorkflowIdentity{Name: name, UID: uid, Namespace: ns}, nil -} - -// OwnerRef returns an ownerReference to the parent Workflow CR. Controller -// is explicit-false (Chaos Mesh manages Workflow children only via -// WorkflowNodes); BlockOwnerDeletion is explicit-false so cleanup doesn't -// stall on slow Task children. -func (w WorkflowIdentity) OwnerRef() metav1.OwnerReference { - return metav1.OwnerReference{ - APIVersion: workflowAPIVersion, - Kind: workflowKind, - Name: w.Name, - UID: w.UID, - Controller: new(bool), - BlockOwnerDeletion: new(bool), - } -} - -// WorkflowVarsName returns the per-run workflow-vars ConfigMap name. -// Single-sourced so producers and consumers don't drift. -func WorkflowVarsName(workflowName string) string { - return "workflow-vars-" + workflowName -} diff --git a/internal/taskruntime/ownerref_test.go b/internal/taskruntime/ownerref_test.go deleted file mode 100644 index d54df7f4..00000000 --- a/internal/taskruntime/ownerref_test.go +++ /dev/null @@ -1,102 +0,0 @@ -package taskruntime - -import ( - "context" - "errors" - "testing" - - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestLoadWorkflowIdentity(t *testing.T) { - t.Run("env-short-circuit (UID set)", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "release-test-abc") - t.Setenv(EnvWorkflowUID, "uid-xyz") - t.Setenv(EnvNamespace, testNamespace) - - // Fake client w/ no Workflow CR — env UID should short-circuit - // the apiserver lookup, so this should still succeed. - c := fake.NewClientBuilder().Build() - w, err := LoadWorkflowIdentity(context.Background(), c) - if err != nil { - t.Fatalf("LoadWorkflowIdentity: %v", err) - } - if w.Name != "release-test-abc" || string(w.UID) != "uid-xyz" || w.Namespace != testNamespace { - t.Fatalf("got %+v", w) - } - }) - - t.Run("apiserver-lookup (UID env empty)", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "release-test-abc") - t.Setenv(EnvWorkflowUID, "") - t.Setenv(EnvNamespace, testNamespace) - - wf := &unstructured.Unstructured{} - wf.SetGroupVersionKind(workflowGVK) - wf.SetName("release-test-abc") - wf.SetNamespace(testNamespace) - wf.SetUID("uid-from-apiserver") - - scheme := runtime.NewScheme() - c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(wf).Build() - w, err := LoadWorkflowIdentity(context.Background(), c) - if err != nil { - t.Fatalf("LoadWorkflowIdentity: %v", err) - } - if string(w.UID) != "uid-from-apiserver" { - t.Fatalf("expected UID from apiserver lookup, got %q", w.UID) - } - }) - - t.Run("missing name", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "") - t.Setenv(EnvNamespace, testNamespace) - c := fake.NewClientBuilder().Build() - _, err := LoadWorkflowIdentity(context.Background(), c) - var infra *InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T: %v", err, err) - } - }) - - t.Run("workflow CR not found", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "missing-workflow") - t.Setenv(EnvWorkflowUID, "") - t.Setenv(EnvNamespace, testNamespace) - - scheme := runtime.NewScheme() - c := fake.NewClientBuilder().WithScheme(scheme).Build() - _, err := LoadWorkflowIdentity(context.Background(), c) - var infra *InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T: %v", err, err) - } - }) -} - -func TestOwnerRef(t *testing.T) { - w := WorkflowIdentity{Name: "wf", UID: "uid", Namespace: "ns"} - ref := w.OwnerRef() - if ref.APIVersion != "chaos-mesh.org/v1alpha1" || ref.Kind != workflowKind { - t.Fatalf("wrong target: %+v", ref) - } - if ref.Name != "wf" || string(ref.UID) != "uid" { - t.Fatalf("wrong identity: %+v", ref) - } - if ref.Controller == nil || *ref.Controller { - t.Fatalf("Controller should be explicit false; got %+v", ref.Controller) - } - if ref.BlockOwnerDeletion == nil || *ref.BlockOwnerDeletion { - t.Fatalf("BlockOwnerDeletion should be explicit false; got %+v", ref.BlockOwnerDeletion) - } -} - -func TestWorkflowVarsName(t *testing.T) { - got := WorkflowVarsName("major-upgrade-20260520-184443") - want := "workflow-vars-major-upgrade-20260520-184443" - if got != want { - t.Fatalf("got %q, want %q", got, want) - } -} diff --git a/internal/taskruntime/scenarios_test.go b/internal/taskruntime/scenarios_test.go deleted file mode 100644 index 58a7e252..00000000 --- a/internal/taskruntime/scenarios_test.go +++ /dev/null @@ -1,96 +0,0 @@ -package taskruntime - -import ( - "os" - "path/filepath" - "strings" - "testing" - - "sigs.k8s.io/yaml" -) - -// TestScenarioYAMLs_CMNameMatchesWorkflowVarsName guards the contract -// surfaced by sei-protocol/sei-k8s-controller#337: scenario YAML's -// envFrom configMapRef.name MUST match what WorkflowVarsName produces -// for the scenario's own Workflow CR name. The first manual fire of -// release-test stuck a Task pod in CreateContainerConfigError for ~8m -// because the scenario referenced workflow-vars- but keygen -// created workflow-vars-. -// -// Scenarios using the seitask binary's typed CM helpers must opt in to -// this test by appearing in `scenariosToCheck` below. Bash-driven -// scenarios (e.g., major-upgrade.yaml today) are excluded — they -// create the CM via kubectl with their own naming convention. -func TestScenarioYAMLs_CMNameMatchesWorkflowVarsName(t *testing.T) { - scenariosDir, err := filepath.Abs("../../scenarios") - if err != nil { - t.Fatal(err) - } - scenariosToCheck := []string{"release-test.yaml", "load-test.yaml"} - - for _, name := range scenariosToCheck { - t.Run(name, func(t *testing.T) { - raw, err := os.ReadFile(filepath.Join(scenariosDir, name)) - if err != nil { - t.Fatal(err) - } - workflowName := workflowMetaName(t, raw) - wantCMName := WorkflowVarsName(workflowName) - cmRefs := configMapRefNames(t, raw) - if len(cmRefs) == 0 { - t.Fatalf("no envFrom configMapRef names found — scenario isn't exercising the bridge") - } - for i, got := range cmRefs { - if got != wantCMName { - t.Errorf("configMapRef[%d].name = %q; want %q (workflow %q)", i, got, wantCMName, workflowName) - } - } - }) - } -} - -// workflowMetaName extracts the first `kind: Workflow` document's -// metadata.name from a multi-doc scenario YAML. -func workflowMetaName(t *testing.T, raw []byte) string { - t.Helper() - for doc := range strings.SplitSeq(string(raw), "\n---\n") { - var head struct { - Kind string `json:"kind"` - Metadata struct { - Name string `json:"name"` - } `json:"metadata"` - } - if err := yaml.Unmarshal([]byte(doc), &head); err != nil { - continue - } - if head.Kind == "Workflow" && head.Metadata.Name != "" { - return head.Metadata.Name - } - } - t.Fatal("no kind=Workflow document with metadata.name found") - return "" -} - -// configMapRefNames pulls every envFrom configMapRef.name in the scenario. -// Walks the Workflow.spec.templates[].task.container.envFrom path; using a -// regex over the raw YAML keeps the test independent of the chaos-mesh -// Go types. -func configMapRefNames(t *testing.T, raw []byte) []string { - t.Helper() - var names []string - lines := strings.Split(string(raw), "\n") - for i, line := range lines { - if !strings.Contains(line, "configMapRef:") { - continue - } - // Next non-blank line should be ` name: ` at deeper indent. - for j := i + 1; j < len(lines) && j < i+4; j++ { - trimmed := strings.TrimSpace(lines[j]) - if rest, ok := strings.CutPrefix(trimmed, "name:"); ok { - names = append(names, strings.TrimSpace(rest)) - break - } - } - } - return names -} diff --git a/internal/taskruntime/vars.go b/internal/taskruntime/vars.go deleted file mode 100644 index e1766d80..00000000 --- a/internal/taskruntime/vars.go +++ /dev/null @@ -1,64 +0,0 @@ -package taskruntime - -import "strings" - -// VarKey is a typed key for the workflow-vars ConfigMap. Producers and -// consumers reference these constants so renames are compile errors. Schema + -// stability discipline: https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/test-harness/test-harness-lld.md. -type VarKey string - -const ( - // KeyRunID — Workflow CR's metadata.name. Written by the initializing Task. - KeyRunID VarKey = "RUN_ID" - - // KeyChainID — the SeiNetwork's chainId. One-way door. - KeyChainID VarKey = "CHAIN_ID" - - // Endpoints — written by provision-snd after the SeiNetwork is Ready. - // KeyEVMJSONRPC is pod-0 only (release-test pins stateful EVM - // sequences to one pod). KeyEVMJSONRPCList is comma-separated - // per-pod URLs for seiload, whose stateful EVM workload needs to - // hit all RPC pods. - KeyTendermintRPC VarKey = "TM_RPC" - KeyTendermintREST VarKey = "REST" - KeyEVMJSONRPC VarKey = "EVM_RPC" - KeyEVMJSONRPCList VarKey = "EVM_RPC_LIST" - - // Admin identity — written by keygen. Mnemonic itself lives in the - // referenced Secret, not the ConfigMap. - KeyAdminAddress VarKey = "ADMIN_ADDRESS" - KeyAdminSecretName VarKey = "ADMIN_SECRET_NAME" - - // KeyExitReason — written by the failing Task pre-exit. upload-report - // reads this to recover the exit-code class Chaos Mesh collapses. - KeyExitReason VarKey = "EXIT_REASON" -) - -// ExitReason is the string mirror of ExitCodeFor for the EXIT_REASON CM value. -type ExitReason string - -const ( - ExitReasonPass ExitReason = "pass" - ExitReasonTaskFail ExitReason = "task-fail" - ExitReasonInfraFail ExitReason = "infra-fail" -) - -// ExitReasonFor mirrors ExitCodeFor: nil → pass, InfraError → infra-fail, -// otherwise → task-fail. -func ExitReasonFor(err error) ExitReason { - switch ExitCodeFor(err) { - case ExitPass: - return ExitReasonPass - case ExitInfraError: - return ExitReasonInfraFail - default: - return ExitReasonTaskFail - } -} - -// RoleScoped prefixes a key with an upper-cased role tag so scenarios with -// multiple SeiNetworks (validator + rpc) write disjoint workflow-vars keys. -// RoleScoped("validator", KeyTendermintRPC) → "VALIDATOR_TM_RPC". -func RoleScoped(role string, key VarKey) VarKey { - return VarKey(strings.ToUpper(role) + "_" + string(key)) -} diff --git a/internal/taskruntime/vars_test.go b/internal/taskruntime/vars_test.go deleted file mode 100644 index 2a090217..00000000 --- a/internal/taskruntime/vars_test.go +++ /dev/null @@ -1,46 +0,0 @@ -package taskruntime - -import ( - "errors" - "testing" -) - -func TestRoleScoped(t *testing.T) { - cases := []struct { - role string - key VarKey - want VarKey - }{ - {"validator", KeyTendermintRPC, "VALIDATOR_TM_RPC"}, - {"rpc", KeyEVMJSONRPC, "RPC_EVM_RPC"}, - {"Validator", KeyChainID, "VALIDATOR_CHAIN_ID"}, - } - for _, tc := range cases { - t.Run(string(tc.want), func(t *testing.T) { - if got := RoleScoped(tc.role, tc.key); got != tc.want { - t.Fatalf("RoleScoped(%q, %q) = %q, want %q", tc.role, tc.key, got, tc.want) - } - }) - } -} - -func TestExitReasonFor(t *testing.T) { - plain := errors.New("plain") - cases := []struct { - name string - err error - want ExitReason - }{ - {"nil", nil, ExitReasonPass}, - {"plain", plain, ExitReasonTaskFail}, - {"task", Task(plain), ExitReasonTaskFail}, - {"infra", Infra(plain), ExitReasonInfraFail}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - if got := ExitReasonFor(tc.err); got != tc.want { - t.Fatalf("ExitReasonFor(%v) = %s, want %s", tc.err, got, tc.want) - } - }) - } -} diff --git a/manifests/role.yaml b/manifests/role.yaml index 9dccc620..76cd7fd3 100644 --- a/manifests/role.yaml +++ b/manifests/role.yaml @@ -67,14 +67,6 @@ rules: - patch - update - watch -- apiGroups: - - chaos-mesh.org - resources: - - workflownodes - - workflows - verbs: - - get - - list - apiGroups: - sei.io resources: diff --git a/manifests/sei.io_seinodetasks.yaml b/manifests/sei.io_seinodetasks.yaml index 351caca6..4f595fc6 100644 --- a/manifests/sei.io_seinodetasks.yaml +++ b/manifests/sei.io_seinodetasks.yaml @@ -349,7 +349,7 @@ spec: description: |- Target identifies the single SeiNode this task operates on. Fan-out targeting (label selectors) is intentionally out of scope at the CRD - layer — express fan-out at the seitask-runner / Chaos Workflow layer. + layer — express fan-out in the orchestrating caller (one task per node). properties: nodeRef: description: NodeRef is a same-namespace reference to a SeiNode. diff --git a/runner/rbac.yaml b/runner/rbac.yaml deleted file mode 100644 index ee6709ef..00000000 --- a/runner/rbac.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# RBAC for seitask-runner — apply alongside each Chaos Mesh Workflow that uses -# the runner image. The Workflow's `task.container.serviceAccountName` must -# reference this ServiceAccount. -# -# Namespaced (Role, not ClusterRole) — the runner only operates on resources -# in the same namespace as the Workflow. -# -# The `configmaps` verbs cover the PR 6 cross-step variable bridge: bash -# steps (compute-target-height, resolve-proposal-id) upsert -# `workflow-vars-` via `kubectl apply`, and every other step reads -# values via `envFrom: configMapRef`. The kubelet itself needs no extra -# RBAC for the envFrom read — it uses kubelet credentials, not the -# workload SA — but the producer steps run `kubectl get/create/apply` -# under the workload SA and therefore require these verbs. ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: seitask-runner ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: seitask-runner -rules: -# SeiNodeTask: create + read + SSA-patch (apply uses PATCH with -# application/apply-patch+yaml, which requires the `patch` verb in addition -# to `create` — `create` alone 403s on the second invocation). -- apiGroups: ["sei.io"] - resources: ["seinodetasks"] - verbs: ["create", "get", "list", "watch", "patch", "update"] -- apiGroups: ["sei.io"] - resources: ["seinodetasks/status"] - verbs: ["get"] -# SeiNode: create + read. create is required by provision-node, whose first -# act is c.Create(SeiNode) to fan out N standalone follower CRs; without it -# every provision-node step 403s. get/list/watch back the --per-node-selector -# fan-out mode and the post-create wait-for-Running loop (.status.phase / -# .status.endpoint are read off the main object, gated by seinodes get). -- apiGroups: ["sei.io"] - resources: ["seinodes"] - verbs: ["create", "get", "list", "watch"] -# seinodes/status get is a forward-compat over-grant, NOT currently exercised: -# provision-node reads .status off the main object (seinodes get), not the -# subresource. Kept to match the /status pattern above and a future -# c.Status() read. -- apiGroups: ["sei.io"] - resources: ["seinodes/status"] - verbs: ["get"] -# SeiNetwork: create + read for provision-snd. Polls .status.phase -# until Ready and reads .status.endpoints to publish role-scoped TM/REST/ -# EVM URLs into workflow-vars. patch covers the major-upgrade bump-snd-image -# step, which `kubectl patch --type=merge`es spec.image to roll -# all validators onto the post-upgrade binary in a single write. -- apiGroups: ["sei.io"] - resources: ["seinetworks"] - verbs: ["create", "get", "list", "watch", "patch"] -- apiGroups: ["sei.io"] - resources: ["seinetworks/status"] - verbs: ["get"] -# Secrets: get + create for keygen, which writes the per-run admin -# mnemonic Secret. Downstream Tasks consume it via secretKeyRef on the -# Pod's env, which the kubelet handles under its own credentials. -- apiGroups: [""] - resources: ["secrets"] - verbs: ["get", "create"] -# ConfigMaps: workflow-vars- upsert by producer bash steps. -# `create`+`patch` are the apply-path verbs; `get`+`list`+`watch` support -# read-modify-write (resolve-proposal-id merges PROPOSAL_ID into the -# existing object); `update` covers the non-SSA apply path some kubectl -# versions emit. -- apiGroups: [""] - resources: ["configmaps"] - verbs: ["get", "list", "watch", "create", "patch", "update"] -# Chaos Mesh Workflow + WorkflowNode: read-only. -# workflows.get: LoadWorkflowIdentity fetches the Workflow CR's UID for -# ownerRef stamping (Chaos Mesh doesn't project UID via downward API). -# workflownodes.{get,list}: upload-report enumerates the per-step -# WorkflowNode tree as part of the S3 snapshot. -- apiGroups: ["chaos-mesh.org"] - resources: ["workflows", "workflownodes"] - verbs: ["get", "list"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: seitask-runner -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: seitask-runner -subjects: -- kind: ServiceAccount - name: seitask-runner diff --git a/runner/templates/await-condition.yaml.tmpl b/runner/templates/await-condition.yaml.tmpl deleted file mode 100644 index 2d936112..00000000 --- a/runner/templates/await-condition.yaml.tmpl +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: AwaitCondition - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}900{{ end }} - awaitCondition: - height: - targetHeight: {{ .TARGET_HEIGHT }} - {{- with index . "ACTION" }} - action: {{ . }} - {{- end }} diff --git a/runner/templates/await-nodes-at-height.yaml.tmpl b/runner/templates/await-nodes-at-height.yaml.tmpl deleted file mode 100644 index b541d694..00000000 --- a/runner/templates/await-nodes-at-height.yaml.tmpl +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: AwaitNodesAtHeight - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}1800{{ end }} - awaitNodesAtHeight: - targetHeight: {{ .TARGET_HEIGHT }} diff --git a/runner/templates/gov-software-upgrade.yaml.tmpl b/runner/templates/gov-software-upgrade.yaml.tmpl deleted file mode 100644 index a1e581a7..00000000 --- a/runner/templates/gov-software-upgrade.yaml.tmpl +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - # name is rewritten to a deterministic value by the runner; this placeholder - # is here only to keep the manifest schema-valid pre-rewrite. - name: PLACEHOLDER -spec: - kind: GovSoftwareUpgrade - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}600{{ end }} - govSoftwareUpgrade: - chainId: {{ .CHAIN_ID }} - {{- with index . "KEY_NAME" }} - keyName: {{ . }} - {{- end }} - title: {{ .TITLE | printf "%q" }} - description: {{ .DESCRIPTION | printf "%q" }} - upgradeName: {{ .UPGRADE_NAME }} - upgradeHeight: {{ .UPGRADE_HEIGHT }} - {{- with index . "UPGRADE_INFO" }} - upgradeInfo: {{ . | printf "%q" }} - {{- end }} - initialDeposit: {{ .INITIAL_DEPOSIT }} - fees: {{ .FEES }} - gas: {{ .GAS }} diff --git a/runner/templates/gov-vote.yaml.tmpl b/runner/templates/gov-vote.yaml.tmpl deleted file mode 100644 index 3111b977..00000000 --- a/runner/templates/gov-vote.yaml.tmpl +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: GovVote - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}300{{ end }} - govVote: - chainId: {{ .CHAIN_ID }} - {{- with index . "KEY_NAME" }} - keyName: {{ . }} - {{- end }} - proposalId: {{ .PROPOSAL_ID }} - option: "{{ .OPTION }}" - fees: {{ .FEES }} - gas: {{ .GAS }} diff --git a/runner/templates/update-node-image.yaml.tmpl b/runner/templates/update-node-image.yaml.tmpl deleted file mode 100644 index e6ce3ec9..00000000 --- a/runner/templates/update-node-image.yaml.tmpl +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: UpdateNodeImage - target: - nodeRef: - name: {{ .NODE }} - # Pre-upgrade nodes may be Pending or CrashLooping; relax the default - # Running-only requirement unless the operator overrides REQUIRE_PHASE. - requirePhase: {{ with index . "REQUIRE_PHASE" }}{{ . }}{{ else }}Running{{ end }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}1800{{ end }} - updateNodeImage: - image: {{ .IMAGE }} diff --git a/scenarios/README.md b/scenarios/README.md deleted file mode 100644 index 8cf2ea65..00000000 --- a/scenarios/README.md +++ /dev/null @@ -1,264 +0,0 @@ -# Scenarios - -End-to-end Chaos Mesh Workflows that compose `SeiNodeTask` CRs to exercise -chain-lifecycle behavior against real Kubernetes clusters. Each scenario is -the acceptance test for one capability surface. - -> **Status: runnable, gated on runner image publishing.** PR 6 closed the -> cross-step variable bridge gap via a per-Workflow-run ConfigMap -> (`workflow-vars-`). The bash steps that compute `TARGET_HEIGHT` -> and resolve `PROPOSAL_ID` `kubectl apply` the ConfigMap; every other -> step reads values via `envFrom`. End-to-end runs require a published -> `seitask-runner` image (see Prerequisites, item 4) -- everything else -> is in-tree. - -## Index - -| File | Mirrors | Purpose | -|---|---|---| -| `major-upgrade.yaml` | `sei-chain/integration_test/upgrade_module/major_upgrade_test.yaml` | 4-validator software-upgrade flow: gov proposal, vote, then a single SeiNetwork image bump that rolls all validators onto the new binary at the upgrade height. MVP acceptance for the SeiNodeTask CRD. | -| `testnet-deployment.yaml` | n/a | Reference 4-validator `SeiNetwork` the Workflow can target. | - -## Where this runs - -These scenarios are **destructive**. They submit governance proposals, mutate -SeiNode images, and drive validators through CrashLoop states. They are -designed for: - -- The **harbor dev cluster** (`harbor-dev` EKS). Ephemeral testnets only. -- Local `kind`/`minikube` clusters with the controller installed. -- **Not** any cluster carrying a chain you care about. - -The Workflow does not provision the chain. It assumes a 4-validator -`SeiNetwork` exists in the target namespace before the Workflow -applies. See "Run" below. - -## Prerequisites - -1. **CRDs installed** in the target cluster: - - `seinetworks.sei.io` - - `seinodes.sei.io` - - `seinodetasks.sei.io` - - ```bash - kubectl apply -f config/crd/ - ``` - -2. **Controller running** in `sei-k8s-controller-system` (or wherever the - platform repo installs it) and watching the target namespace. - -3. **Chaos Mesh installed** in the cluster (2.5+ verified). The dev cluster - already ships this via `platform/clusters/dev/chaos-mesh`. - -4. **`seitask-runner` image published** to a registry the cluster can pull - from. As of PR 5 the runner image is **not yet auto-published** by the - `ecr.yml` GitHub Action (which only builds the controller). Until that - workflow is extended, the operator must build and push manually: - - ```bash - make runner-image RUNNER_IMG=/sei/seitask-runner: - make runner-push RUNNER_IMG=/sei/seitask-runner: - ``` - - The image bundles the per-kind templates at `/templates/`; no ConfigMap - override is required for the in-tree scenarios. - -5. **RBAC for the runner ServiceAccount.** `runner/rbac.yaml` defines the - `seitask-runner` ServiceAccount + Role + RoleBinding. Apply it to the - namespace where the Workflow will run. - - Chaos Mesh `Task.container` does NOT expose `serviceAccountName` -- the - synthesized Workflow pod uses the namespace's `default` ServiceAccount. - You therefore need to either (a) bind the `seitask-runner` Role to the - `default` SA in the test namespace, or (b) use the - `chaos-mesh.org/inject-serviceaccount` annotation if your Chaos Mesh - build includes the SA-injection webhook (2.6+ optional component). - - Recommended: bind to `default` SA in the ephemeral namespace. - - ```bash - kubectl apply -n -f runner/rbac.yaml - kubectl create rolebinding seitask-runner-default \ - --role=seitask-runner --serviceaccount=:default \ - -n - ``` - -## Run: major-upgrade - -### 1. Apply the reference testnet - -```bash -kubectl apply -f scenarios/testnet-deployment.yaml -# wait for status.replicas == status.readyReplicas == 4 -kubectl -n majorupgrade wait --for=condition=NodesReady=true \ - --timeout=15m seinetwork/majorupgrade -# spot-check the validator phases -kubectl -n majorupgrade get seinodes -# expected: majorupgrade-0 .. majorupgrade-3, all Running. -``` - -### 2. Render and apply the Workflow - -The Workflow YAML uses `envsubst`-style placeholders so the same file works -across upgrade targets. Substitute and apply: - -```bash -export SEI_DEPLOYMENT=majorupgrade -export SEI_NAMESPACE=majorupgrade -export SEI_CHAIN_ID=majorupgrade-1 -export SEI_PRE_UPGRADE_IMG=ghcr.io/sei-protocol/sei:v6.3.0 -export SEI_POST_UPGRADE_IMG=ghcr.io/sei-protocol/sei:v6.4.0 -export SEI_UPGRADE_NAME=v6.4.0 -export SEITASK_RUNNER_IMG=/sei/seitask-runner: -# Unique per Workflow run -- drives the `workflow-vars-` ConfigMap -# name. Two concurrent runs of the same Workflow must not collide. -export SEI_WORKFLOW_RUN_ID="$(date +%s)-$(openssl rand -hex 3)" - -envsubst < scenarios/major-upgrade.yaml \ - | kubectl apply -n "${SEI_NAMESPACE}" -f - -``` - -`envsubst` is from the `gettext` package (preinstalled on most Linux distros; -`brew install gettext` on macOS). For Flux/ArgoCD-managed deployments, replace -the envsubst step with a kustomize patch or `flux create kustomization ---substitute-from=secret/...`. - -### 3. Watch progress - -```bash -# Workflow node states -kubectl -n majorupgrade get workflownodes -l workflow=major-upgrade -# top-level -kubectl -n majorupgrade describe workflow major-upgrade -# tail step containers as they spin up -kubectl -n majorupgrade get pods -l chaos-mesh.org/workflow=major-upgrade -w -``` - -### 4. Interpret results - -Each step exits 0 (PASS) or 1 (FAIL). Chaos Mesh records terminal status on -the corresponding `WorkflowNode`. The Workflow itself is `Succeeded` when -every step in the entry Serial path completed; `Failed` when any required -step (no `conditionalBranches` override) failed. - -Per-step interpretation: - -| Step | What success means | -|---|---| -| `compute-target-height` | Created `workflow-vars-${SEI_WORKFLOW_RUN_ID}` ConfigMap with `TARGET_HEIGHT` / `UPGRADE_HEIGHT` / `POST_UPGRADE_HEIGHT`. | -| `submit-upgrade-proposal` | SeiNodeTask `.status.phase=Complete`. proposalId is NOT extracted here (sidecar structured outputs are intentionally empty post-PR 3); `resolve-proposal-id` derives it from the chain. | -| `resolve-proposal-id` | Polled gov REST for a voting-period proposal whose plan name matches `$SEI_UPGRADE_NAME`, merged `PROPOSAL_ID` into the workflow-vars ConfigMap. | -| `vote-yes-all-validators` | All 4 vote tasks Complete. | -| `wait-for-proposal-to-pass` | Proposal observed `PROPOSAL_STATUS_PASSED`. | -| `bump-snd-image` | `kubectl patch seinetwork` set `spec.image` to the post-upgrade build. The SeiNetwork controller re-asserts the image onto every child and rolls all validators onto the new binary. | -| `await-post-upgrade-progress` | Post-upgrade height-advance check: each of nodes 0/1/2/3 advanced past `POST_UPGRADE_HEIGHT` (= `TARGET_HEIGHT + 10`) via AwaitCondition. This is the liveness assertion -- a node that crosses the boundary has survived the upgrade. | - -### 5. Cleanup - -```bash -kubectl delete workflow -n majorupgrade major-upgrade -# Workflow does NOT delete the SeiNodeTask CRs it created (intentional -- -# you want them visible for post-mortem). Remove them explicitly: -kubectl delete seinodetasks -n majorupgrade --all -# Per-run ConfigMaps (labeled with sei.io/workflow-run) accumulate across -# runs. The Workflow does not garbage-collect them; an operator clears -# them out by label: -kubectl delete configmap -n majorupgrade -l sei.io/workflow-run -# or for a single run: -kubectl delete configmap -n majorupgrade "workflow-vars-${SEI_WORKFLOW_RUN_ID}" -# Tear down the testnet: -kubectl delete -f scenarios/testnet-deployment.yaml -``` - -## Cross-step variable bridge (PR 6) - -Chaos Mesh Workflow Task steps are each their own Pod, so emptyDir -volumes cannot carry state across steps. The bridge is a per-run -ConfigMap named `workflow-vars-${SEI_WORKFLOW_RUN_ID}` in the same -namespace as the Workflow: - -- **Producer steps** (`compute-target-height`, `resolve-proposal-id`) - use `alpine/k8s` (curl + kubectl + jq) to compute or query values and - `kubectl apply` the ConfigMap. `compute-target-height` creates it with - `--from-literal` x4 and labels it `sei.io/workflow-run`; later - producers read-modify-write via `kubectl get -o json | jq | apply`. - -- **Consumer steps** receive every key as a container env var via - `envFrom: configMapRef: name: workflow-vars-$SEI_WORKFLOW_RUN_ID`. The - runner's `--var KEY=$(KEY)` arguments use the Kubernetes container env - expansion (`$(VAR)`), which kubelet resolves against the env at - container start. The runner sees concrete `--var KEY=` strings - and no longer needs to source any file. - -- **Concurrency:** the ConfigMap name is parameterized on - `$SEI_WORKFLOW_RUN_ID`, which the operator generates at apply time - (see the `export SEI_WORKFLOW_RUN_ID=...` line above). Two concurrent - runs of the same Workflow get distinct ConfigMaps. - **Caveat:** `resolve-proposal-id` filters voting-period proposals by - `content.plan.name` (= `$SEI_UPGRADE_NAME`). Running two concurrent - scenarios on the **same chain** with the **same upgrade name** lets - either run resolve to whichever proposal sorts first. Use a distinct - `$SEI_UPGRADE_NAME` per concurrent run, or treat the chain as serially - owned by one scenario at a time. - -- **Cleanup:** the ConfigMap carries an `ownerReference` pointing at the - parent Workflow CR (`major-upgrade-$SEI_WORKFLOW_RUN_ID`). Deleting the - Workflow cascades garbage-collection of the ConfigMap automatically - via kube-controller-manager. Operators can still clean up by label - (`-l sei.io/workflow-run`) if multiple Workflows are torn down at once. - -## Known limitations / deferred capability - -1. **Liveness via post-upgrade height-advance check only.** This Workflow - does not assert pre-upgrade running state, does not detect the panic - directly (no RPC-down / stuck-at-`TARGET_HEIGHT-1` polling), and does - not assert the `UPGRADE "" NEEDED` log line that the source - `major_upgrade_test.yaml` greps for. The post-upgrade height-advance - check (each upgraded node advances past `TARGET_HEIGHT + 10` via - AwaitCondition) is the actual liveness signal -- a node that crosses - that boundary has by construction survived the upgrade. Explicit panic - detection and log-line assertions are future SeiNodeTask kinds - (`AssertLogContains`, `AwaitCondition` with a `panicked` predicate) - that no current scenario actually requires. - -2. **No chain-query task kind for proposals.** `compute-target-height`, - `resolve-proposal-id`, and `wait-for-proposal-to-pass` are bash + - curl against the per-pod headless Service RPC/REST. The right - primitive is an `AwaitCondition` extension with `proposalStatus` / - `proposalIdByPlanName` / `heightAdvancing` predicates that emits the - resolved value to the standard outputs path. Migrating those three - steps to a structured kind also lets us delete the `configmaps` RBAC - verbs (only the runner's outputs ConfigMap-write would remain). - -3. **Upgrade rolls the whole fleet, not staggered per-node.** This - Workflow bumps the SeiNetwork spec.image once and lets the SeiNetwork controller - roll all validators together. It does NOT exercise the staggered - early-upgrade-one-node-then-the-rest path the source - `major_upgrade_test.yaml` does. Per-child `UpdateNodeImage` against a - SeiNetwork-owned node fights the controller's spec.image re-assertion (the child - image flip-flops, the StatefulSet churns, `observe-image` never settles), - so staggered rollout needs a different primitive (e.g. SeiNetwork-level - partition/maxUnavailable) before it can return. - -4. **The runner image is not yet auto-published.** Add a `runner` step to - `.github/workflows/ecr.yml` once this scenario is wired into a CI job. - -5. **Argo Workflows migration is still on the long-term roadmap.** The - ConfigMap bridge is the MVP. Argo's `outputs.parameters` / - `inputs.parameters` is more ergonomic and avoids the per-run - ConfigMap garbage. Plan that migration once we have more than one - scenario worth porting. - -6. **No fan-out from a single step.** The 4-vote step is hard-coded to - 4 children rather than `--per-node-selector=role=validator`. We could - collapse the four `vote-node-*` templates into one fan-out runner if - the SeiNodes carry a consistent label, but the explicit per-node form - is easier to diagnose in `kubectl describe workflownode` output. - -## References - -- `https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/seinode-task/seinode-task-lld.md` -- the canonical interface contract. -- `runner/rbac.yaml` -- RBAC the workflow expects on its ServiceAccount. -- `runner/templates/*.yaml.tmpl` -- the templates the runner ships. -- `sei-chain/integration_test/upgrade_module/major_upgrade_test.yaml` -- - the north-star scenario this Workflow replicates. diff --git a/scenarios/load-test.yaml b/scenarios/load-test.yaml deleted file mode 100644 index d2cdcfe4..00000000 --- a/scenarios/load-test.yaml +++ /dev/null @@ -1,251 +0,0 @@ -# Chaos Mesh Workflow: load-test scenario. -# -# Provisions a 4-validator chain + 2-RPC fleet, renders the seiload -# profile JSON with per-run chain id + per-pod EVM endpoints, runs -# seiload against the fleet for DURATION_MINUTES, uploads the report. -# Replaces the bash orchestrate.sh at -# clusters/harbor/nightly/load/orchestrate.sh. -# -# Second scenario authored on the seitask primitives (release-test is -# N=1). Follows the same shape: thin wrapper CronJob fires this -# Workflow; ownerRefs cascade-clean SNDs / profile CM on Workflow -# deletion; trap-side state capture lives in the wrapper. -# -# Open prereq for first fire: pods/exec verb on the seitask-runner -# Role (wait-rpc-caught-up Task uses kubectl exec on the RPC pod to -# poll seid status). Track in the platform-repo wrapper PR. -# -# Placeholders (the wrapper envsubst's at apply time): -# $SEI_NAMESPACE namespace of workflow + provisioned SNDs -# $SEI_CHAIN_ID chain id (e.g. "bench-$SEI_WORKFLOW_RUN_ID") -# $SEI_WORKFLOW_RUN_ID unique per-run id -# $SEID_IMAGE seid container image -# $SEITASK_IMAGE seitask monolith image (SeiNetwork templates baked in) -# $SEILOAD_IMAGE sei-load benchmark image (ghcr.io/sei-protocol/sei-load:…) -# $SEILOAD_COMMIT_ID sei-chain commit SHA (parsed by wrapper from $SEID_IMAGE -# tag); flows into SEILOAD_COMMIT_ID metric/report label -# $SEILOAD_PROFILE profile name in the source CM (e.g., "nightly_evm_transfer") -# $DURATION_MINUTES seiload run length ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: Workflow -metadata: - name: load-test-$SEI_WORKFLOW_RUN_ID - labels: - sei.io/scenario: load-test - sei.io/workflow-run: "$SEI_WORKFLOW_RUN_ID" -spec: - entry: load-test - templates: - - name: load-test - templateType: Serial - deadline: 90m - children: - - provision-validator-chain - - provision-rpc-fleet - - wait-rpc-caught-up - - render-seiload-profile - - run-seiload - - upload-report - - # Every seitask container projects Workflow identity via downward API: - # NAME from chaos-mesh.org/workflow label, NAMESPACE from pod metadata. - # UID is fetched at runtime via taskruntime.LoadWorkflowIdentity. - - name: provision-validator-chain - templateType: Task - deadline: 25m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-snd - - --role=validator - - --name=$SEI_CHAIN_ID - - --template=/scenarios/load-test/validator.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --ready-timeout=18m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - - name: provision-rpc-fleet - # 42m covers worst-case sequential readiness: running-timeout 18m + - # N×(WaitCaughtUp + WaitEVMServing) at first-block-timeout 5m each - # (N=2 → 18m + 4×5m = 38m), plus 4m headroom. The Chaos-Mesh deadline - # clock starts at Task admission, so the headroom must also absorb pod - # scheduling + a cold $SEITASK_IMAGE pull (both outside the inner 38m) — - # else a slow-but-genuine readiness is killed opaquely before its typed - # exit lands, inverting the very invariant this budget protects. - templateType: Task - deadline: 42m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-node # rpc followers are standalone SeiNodes, not a SeiNetwork - - --role=rpc - - --name=$SEI_CHAIN_ID-rpc # BASE name; followers are -0..N-1 - - --replicas=2 # seiload drives all N via RPC_EVM_RPC_LIST - - --network=$SEI_CHAIN_ID # peer auto-wire to the genesis SeiNetwork (sei.io/seinetwork=) - - --template=/scenarios/load-test/rpc.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --running-timeout=18m # was --ready-timeout; SeiNode has no Ready phase (default --first-block-timeout=5m) - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - # Redundant secondary gate: provision-node already waited every follower - # caught-up (height>1 && catching_up==false) before publishing endpoints — - # this re-confirms node-0 (RPC_TM_RPC is node-0's URL, not an aggregate) once - # more right before pointing seiload at the fleet. The sed extract tolerates - # Sei CometBFT's envelope-or-bare /status shape. - - name: wait-rpc-caught-up - templateType: Task - deadline: 10m - task: - container: - name: wait-rpc - image: curlimages/curl:8.10.1 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - for i in $(seq 1 120); do - state=$(curl -fsS --max-time 5 "${RPC_TM_RPC}/status" 2>/dev/null \ - | sed -n 's/.*"catching_up":\([^,}]*\).*/\1/p' \ - | head -1 || true) - if [ "${state}" = "false" ]; then - echo "catching_up=false on ${RPC_TM_RPC}" - exit 0 - fi - sleep 5 - done - echo "timed out waiting for catching_up=false" >&2 - exit 1 - envFrom: - - configMapRef: - name: workflow-vars-load-test-$SEI_WORKFLOW_RUN_ID - - # Sed-substitute the source profile (mounted via volume from the - # seiload-profiles CM in this namespace) and create the per-run - # rendered ConfigMap with an ownerRef to the parent Workflow. - # $RPC_EVM_RPC_LIST is published by provision-node as comma- - # separated bare URLs; the profile expects `[...]` with JSON-quoted - # entries, so we quote+join here. - - name: render-seiload-profile - templateType: Task - deadline: 5m - task: - container: - name: render - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - # Fast-fail on missing source CM rather than letting the - # pod hang in ContainerCreating against a missing mount. - kubectl -n "${SEI_NAMESPACE}" get configmap seiload-profiles >/dev/null - - QUOTED=$(printf '%s' "${RPC_EVM_RPC_LIST}" | sed 's|\([^,][^,]*\)|"\1"|g') - sed \ - -e "s|__SEI_CHAIN_ID__|${CHAIN_ID}|g" \ - -e "s|__RPC_ENDPOINTS__|${QUOTED}|g" \ - "/profiles/${SEILOAD_PROFILE}.json" > /tmp/profile.json - - WORKFLOW_UID=$(kubectl -n "${SEI_NAMESPACE}" get \ - "workflow.chaos-mesh.org/load-test-${SEI_WORKFLOW_RUN_ID}" \ - -o jsonpath='{.metadata.uid}') - [ -n "${WORKFLOW_UID}" ] || { echo "no Workflow UID"; exit 1; } - - kubectl -n "${SEI_NAMESPACE}" create configmap "${PROFILE_CM}" \ - --from-file=profile.json=/tmp/profile.json \ - --dry-run=client -o yaml \ - | kubectl label -f - --local -o yaml \ - "sei.io/chain-id=${CHAIN_ID}" \ - "sei.io/workflow-run=${SEI_WORKFLOW_RUN_ID}" \ - | kubectl patch -f - --local --type=merge --patch \ - "{\"metadata\":{\"ownerReferences\":[{\"apiVersion\":\"chaos-mesh.org/v1alpha1\",\"kind\":\"Workflow\",\"name\":\"load-test-${SEI_WORKFLOW_RUN_ID}\",\"uid\":\"${WORKFLOW_UID}\",\"controller\":false,\"blockOwnerDeletion\":false}]}}" \ - -o yaml \ - | kubectl apply -f - - env: - - {name: SEI_NAMESPACE, valueFrom: {fieldRef: {fieldPath: metadata.namespace}}} - - {name: SEILOAD_PROFILE, value: "$SEILOAD_PROFILE"} - - {name: PROFILE_CM, value: "seiload-profile-$SEI_WORKFLOW_RUN_ID"} - envFrom: - - configMapRef: - name: workflow-vars-load-test-$SEI_WORKFLOW_RUN_ID - volumeMounts: - - {name: profiles, mountPath: /profiles, readOnly: true} - volumes: - - name: profiles - configMap: - name: seiload-profiles - - - name: run-seiload - templateType: Task - deadline: 30m - task: - container: - name: seiload - image: $SEILOAD_IMAGE - args: - - --config - - /etc/seiload/profile.json - - --duration=$(DURATION_MINUTES)m - - --post-summary-flush-delay=45s - - --track-receipts=true - ports: - - {name: metrics, containerPort: 9090, protocol: TCP} - env: - - {name: DURATION_MINUTES, value: "$DURATION_MINUTES"} - - {name: SEILOAD_RUN_ID, value: "$SEI_WORKFLOW_RUN_ID"} - - {name: SEILOAD_CHAIN_ID, value: "$SEI_CHAIN_ID"} - - {name: SEILOAD_COMMIT_ID, value: "$SEILOAD_COMMIT_ID"} - - {name: SEILOAD_WORKLOAD, value: nightly} - volumeMounts: - - {name: profile, mountPath: /etc/seiload, readOnly: true} - resources: - requests: {cpu: "2", memory: "4Gi"} - limits: {cpu: "4", memory: "8Gi"} - volumes: - - name: profile - configMap: - name: seiload-profile-$SEI_WORKFLOW_RUN_ID - - - name: upload-report - templateType: Task - deadline: 5m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - upload-report - - --bucket=harbor-validation-results - - --prefix=nightly/load-test/$SEI_WORKFLOW_RUN_ID - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace diff --git a/scenarios/load-test/rpc.yaml.tmpl b/scenarios/load-test/rpc.yaml.tmpl deleted file mode 100644 index 6a8fec4d..00000000 --- a/scenarios/load-test/rpc.yaml.tmpl +++ /dev/null @@ -1,23 +0,0 @@ -# Standalone follower SeiNode. provision-node renders this once per replica, -# stamping metadata.name=- (e.g. -rpc-0), namespace, -# ownerRef->Workflow, the sei.io/role=node + sei.io/seinetwork= object -# labels, and a synthesized peers[].label.selector{sei.io/seinetwork:}. -# This template describes the node only -- never its name, peering, or topology. -apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER # overwritten to - by provision-node -spec: - chainId: "{{ .CHAIN_ID }}" - image: "{{ .IMAGE }}" - fullNode: {} # the rpc role = EVM-serving full node - overrides: - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - evm.worker_pool_size: "32" - evm.worker_queue_size: "4000" - evm.max_tx_pool_txs: "10000" - {{- with (index . "MEMPOOL_SIZE") }} - mempool.size: "{{ . }}" - mempool.pending_size: "{{ . }}" - {{- end }} diff --git a/scenarios/load-test/validator.yaml.tmpl b/scenarios/load-test/validator.yaml.tmpl deleted file mode 100644 index 0abf81f2..00000000 --- a/scenarios/load-test/validator.yaml.tmpl +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: "{{ .IMAGE }}" - replicas: 4 - configOverrides: - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - {{- with (index . "MEMPOOL_SIZE") }} - mempool.size: "{{ . }}" - mempool.pending_size: "{{ . }}" - {{- end }} - genesis: - chainId: "{{ .CHAIN_ID }}" diff --git a/scenarios/major-upgrade.yaml b/scenarios/major-upgrade.yaml deleted file mode 100644 index 7f1812ef..00000000 --- a/scenarios/major-upgrade.yaml +++ /dev/null @@ -1,575 +0,0 @@ -# Chaos Mesh Workflow: major-upgrade scenario. -# -# Acceptance test for the SeiNodeTask MVP. Expresses -# sei-chain/integration_test/upgrade_module/major_upgrade_test.yaml as a -# composition of SeiNodeTask CRs driven by the seitask runner. -# -# Provisions a 4-validator chain in-workflow via provision-snd, runs the -# upgrade pipeline against it, uploads the run snapshot to S3. Matches the -# release-test/load-test pattern: SeiNetwork lifecycle and workflow-vars ConfigMap -# all carry ownerRef to this Workflow CR, so the wrapper's only cleanup duty -# is `kubectl delete workflow`. -# -# Upgrade mechanism: a single bump-snd-image step patches the SeiNetwork -# spec.image; the SeiNetwork controller rolls all validators onto the new binary. -# The SeiNetwork spec.image is the one source of truth for child image -- per-child -# UpdateNodeImage would fight the controller's spec.image re-assertion and -# churn the StatefulSet so the rollout never settles. -# -# Workflow-vars producers/consumers -# --------------------------------- -# provision-validator-chain seeds CHAIN_ID + VALIDATOR_TM_RPC + VALIDATOR_REST. -# compute-target-height patches TARGET_HEIGHT/UPGRADE_HEIGHT/POST_UPGRADE_HEIGHT. -# resolve-proposal-id patches PROPOSAL_ID. Every downstream step consumes via -# `envFrom: configMapRef`; runner steps use `$(VAR)` (K8s container env -# interpolation) inside --var args. -# -# PROPOSAL_ID resolution (chain-as-medium) -# ---------------------------------------- -# .status.outputs.govSoftwareUpgrade.proposalId is empty by design (no -# sidecar-derived structured outputs in MVP). The resolve-proposal-id step -# polls /cosmos/gov/v1beta1/proposals?proposal_status=2 (voting period) until -# a proposal matching $SEI_UPGRADE_NAME appears, then patches PROPOSAL_ID. -# -# Placeholders (wrapper envsubst's at apply time -- see scenarios/README.md): -# $SEI_NAMESPACE namespace of workflow + provisioned SeiNetwork -# $SEI_CHAIN_ID chain id; also the SeiNetwork name -# $SEI_PRE_UPGRADE_IMG seid image the validators boot on -# $SEI_POST_UPGRADE_IMG seid image the upgrade rolls out to -# $SEI_UPGRADE_NAME upgrade plan name registered in seid -# $SEITASK_IMAGE seitask monolith image (SeiNetwork templates baked in) -# $SEI_WORKFLOW_RUN_ID unique per-run id; suffixes Workflow + CM names ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: Workflow -metadata: - name: major-upgrade-$SEI_WORKFLOW_RUN_ID - labels: - sei.io/scenario: major-upgrade - sei.io/workflow-run: "$SEI_WORKFLOW_RUN_ID" -spec: - entry: major-upgrade - templates: - - name: major-upgrade - templateType: Serial - deadline: 90m - children: - - provision-validator-chain - - compute-target-height - - submit-upgrade-proposal - - resolve-proposal-id - - vote-yes-all-validators - - wait-for-proposal-to-pass - - settle-into-halt - - bump-snd-image - - await-post-upgrade-progress - - upload-report - - # Every seitask container projects Workflow identity via downward API: - # NAME from the chaos-mesh.org/workflow label chaos-mesh stamps on each - # Task pod, NAMESPACE from the pod's own metadata. UID isn't projectable - # so taskruntime.LoadWorkflowIdentity fetches it via the apiserver using - # NAME + NAMESPACE. - - name: provision-validator-chain - templateType: Task - deadline: 25m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-snd - - --role=validator - - --name=$SEI_CHAIN_ID - - --template=/scenarios/major-upgrade/validator.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEI_PRE_UPGRADE_IMG - - --ready-timeout=18m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - # Sets upgrade height = current + 200 blocks (~120s at Sei's ~600ms block - # time) to outlast the 60s gov voting_period plus tally + plan-execution - # slack. Patches the workflow-vars ConfigMap (seeded by provision-snd) with: - # TARGET_HEIGHT -- upgrade height - # UPGRADE_HEIGHT -- consumed by gov-software-upgrade.yaml.tmpl - # POST_UPGRADE_HEIGHT -- TARGET_HEIGHT + 10; liveness check threshold - - name: compute-target-height - templateType: Task - deadline: 5m - task: - container: - name: compute-target-height - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - CUR="" - for i in $(seq 1 30); do - CUR=$(curl -fsS --connect-timeout 3 "${VALIDATOR_TM_RPC}/status" 2>/dev/null \ - | sed -n 's/.*"latest_block_height":"\([0-9]*\)".*/\1/p' || true) - if [ -n "${CUR}" ]; then - echo "got height=${CUR} on attempt=${i}" - break - fi - echo "attempt=${i} RPC not ready yet; retrying in 3s" - sleep 3 - done - if [ -z "${CUR}" ]; then - echo "failed to parse latest_block_height from ${VALIDATOR_TM_RPC}/status after 30 attempts" >&2 - exit 1 - fi - TARGET=$((CUR + 200)) - POST=$((TARGET + 10)) - echo "current=${CUR} target=${TARGET} post=${POST}" - kubectl patch configmap "workflow-vars-major-upgrade-${SEI_WORKFLOW_RUN_ID}" \ - --type=merge \ - --patch "{\"data\":{\"TARGET_HEIGHT\":\"${TARGET}\",\"UPGRADE_HEIGHT\":\"${TARGET}\",\"POST_UPGRADE_HEIGHT\":\"${POST}\"}}" - env: - - name: SEI_WORKFLOW_RUN_ID - value: "$SEI_WORKFLOW_RUN_ID" - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Submits software-upgrade proposal at UPGRADE_HEIGHT via node-0's sidecar. - - name: submit-upgrade-proposal - templateType: Task - deadline: 10m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-software-upgrade.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-0 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=TITLE=major-upgrade scenario - - --var=DESCRIPTION=software-upgrade to $SEI_UPGRADE_NAME - - --var=UPGRADE_NAME=$SEI_UPGRADE_NAME - - --var=UPGRADE_HEIGHT=$(UPGRADE_HEIGHT) - - --var=INITIAL_DEPOSIT=20000000usei - - --var=FEES=10000usei - - --var=GAS=500000 - - --timeout=8m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Polls gov REST for a voting-period proposal whose content.plan.name - # matches $SEI_UPGRADE_NAME (legacy shape) OR messages[].content.plan.name - # (v1 shape). Writes PROPOSAL_ID to workflow-vars. 150 * 2s = 300s window. - - name: resolve-proposal-id - templateType: Task - deadline: 5m - task: - container: - name: resolve-proposal-id - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - for i in $(seq 1 150); do - BODY=$(curl -fsS "${VALIDATOR_REST}/cosmos/gov/v1beta1/proposals?proposal_status=2" || true) - PID=$(printf '%s' "${BODY}" | jq -r --arg n "${SEI_UPGRADE_NAME}" ' - .proposals // [] - | map(select( - (.content.plan.name? == $n) - or (.messages? // [] | map(.content.plan.name? // empty) | index($n)) - )) - | .[0].proposal_id // empty - ') - if [ -n "${PID}" ] && [ "${PID}" != "null" ]; then - echo "resolved proposal_id=${PID} for upgrade=${SEI_UPGRADE_NAME}" - kubectl patch configmap "workflow-vars-major-upgrade-${SEI_WORKFLOW_RUN_ID}" \ - --type=merge \ - --patch "{\"data\":{\"PROPOSAL_ID\":\"${PID}\"}}" - exit 0 - fi - echo "attempt=${i} no voting-period proposal matching ${SEI_UPGRADE_NAME} yet" - sleep 2 - done - echo "timed out resolving PROPOSAL_ID for upgrade=${SEI_UPGRADE_NAME}" >&2 - exit 1 - env: - - name: SEI_UPGRADE_NAME - value: "$SEI_UPGRADE_NAME" - - name: SEI_WORKFLOW_RUN_ID - value: "$SEI_WORKFLOW_RUN_ID" - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # vote-yes-all-validators -- parallel, one CR per validator. - - name: vote-yes-all-validators - templateType: Parallel - deadline: 10m - children: - - vote-node-0 - - vote-node-1 - - vote-node-2 - - vote-node-3 - - - name: vote-node-0 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-0 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: vote-node-1 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-1 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: vote-node-2 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-2 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: vote-node-3 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-3 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Polls REST gov endpoint until status=PROPOSAL_STATUS_PASSED. - - name: wait-for-proposal-to-pass - templateType: Task - deadline: 10m - task: - container: - name: wait-for-pass - image: curlimages/curl:8.10.1 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - for i in $(seq 1 300); do - STATUS=$(curl -fsS "${VALIDATOR_REST}/cosmos/gov/v1beta1/proposals/${PROPOSAL_ID}" \ - | sed -n 's/.*"status":[[:space:]]*"\([A-Z_]*\)".*/\1/p' | head -1) - echo "attempt=${i} proposal=${PROPOSAL_ID} status=${STATUS:-unknown}" - [ "${STATUS}" = "PROPOSAL_STATUS_PASSED" ] && exit 0 - sleep 1 - done - echo "proposal ${PROPOSAL_ID} did not pass within timeout" >&2 - exit 1 - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Waits for the chain to reach UPGRADE_HEIGHT and halt before the binary - # swap. The old binary panics ("UPGRADE NEEDED") at UPGRADE_HEIGHT; the new - # binary panics ("BINARY UPDATED BEFORE TRIGGER", sei-cosmos x/upgrade - # abci.go) if it processes ANY block below UPGRADE_HEIGHT. So bump-snd-image - # must land only after every validator has committed UPGRADE_HEIGHT-1 and - # halted. The height can't be polled at that point -- all validators halt - # together and stop serving RPC exactly when the predicate would be true -- - # so this is a fixed wait, not an AwaitCondition. UPGRADE_HEIGHT is current - # + 200 blocks measured at compute-target-height, but the proposal flow - # (~60s voting period + tally) burns most of that budget first, so only - # ~100 blocks (~60s at ~600ms blocks) remain once the proposal has passed. - # Over-waiting is free (the chain just sits halted until the swap); the only - # failure mode is waiting too short. The full wall-clock from height - # measurement to swap (~60s voting + 150s here) must exceed 200 x block_time, - # so block time above ~1s would break it -- raise this if a cold chain's - # early blocks run slow. - - name: settle-into-halt - templateType: Task - deadline: 8m - task: - container: - name: settle-into-halt - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - echo "waiting 150s for the chain to reach UPGRADE_HEIGHT and halt before swapping the binary" - sleep 150 - echo "settle window elapsed; proceeding to bump-snd-image" - - # Bumps the SeiNetwork image to the post-upgrade build in - # a single patch. The SeiNetwork controller re-asserts the new - # image onto every child SeiNode and drives each node's NodeUpdate plan; - # the validators roll together onto the new binary at the upgrade height. - # - # Patches spec.image only -- a merge patch leaves the rest of the spec - # untouched. Per-child UpdateNodeImage is NOT used here: the SeiNetwork - # controller would re-assert spec.image every reconcile, flip-flopping the - # child spec.image and churning the StatefulSet so the rollout never settles - # (observe-image never completes). The SeiNetwork spec.image is the single - # source of truth for child image. - - name: bump-snd-image - templateType: Task - deadline: 5m - task: - container: - name: bump-snd-image - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - kubectl patch seinetwork "${SEI_CHAIN_ID}" \ - --type=merge \ - --patch "{\"spec\":{\"image\":\"${SEI_POST_UPGRADE_IMG}\"}}" - echo "patched seinetwork/${SEI_CHAIN_ID} image to ${SEI_POST_UPGRADE_IMG}" - env: - - name: SEI_CHAIN_ID - value: "$SEI_CHAIN_ID" - - name: SEI_POST_UPGRADE_IMG - value: "$SEI_POST_UPGRADE_IMG" - - # Liveness: each validator advances past TARGET_HEIGHT+10 - # (= POST_UPGRADE_HEIGHT) after the SeiNetwork rolls all nodes onto the new - # binary. AwaitCondition over the height predicate, one per validator. - - name: await-post-upgrade-progress - templateType: Parallel - deadline: 15m - children: - - await-post-upgrade-progress-node-0 - - await-post-upgrade-progress-node-1 - - await-post-upgrade-progress-node-2 - - await-post-upgrade-progress-node-3 - - - name: await-post-upgrade-progress-node-0 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-0 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: await-post-upgrade-progress-node-1 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-1 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: await-post-upgrade-progress-node-2 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-2 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: await-post-upgrade-progress-node-3 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-3 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: upload-report - templateType: Task - deadline: 5m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - upload-report - - --bucket=harbor-validation-results - - --prefix=nightly/major-upgrade/$SEI_WORKFLOW_RUN_ID - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace diff --git a/scenarios/major-upgrade/validator.yaml.tmpl b/scenarios/major-upgrade/validator.yaml.tmpl deleted file mode 100644 index 29d11430..00000000 --- a/scenarios/major-upgrade/validator.yaml.tmpl +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: "{{ .IMAGE }}" - replicas: 4 - configOverrides: - tx_index.indexer: kv - api.rest.enable: "true" - genesis: - chainId: "{{ .CHAIN_ID }}" - overrides: - gov.voting_params.voting_period: "60s" diff --git a/scenarios/release-test.yaml b/scenarios/release-test.yaml deleted file mode 100644 index 07de3bb3..00000000 --- a/scenarios/release-test.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# Chaos Mesh Workflow: release-test scenario. -# -# Provisions a 4-validator chain + 2-RPC fleet, runs the release-test image -# against the RPC endpoints, uploads the run snapshot to S3. First scenario -# composed end-to-end from seitask primitives (keygen, provision-snd, -# upload-report). Workflow-vars CM bridges per-step values; see -# scenarios/major-upgrade.yaml for the pattern. -# -# Cleanup: every per-run resource (SNDs, admin Secret, workflow-vars CM) -# carries ownerRef to this Workflow CR. The wrapper's only cleanup duty is -# `kubectl delete workflow` — kube-controller-manager cascades. -# -# Upload-report is the last Serial child: a failed earlier step bails the -# Serial before upload fires. Phase 2b lifts upload-report into an -# always-fire Parallel branch. -# -# Placeholders (the wrapper envsubst's at apply time): -# $SEI_NAMESPACE namespace of workflow + provisioned SNDs -# $SEI_CHAIN_ID chain id (e.g. "rel-$SEI_WORKFLOW_RUN_ID") -# $SEI_WORKFLOW_RUN_ID unique per-run id; suffixes Workflow + Secret names -# $SEID_IMAGE seid container image -# $SEITASK_IMAGE seitask monolith image (SeiNetwork templates baked in) -# $RELEASE_TEST_IMAGE release-test harness image ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: Workflow -metadata: - name: release-test-$SEI_WORKFLOW_RUN_ID - labels: - sei.io/scenario: release-test - sei.io/workflow-run: "$SEI_WORKFLOW_RUN_ID" -spec: - entry: release-test - templates: - - name: release-test - templateType: Serial - deadline: 60m - children: - - keygen-admin - - provision-validator-chain - - provision-rpc-fleet - - run-release-test - - upload-report - - # Every seitask container projects Workflow identity via downward API: - # NAME from the chaos-mesh.org/workflow label chaos-mesh stamps on - # each Task pod, NAMESPACE from the pod's own metadata. UID isn't - # projectable so taskruntime.LoadWorkflowIdentity fetches it via the - # apiserver using NAME + NAMESPACE. - - name: keygen-admin - templateType: Task - deadline: 2m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - keygen - - --key-name=admin - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - # Inner --ready-timeout + default first-block-timeout sit 2m below the - # Task deadline so provision-snd's typed exit reaches the parent before - # chaos-mesh kills the pod on deadline exceeded. - - name: provision-validator-chain - templateType: Task - deadline: 25m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-snd - - --role=validator - - --name=$SEI_CHAIN_ID - - --template=/scenarios/release-test/validator.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --var=ADMIN_ADDRESS=$(ADMIN_ADDRESS) - - --ready-timeout=18m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-release-test-$SEI_WORKFLOW_RUN_ID - - - name: provision-rpc-fleet - # 32m covers worst-case sequential readiness: running-timeout 18m + - # WaitCaughtUp + WaitEVMServing at first-block-timeout 5m each - # (N=1 → 18m + 2×5m = 28m), plus 4m headroom. The Chaos-Mesh deadline - # clock starts at Task admission, so the headroom must also absorb pod - # scheduling + a cold $SEITASK_IMAGE pull (both outside the inner 28m) — - # else a slow-but-genuine readiness is killed opaquely before its typed - # exit lands, inverting the very invariant this budget protects. - templateType: Task - deadline: 32m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-node # rpc follower is a standalone SeiNode, not a SeiNetwork - - --role=rpc - - --name=$SEI_CHAIN_ID-rpc # BASE name; follower is -0 - - --replicas=1 # mocha hits a single RPC (RPC_TM_RPC/EVM/REST) - - --network=$SEI_CHAIN_ID # peer auto-wire to the genesis SeiNetwork (sei.io/seinetwork=) - - --template=/scenarios/release-test/rpc.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --running-timeout=18m # was --ready-timeout; SeiNode has no Ready phase (default --first-block-timeout=5m) - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-release-test-$SEI_WORKFLOW_RUN_ID - - # With one rpc follower (replicas=1), TM RPC + REST + EVM RPC all resolve to - # that node via provision-node's RoleScoped(rpc, *) off node-0's - # .status.endpoint. A single node also gives the stateful sequences - # (sei_newFilter + sei_getFilterLogs, eth_sendRawTransaction + tx.wait) one - # consistent mempool + filter-store view. - - name: run-release-test - templateType: Task - deadline: 30m - task: - container: - name: release-test - image: $RELEASE_TEST_IMAGE - envFrom: - - configMapRef: - name: workflow-vars-release-test-$SEI_WORKFLOW_RUN_ID - env: - - {name: TEST_TARGET, value: chain-agnostic} - - {name: SEI_CHAIN_ID, value: $(CHAIN_ID)} - - {name: SEI_ADMIN_ADDRESS, value: $(ADMIN_ADDRESS)} - - {name: SEI_TENDERMINT_RPC, value: $(RPC_TM_RPC)} - - {name: SEI_EVM_JSON_RPC, value: $(RPC_EVM_RPC)} - - {name: SEI_REST_ENDPOINT, value: $(RPC_REST)} - - name: SEI_ADMIN_MNEMONIC - valueFrom: - secretKeyRef: - name: admin-release-test-$SEI_WORKFLOW_RUN_ID - key: mnemonic - resources: - requests: {cpu: 500m, memory: 1Gi} - limits: {memory: 2Gi} - - - name: upload-report - templateType: Task - deadline: 5m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - upload-report - - --bucket=harbor-validation-results - - --prefix=nightly/release-test/$SEI_WORKFLOW_RUN_ID - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace diff --git a/scenarios/release-test/rpc.yaml.tmpl b/scenarios/release-test/rpc.yaml.tmpl deleted file mode 100644 index dc870b7c..00000000 --- a/scenarios/release-test/rpc.yaml.tmpl +++ /dev/null @@ -1,20 +0,0 @@ -# Standalone follower SeiNode. provision-node renders this once per replica, -# stamping metadata.name=- (e.g. -rpc-0), namespace, -# ownerRef->Workflow, the sei.io/role=node + sei.io/seinetwork= object -# labels, and a synthesized peers[].label.selector{sei.io/seinetwork:}. -# This template describes the node only -- never its name, peering, or topology. -apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER # overwritten to - by provision-node -spec: - chainId: "{{ .CHAIN_ID }}" - image: "{{ .IMAGE }}" - fullNode: {} # the rpc role = EVM-serving full node - overrides: - tx_index.indexer: kv - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - mempool.ttl_duration: 60s - network.rpc.lag_threshold: "2" - evm.enabled_legacy_sei_apis: sei_getLogs,sei_getBlockByNumber,sei_getBlockByHash,sei_getSeiAddress,sei_getEVMAddress,sei_getCosmosTx,sei_getEvmTx,sei_newFilter,sei_getFilterLogs diff --git a/scenarios/release-test/validator.yaml.tmpl b/scenarios/release-test/validator.yaml.tmpl deleted file mode 100644 index 1b7a17bd..00000000 --- a/scenarios/release-test/validator.yaml.tmpl +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: "{{ .IMAGE }}" - replicas: 4 - configOverrides: - tx_index.indexer: kv - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - mempool.ttl_duration: 60s - genesis: - chainId: "{{ .CHAIN_ID }}" - accounts: - - address: "{{ .ADMIN_ADDRESS }}" - balance: 1000000000000usei diff --git a/scenarios/testnet-deployment.yaml b/scenarios/testnet-deployment.yaml deleted file mode 100644 index 86000727..00000000 --- a/scenarios/testnet-deployment.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Reference 4-validator SeiNetwork that the major-upgrade Workflow -# can target. NOT a production manifest -- intended for ephemeral harbor -# dev cluster testnets. Adjust .spec.image and genesis.chainId to -# match the upgrade you are exercising. -# -# After apply, wait for status.replicas==status.readyReplicas==4 and all -# 4 SeiNodes in phase Running before applying scenarios/major-upgrade.yaml. ---- -apiVersion: v1 -kind: Namespace -metadata: - name: majorupgrade - labels: - pod-security.kubernetes.io/enforce: restricted - pod-security.kubernetes.io/audit: restricted - pod-security.kubernetes.io/warn: restricted - sei.io/scenario: major-upgrade ---- -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: majorupgrade - namespace: majorupgrade -spec: - replicas: 4 - # PRE_UPGRADE image -- bump in lockstep with $SEI_PRE_UPGRADE_IMG in - # the Workflow apply step. - image: ghcr.io/sei-protocol/sei:v6.3.0 - genesis: - chainId: majorupgrade-1 - stakingAmount: "10000000usei" - sidecar: - image: ghcr.io/sei-protocol/seictl:v0.0.29 diff --git a/sdk/CLAUDE.md b/sdk/CLAUDE.md index 4f701fa7..0d10b20c 100644 --- a/sdk/CLAUDE.md +++ b/sdk/CLAUDE.md @@ -47,8 +47,8 @@ timeout spec fields; `sei.IsTimeout(err)` reports a deadline. **`WaitCaughtUp` / `WaitEVMServing` = STRICT readiness** (`readiness.go`). The caught-up gate (TM `/status`: `height>1 && catching_up==false`) and the EVM serve gate (`eth_blockNumber` bound) — the heavier contract `WaitReady` deliberately is -not. URL-based and stdlib-only (no apimachinery), so seictl, the seitask Task -steps, and external integration harnesses share one readiness implementation +not. URL-based and stdlib-only (no apimachinery), so seictl and external +integration harnesses share one readiness implementation instead of bespoke bash. Inputs (endpoint URLs) come from whatever produced the resource — e.g. the CLI create command's stdout. @@ -89,6 +89,6 @@ authors them once. - **`provider.Provider` interface + `Register`/`Factory`.** The handle-based CRUD driver-registration contract. - **Object-label keys** `sei.io/role=node`, `sei.io/seinetwork=`. The - fleet-wide selector contract shared with seictl, seitask, chaos selectors. + fleet-wide selector contract shared with seictl + chaos selectors. - **SSA FieldOwner `sei-sdk`.** A distinct field manager. Renaming it orphans field ownership on objects the SDK already created. diff --git a/sdk/sei/.xreview/sdk-task-surface.md b/sdk/sei/.xreview/sdk-task-surface.md deleted file mode 100644 index 04e27516..00000000 --- a/sdk/sei/.xreview/sdk-task-surface.md +++ /dev/null @@ -1,46 +0,0 @@ -# xreview ledger — SDK SeiNodeTask surface (WS-G) - -Class: component (public SDK surface over the SeiNodeTask CRD) -Tier: T2 - -Target: `sdk/sei/task.go`, `sdk/sei/provider.go`, `sdk/sei/provider/k8s/{render,handle,k8s}.go`, stubs + tests -Artifact: branch `feat/sdk-task-surface` (diff /tmp/wsg-task-surface.diff) - -## Round 1 - -State: RESOLVED -OpenFindings: 0 -Convergence: independent (4 blinded reviewers) -Blinded: yes -Dissenter: sei-network-specialist (DISSENT → resolved) - -Slate: kubernetes-specialist (CRD-contract), idiomatic-reviewer (Go idiom), systems-engineer (poll/error contract), sei-network-specialist (dissenter, upgrade semantics). - -### Boundary table - -| Boundary | Provider | Consumer | Status | Evidence | Raised by | -|---|---|---|---|---|---| -| GovSoftwareUpgrade proposal-ID handoff | nodetask controller | harness (GovVote input) | **MISMATCH → FIXED** | `controller.go:360-369` populateOutputs only handles UpdateNodeImage; gov/await outputs never written (chain-as-medium by design). SDK advertised `ProposalID` as "the GovVote input" → always 0 → GovVote.proposalId Minimum=1 admission reject (`seinodetask_types.go:366`). | dissenter (lead), k8s, systems | -| UpdateNodeImage RequirePhase on halted node | nodetask controller | harness (step 4) | **MISMATCH → FIXED** | Gate is `==` exact-match defaulting Running (`controller.go:195-199`); a node halted at upgrade height still reports Running (phase sticky). SDK doc+test told callers to relax to Pending → terminal timeout. | dissenter | -| Payload field mapping (4 kinds) | CRD | renderTask | COMPATIBLE | All fields field-for-field congruent (render.go). | k8s | -| SSA / status subresource | CRD | k8s apply | COMPATIBLE | Main-resource Apply; status subresource separate — no status stomp. | k8s | -| WaitComplete poll loop | — | harness | **MISMATCH → FIXED** | Only NotFound tolerated; a transient Get error aborts a multi-minute wait. Tolerate retryable (ServerTimeout/TooManyRequests/InternalError). | systems | -| Complete + nil outputs | controller | WaitComplete | MISMATCH → MITIGATED | `(nil,nil)` nil-deref hazard; largely mooted by removing the unpopulated gov output types. | systems, k8s | -| Resubmit / idempotency | CRD + controller | RunTask | MISSING → DOCUMENTED | same-name re-apply is a no-op (no double-submit); delete+recreate resubmits the gov-tx. Doc'd on RunTask. CEL immutability + on-chain dedup are later coordinated CRD work. | systems | -| GovVote per-validator key derivation | controller | harness | COMPATIBLE | `KeyName:""` derives per-target SeiNode key (`seinodetask_params.go:120,231`); no shared-key assumption. | dissenter | - -### Idiom addendum (idiomatic-reviewer — RATIFY) -Clean. No correctness/divergence-with-consequence findings. Endorsed `WaitComplete (*TaskOutputs, error)` as the correct one-shot-terminal shape. Two pure-style notes accepted as-is. Process note: add a provider-side one-output-per-Kind test. - -### Resolutions (this PR, no controller change) -1. **ProposalID lie:** removed `GovSoftwareUpgradeOutputs`/`GovVoteOutputs`/`AwaitNodesAtHeightOutputs` from the SDK (all structurally unpopulated); `TaskOutputs` now carries only `UpdateNodeImage` (the sole kind populateOutputs writes). Package + payload docs rewritten to the chain-as-medium reality. -2. **RequirePhase backwards:** UpdateNodeImage doc fixed — a halted node still reports Running, so the default is correct; removed the relax-to-Pending guidance; repurposed the test to verify mechanical RequirePhase override (not the upgrade-failure pattern). -3. **Transient-error tolerance:** WaitComplete keeps polling on retryable Get errors. -4. **Cheap hardening:** validateTaskSpec now validates GovVote.Option enum + rejects 0,sei.io/role=node), and seictl all match the exact -// literals. +// truth. seictl's copy is unexported (internal/), so the SDK authors these once +// here. Changing a value is a fleet-wide breaking change: chaos selectors, +// follower-discovery queries (node list -l sei.io/seinetwork=,sei.io/role=node), +// and seictl all match the exact literals. const ( // LabelRole keys the role an object plays in a network. LabelRole = "sei.io/role" @@ -20,6 +19,6 @@ const ( ) // FieldOwner is the SSA field manager the SDK applies under — a distinct writer -// from seictl ("seictl") and seitask ("seitask-provision-node"). Stable: renaming -// it orphans field ownership on objects the SDK already created. +// from seictl ("seictl"). Stable: renaming it orphans field ownership on objects +// the SDK already created. const FieldOwner = "sei-sdk" diff --git a/sdk/sei/provider/k8s/render.go b/sdk/sei/provider/k8s/render.go index 1fab481e..b1e30d3d 100644 --- a/sdk/sei/provider/k8s/render.go +++ b/sdk/sei/provider/k8s/render.go @@ -13,8 +13,7 @@ import ( "github.com/sei-protocol/sei-k8s-controller/sdk/sei" ) -// fieldOwner is the SDK's SSA field manager. A distinct writer from -// seictl/seitask. +// fieldOwner is the SDK's SSA field manager. A distinct writer from seictl. const fieldOwner client.FieldOwner = sei.FieldOwner // renderNetwork builds the SeiNetwork from a NetworkSpec. ChainID is not a spec diff --git a/sdk/sei/readiness.go b/sdk/sei/readiness.go index c594e2f7..abc241bd 100644 --- a/sdk/sei/readiness.go +++ b/sdk/sei/readiness.go @@ -14,8 +14,8 @@ import ( // Readiness probes are the generally-useful chain-provisioning lifecycle piece: // "the node has joined consensus and is actually serving," not merely "the pod is // Running." They are mode-agnostic — they take a published endpoint URL and speak -// HTTP, so the k8s/local/docker providers, the seitask Task steps, and external -// harnesses all share one implementation. Kept stdlib-only (no apimachinery) so +// HTTP, so the k8s/local/docker providers and external callers all share one +// implementation. Kept stdlib-only (no apimachinery) so // the core package stays dependency-free for lightweight external consumers. // probeInterval is the readiness poll cadence; a var so tests can shrink it diff --git a/sdk/sei/sei.go b/sdk/sei/sei.go index a7fa3b21..8aec62b2 100644 --- a/sdk/sei/sei.go +++ b/sdk/sei/sei.go @@ -1,4 +1,4 @@ -// Package sei is a thin, typed, stateless, multi-mode Go-native API for +// Package sei is a thin, typed, stateless, multi-mode API for // SeiNetwork/SeiNode lifecycle. It mirrors database/sql: a provider registers in // init(), the consumer blank-imports it, and Open selects the mode by name. // @@ -130,7 +130,7 @@ func (c *Client) GetNode(ctx context.Context, name, namespace string) (*Node, er return &Node{handle: h}, nil } -// Network is a Go-native handle to a SeiNetwork. Endpoint getters read the +// Network is a handle to a SeiNetwork. Endpoint getters read the // runtime's status verbatim — never reconstructed. type Network struct{ handle NetworkHandle } @@ -159,7 +159,7 @@ func (n *Network) Delete(ctx context.Context) error { return n.handle.Delete(ctx // type-asserts; local/docker stubs return nil. func (n *Network) Object() any { return n.handle.Object() } -// Node is a Go-native handle to a SeiNode. +// Node is a handle to a SeiNode. type Node struct{ handle NodeHandle } // Name is the SeiNode resource name. diff --git a/sdk/sei/task.go b/sdk/sei/task.go index c7e22079..748a46ef 100644 --- a/sdk/sei/task.go +++ b/sdk/sei/task.go @@ -8,9 +8,8 @@ import ( // Task support. A SeiNodeTask is a one-shot, typed operation against a single // SeiNode — submit a gov upgrade proposal, vote, wait for a height, swap the -// node image. The harness drives a major-upgrade or release scenario by running -// these in statement order. This replaces the Chaos-Mesh Workflow DAG + env-file -// handoffs the seitask-runner used. +// node image. A caller drives a major-upgrade or release flow by running these +// in statement order. // // Cross-task coordination is chain-as-medium, NOT task-to-task output currying: // the controller surfaces typed Outputs only for UpdateNodeImage today (the gov @@ -190,7 +189,7 @@ func (c *Client) GetTask(ctx context.Context, name, namespace string) (*Task, er return &Task{handle: h}, nil } -// Task is a Go-native handle to a SeiNodeTask. +// Task is a handle to a SeiNodeTask. type Task struct{ handle TaskHandle } // Name is the SeiNodeTask resource name. diff --git a/test/integration/.xreview/release-suite.md b/test/integration/.xreview/release-suite.md deleted file mode 100644 index 03892cb0..00000000 --- a/test/integration/.xreview/release-suite.md +++ /dev/null @@ -1,40 +0,0 @@ -# xreview ledger — TestRelease + keygen refactor (WS-I) - -Class: component (integration suite + internal package refactor + additive SDK surface) -Tier: T2 - -Target: `test/integration/release_test.go`, `internal/keygen/*`, `internal/seitask/keygen/keygen.go`, `sdk/sei` Node.REST() -Artifact: branch `feat/test-release` - -## Round 1 - -State: RESOLVED -OpenFindings: 0 -Convergence: independent (4 blinded reviewers) -Blinded: yes -Dissenter: sei-network-specialist (DISSENT → resolved) - -Slate: sei-network-specialist (dissenter), systems-engineer, kubernetes-specialist, idiomatic-reviewer. - -### Findings - -| Finding | Status | Evidence | Raised by | Resolution | -|---|---|---|---|---| -| Dropped envFrom / RPC_EVM_RPC_LIST | **MISMATCH → FIXED** | Scenario injects env via `envFrom: workflow-vars CM` ∪ explicit list; the CM carries RPC_EVM_RPC_LIST (+ RPC_*/CHAIN_ID/ADMIN_ADDRESS) with no explicit equivalent. A harness sub-case reading it would skip silently → exit 0 false-pass. | dissenter (headline) | Job env now reproduces the scenario superset: the RPC_*/CHAIN_ID/ADMIN_ADDRESS CM names alongside the SEI_* explicit names. | -| Verdict is exit-0-only | **MISSING → FIXED** | No record of which sub-cases ran; scenario had upload-report (S3 audit). Strictly less observable than the artifact. | dissenter | Log the harness pod-log tail on completion (success too), so a skip-but-exit-0 is forensically visible. (Full S3/report = the deferred telemetry component.) | -| Job missing securityContext/resources/ttl | **MISMATCH → FIXED** | seiload_job.yaml.tmpl sets runAsNonRoot/seccomp/drop-ALL/readOnlyRootFS + resources + ttl; releaseJob set none → restricted-PSS admission could reject. | k8s | releaseJob now matches the seiload baseline (security context, resources, ttlSecondsAfterFinished). | -| REST handed unprobed (cold-start) | **MISSING → FIXED** | rest=="" is a status-string check, not a serve-probe; LCD binds later than the EVM listener → cold-REST window. | systems, k8s, dissenter | Added sei.WaitRESTServing (GET /cosmos/base/tendermint/v1beta1/node_info), symmetric with WaitEVMServing; replaces the bare empty-check. | -| waitJob drops podLogTail on ctx/signal | **MISMATCH → FIXED** | ctx.Done() branch failed with only ctx.Err() — no harness log on Ctrl-C/SIGTERM/timeout. | systems | ctx.Done() branch now tails the pod log (fresh ctx); messages genericized from "seiload job" to "job". | -| releaseBaseConfig handed un-cloned | **flag → FIXED** | provision maps.Clone's config; release passed the package-global directly. | systems | maps.Clone at the network create. | -| REST on by default for fullNode | COMPATIBLE | Smoke-confirmed (`...:1317` populated); ModeFull → REST.Enable=true, validators → false (explains the upgrade-suite gap). | dissenter (refutes own attack) | — | -| Secret material handling | COMPATIBLE | secretKeyRef (not plain env), never logged, Data not StringData. | systems | — | -| keygen refactor behavior | COMPATIBLE | Pure extraction verified line-by-line (entropy/path/pipeline/idempotency/ownerRef preserved). | k8s, idiom | — | -| Single RPC + EVM-legacy + funding + namespace | COMPATIBLE | One-node filter consistency correct; 1e12 usei matches scenario; co-located. | dissenter, k8s | — | -| Suite SA needs secrets RBAC | RESOLVED (Brandon-authorized) | createMnemonicSecret needs secrets create/delete; smoke confirmed Forbidden without it. | k8s, systems | Granted to the harness Role; the committed manifest lands in the cutover. | - -### Idiom addendum (RATIFY) -Clean. The Go-built Job (vs seiload's template) is principled (shape owned by the suite, not platform) — do NOT harmonize. Node.REST() additive-safe, mirrors EVMRPC/TendermintRPC. Nits fixed: keygen.go:9 typo. SecretMnemonicKey placement vet-and-rejected (one shared const). - -### Deferred -- Full upload-report / S3 audit trail = the deferred telemetry/report component (PromQL punted to last); the pod-log tail is the interim observability. -- Secret leak on SIGKILL until the label-GC sweep ships (cutover) — documented; mnemonic is for a throwaway chain (DeletionDelete cascade). diff --git a/test/integration/.xreview/upgrade-suite.md b/test/integration/.xreview/upgrade-suite.md deleted file mode 100644 index 206ccf1b..00000000 --- a/test/integration/.xreview/upgrade-suite.md +++ /dev/null @@ -1,41 +0,0 @@ -# xreview ledger — TestChainUpgrade (WS-I Step 5) - -Class: component (integration suite consuming the SDK + harness) -Tier: T2 - -Target: `test/integration/upgrade_test.go` -Artifact: branch `feat/test-chain-upgrade` - -## Round 1 - -State: RESOLVED -OpenFindings: 0 -Convergence: independent (4 blinded reviewers) -Blinded: yes -Dissenter: sei-network-specialist (DISSENT → resolved) - -Slate: sei-network-specialist (dissenter, upgrade mechanics), systems-engineer (concurrency/poll/error), kubernetes-specialist (re-apply/targeting), idiomatic-reviewer. - -### Boundary table - -| Boundary | Status | Evidence | Raised by | Resolution | -|---|---|---|---|---| -| Halt detection (step 5) | **MISMATCH → FIXED** | Scenario uses a fixed wait, NOT a poll — validators halt together and stop serving RPC exactly when the predicate is true (major-upgrade.yaml:380-392); aggregate VIP drops NotReady backends at halt → black hole. My pollHeightAtLeast(aggregate, upgradeHeight-1) hangs/flakes. | dissenter | Poll aggregate only to a PRE-halt height (upgradeHeight-haltMargin, endpoint still alive), then a bounded settle for the halt; bump after. | -| Proposal-resolve JSON shape | **MISMATCH → FIXED** | Scenario jq matches content.plan.name OR messages[].content.plan.name (legacy+v1, :177-200); my struct only decodes content → hangs on gov v1. | dissenter | govProposal decodes both arms; matcher checks both. | -| Recovery gate soundness | **MISMATCH → FIXED** | waitValidatorsReady = PodReady ≠ rejoined-consensus-on-new-binary (brief-Ready-then-CrashLoop slips through). | k8s, dissenter | Replaced with RunTask(AwaitNodesAtHeight{upgradeHeight+postDelta}) per validator (port-faithful to scenario step 9, semantic, targets by NodeRef — no pod-label dependence) + a definitive /cosmos/upgrade applied-plan assertion. | -| False-pass on fast blocks | **MISSING → FIXED** | No assertion the upgrade actually executed; fast blocks → chain passes upgradeHeight while voting → no plan → no halt → WaitHeightAdvances greens a never-upgraded chain. | dissenter | Added waitUpgradeApplied (/cosmos/upgrade/v1beta1/applied_plan/{name} height>0) — proves the handler ran. | -| Enveloped-only /status decode | **MISMATCH → FIXED** | pollHeight re-models /status wrapped-only; the Sei fork sometimes answers unwrapped (SDK latestHeight handles both) → spins forever on such a node. | idiom | Promote SDK latestHeight → exported sei.LatestHeight (dual-shape); suite consumes it. | -| Diagnosability of timeouts | **MISMATCH → FIXED** | Poll helpers swallow last-seen height/status into bare deadline errors — below the WaitHeightAdvances bar; suite is unattended-nightly. | systems | pollREST threads a last-seen string into the deadline error; height polls use LatestHeight's value. | -| Task GC label | **MISSING → FIXED** | Task CRs carry no sei.io/harness-run label → leak on abnormal exit. | k8s | Added SDK TaskSpec.Labels (mirrors NetworkSpec.Labels); suite stamps runLabelKey. | -| Vote error fan-in | flag → FIXED | Only first-in-slice error surfaced. | systems, idiom | errors.Join across all validators. | -| NetworkSpec drift (provision vs bump) | flag → FIXED | Two hand-duplicated literals; a future field added to one strips it on the other via ForceOwnership. | systems, k8s | Single networkSpec builder; bump mutates only Image. | -| Image bump full-spec SSA re-apply | COMPATIBLE | k8s read the controller: it never writes the parent spec (finalizer + status only); genesis re-stamp idempotent (ceremony latched, nodes.go:91); no replica churn; no SSA conflict. Verified equivalent to `patch spec.image`. | k8s (refutes dissenter concern) | Kept; builder shared. | -| Concurrency / leaks / SIGTERM | COMPATIBLE | Race-free vote fan-out, body closed, ctx nesting + NotifyContext match sibling suites. | systems | — | -| Validator naming / namespace co-location | COMPATIBLE | - 0-based matches controller labels.go; task/target/pods co-located. | k8s | — | - -### Idiom addendum (RATIFY) -Reads native (env+spec idiom, helpers, comment register, build-tag). Divergence-with-consequence = the /status + poll duplication of the SDK (resolved by exporting LatestHeight; gov-REST polling stays harness-local as gov-query orchestration, not readiness — matches the scope rule). - -### Deferred (not blocking) -- Parent 60m can fire mid-child-step → misattributed error (systems): generous envelope; un-defer on first spurious occurrence. -- min_deposit / deposit-period hang (dissenter): params match the proven scenario (20000000usei clears min_deposit); the last-seen diagnostic surfaces a stuck deposit-period proposal if it ever regresses. diff --git a/test/integration/Dockerfile b/test/integration/Dockerfile index e5bfc383..98b848e9 100644 --- a/test/integration/Dockerfile +++ b/test/integration/Dockerfile @@ -1,8 +1,7 @@ -# The integration harness image: the build-tagged test binary, compiled once and -# run by one in-cluster CronJob per target (args: -test.run TestX). It replaces -# the seitask-runner image + the Chaos-Mesh Workflow scenarios — the suites carry -# their fault/seiload templates via //go:embed, so the binary is self-contained -# (no scenario files to COPY). +# The integration test suite compiled to an image: the build-tagged test binary, +# run by one in-cluster CronJob per target (args: -test.run TestX). The suites +# carry their fault/seiload templates via //go:embed, so the binary is +# self-contained (no extra files to COPY). FROM golang:1.26 AS builder ARG TARGETOS ARG TARGETARCH diff --git a/test/integration/release_test.go b/test/integration/release_test.go index 68809eb8..49ae922c 100644 --- a/test/integration/release_test.go +++ b/test/integration/release_test.go @@ -22,14 +22,12 @@ import ( ) // releaseAdminBalance funds the admin account in genesis so the release-test -// harness can sign and pay for the txs it issues. Ported from the release-test -// scenario's validator template. +// harness can sign and pay for the txs it issues. const releaseAdminBalance = "1000000000000usei" // releaseBaseConfig is the seid config the release chain runs with: the memiavl // storage baseline (the nightly image rejects the cosmos_only default) plus kv tx -// indexing (the harness queries txs) and a short mempool TTL. Ported from the -// release-test scenario's validator + rpc configOverrides. +// indexing (the harness queries txs) and a short mempool TTL. var releaseBaseConfig = mergeConfig(memiavlStorageConfig, map[string]string{ "tx_index.indexer": "kv", "mempool.ttl_duration": "60s", @@ -48,13 +46,12 @@ var releaseRPCConfig = map[string]string{ "evm.enabled_legacy_sei_apis": releaseLegacyEVMAPIs, } -// TestRelease drives the release-validation scenario: provision a 4-validator +// TestRelease drives the release-validation flow: provision a 4-validator // chain + one EVM-serving RPC follower, generate a funded admin account, and run // the external release-test image against the RPC node as a Job. The release-test // image owns the functional assertions (TEST_TARGET=chain-agnostic); the suite's // job is to stand up the chain, hand the harness its endpoints + admin key, and -// gate on the Job's exit code. Replaces the Chaos-Mesh Workflow's keygen + -// provision + run steps with statement order + the SDK. +// gate on the Job's exit code. // // One RPC node (not the load suite's two) is deliberate: the harness runs // stateful EVM-filter and send-then-wait sequences that need one consistent @@ -176,8 +173,8 @@ func TestRelease(t *testing.T) { waitJob(ctx, t, cs, net.Namespace(), job.Name) // Archive the harness output even on success: exit 0 alone doesn't show which - // sub-cases ran, so a skip-but-pass is otherwise invisible (the scenario's - // upload-report served this; an S3 report is the deferred telemetry step). + // sub-cases ran, so a skip-but-pass is otherwise invisible. (A durable S3 + // report is a deferred telemetry step.) t.Logf("release-test job completed; harness log tail:\n%s", podLogTail(ctx, cs, net.Namespace(), job.Name)) // The chain stayed live through the release suite: the follower is still @@ -189,9 +186,8 @@ func TestRelease(t *testing.T) { } // createMnemonicSecret writes the admin mnemonic to a Secret the release-test pod -// reads via secretKeyRef. Labeled for the GC sweep; deleted on cleanup. (The -// seitask-runner stamps an ownerRef instead — the harness uses the run label + -// t.Cleanup, matching how it provisions everything else.) +// reads via secretKeyRef. Labeled for the GC sweep and deleted on cleanup, +// matching how the suite manages everything else it creates. func createMnemonicSecret( ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, name string, labels map[string]string, mnemonic string, @@ -222,9 +218,8 @@ type releaseParams struct { // releaseJob builds the release-test Job: the external harness image, fed the // chain endpoints + admin identity, run once (no retry) with a self-terminating -// deadline. Resources + ttl match the scenario's run-release-test step (which the -// nightly — an unenforced-PSS namespace — runs without a securityContext, so this -// stays faithful rather than imposing one the harness image may not tolerate). +// deadline. No securityContext: nightly is an unenforced-PSS namespace, so this +// avoids imposing one the harness image may not tolerate (it writes a keyring). func releaseJob(p releaseParams) *batchv1.Job { backoff := int32(0) deadline := int64(60 * 60) // the chain-agnostic harness runs >35m against one RPC node; generous cap @@ -246,10 +241,10 @@ func releaseJob(p releaseParams) *batchv1.Job { Containers: []corev1.Container{{ Name: "release-test", Image: p.image, - // The scenario projects the workflow-vars CM (RPC_*/CHAIN_ID/ - // ADMIN_ADDRESS) via envFrom ON TOP of the explicit SEI_* list; - // reproduce that superset so a harness sub-case reading e.g. - // RPC_EVM_RPC_LIST isn't silently unset (a skip-but-exit-0). + // The release-test image reads both the SEI_* names and the + // RPC_*/CHAIN_ID/ADMIN_ADDRESS names; provide both so a + // sub-case reading e.g. RPC_EVM_RPC_LIST isn't silently unset + // (which would skip-but-exit-0). Env: []corev1.EnvVar{ {Name: "TEST_TARGET", Value: "chain-agnostic"}, {Name: "SEI_CHAIN_ID", Value: p.chainID}, @@ -257,7 +252,7 @@ func releaseJob(p releaseParams) *batchv1.Job { {Name: "SEI_TENDERMINT_RPC", Value: p.tmRPC}, {Name: "SEI_EVM_JSON_RPC", Value: p.evmRPC}, {Name: "SEI_REST_ENDPOINT", Value: p.rest}, - // workflow-vars CM superset (the scenario's envFrom). + // The RPC_*/CHAIN_ID/ADMIN_ADDRESS aliases the image also reads. {Name: "CHAIN_ID", Value: p.chainID}, {Name: "ADMIN_ADDRESS", Value: p.adminAddr}, {Name: "RPC_TM_RPC", Value: p.tmRPC}, diff --git a/test/integration/upgrade_test.go b/test/integration/upgrade_test.go index 9add2d79..08e463a7 100644 --- a/test/integration/upgrade_test.go +++ b/test/integration/upgrade_test.go @@ -18,9 +18,9 @@ import ( "github.com/sei-protocol/sei-k8s-controller/sdk/sei" ) -// Gov tx parameters for the upgrade flow, ported from the major-upgrade scenario -// (scenarios/major-upgrade.yaml). usei-only; the deposit must clear the chain's -// min_deposit so the proposal enters voting immediately (not the deposit period). +// Gov tx parameters for the upgrade flow. usei-only; the deposit must clear the +// chain's min_deposit so the proposal enters voting immediately (not the deposit +// period). const ( upgradeDeposit = "20000000usei" govFees = "10000usei" @@ -42,9 +42,9 @@ const ( // haltPollMargin is how many blocks BEFORE the upgrade height the suite stops // polling: it polls the (load-balanced) aggregate RPC only while the chain is // still serving, then settles. At the halt itself every validator stops - // serving RPC simultaneously, so the halt height is unpollable (the scenario - // uses a fixed wait for the same reason) — polling to a pre-halt height keeps - // the endpoint alive while still confirming the chain is about to halt. + // serving RPC simultaneously, so the halt height is unpollable — polling to a + // pre-halt height keeps the endpoint alive while still confirming the chain is + // about to halt. haltPollMargin = 10 // haltSettle bounds the wait, after the chain reaches the pre-halt height, for // the remaining blocks to commit and every validator to halt at the upgrade @@ -64,8 +64,7 @@ var votingPeriodGenesis = map[string]string{ // upgradeConfig are the seid runtime overrides the upgrade flow needs: the REST // API serves the gov proposal queries (off by default), and kv tx-indexing lets -// the proposal-submission tx be found. Ported from the major-upgrade scenario's -// SeiNetwork configOverrides. +// the proposal-submission tx be found. var upgradeConfig = map[string]string{ "api.rest.enable": "true", "tx_index.indexer": "kv", @@ -75,13 +74,13 @@ var upgradeConfig = map[string]string{ const restUnreachable = "REST unreachable / non-200" // TestChainUpgrade drives a Sei major software upgrade end-to-end through the SDK -// task surface, replacing the Chaos-Mesh Workflow DAG: provision a 4-validator -// chain on the pre-upgrade image -> submit a GovSoftwareUpgrade proposal -> -// resolve its ID from the chain's gov REST (chain-as-medium, since the controller -// does not surface it as a task output) -> vote yes from every validator -> wait -// for it to pass -> let the chain halt at the upgrade height -> bump the -// SeiNetwork image to the post-upgrade build -> assert the upgrade handler ran -// and every validator resumed past the upgrade height. +// task surface: provision a 4-validator chain on the pre-upgrade image -> submit a +// GovSoftwareUpgrade proposal -> resolve its ID from the chain's gov REST +// (chain-as-medium, since the controller does not surface it as a task output) -> +// vote yes from every validator -> wait for it to pass -> let the chain halt at +// the upgrade height -> bump the SeiNetwork image to the post-upgrade build -> +// assert the upgrade handler ran and every validator resumed past the upgrade +// height. // // The upgrade height is scheduled far enough ahead (defaultUpgradeHeightDelta) // that the proposal passes before the chain reaches it; the upgrade-applied check @@ -388,7 +387,7 @@ func taskName(chainID, step string) string { // govProposal models just enough of a proposal to resolve an upgrade proposal by // its plan name and read its status. The legacy (v1beta1) shape carries the plan // at content.plan.name; the v1 shape carries it under messages[].content.plan.name -// — both are accepted, matching the scenario's resolver. +// — both are accepted. type govProposal struct { ProposalID string `json:"proposal_id"` Status string `json:"status"`