From f4c874ada2dc2a2320c17efba729be9bd15398c5 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 23 Jun 2026 13:40:41 -0700 Subject: [PATCH 1/5] chore: remove seitask runner + Chaos-Mesh Workflow scenarios (cutover) The Go-native integration harness (test/integration, one CronJob per target) has replaced the seitask-runner image + the Chaos-Mesh Workflow nightly. Delete the legacy machinery: - cmd/seitask/, internal/seitask/, internal/runner/, internal/taskruntime/, runner/ (the runner binary + its templates/RBAC), scenarios/ (the Workflow CR definitions). - the seitask image-build step in ecr.yml; the runner/runner-image/ runner-push Makefile targets + RUNNER_IMG (already stale paths). internal/keygen survives (general derivation, used by TestRelease); the SeiNodeTask CRD + its controller survive (driven by the SDK). go build, vet, test, and the integration build tag all pass with no dangling refs. Also drops the committed xreview ledgers (process artifacts, not code). Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ecr.yml | 15 - Makefile | 12 +- cmd/seitask/Dockerfile | 28 - cmd/seitask/keygen.go | 54 -- cmd/seitask/main.go | 74 --- cmd/seitask/main_test.go | 47 -- cmd/seitask/provision_node.go | 121 ---- cmd/seitask/provision_snd.go | 105 --- cmd/seitask/runner.go | 180 ----- cmd/seitask/upload_report.go | 86 --- internal/keygen/keygen.go | 4 +- internal/runner/apply.go | 230 ------- internal/runner/fanout.go | 73 --- internal/runner/orchestrate.go | 256 -------- internal/runner/output.go | 133 ---- internal/runner/patchtype.go | 7 - internal/runner/poll.go | 56 -- internal/runner/runner.go | 257 -------- internal/runner/runner_test.go | 564 ---------------- internal/seitask/keygen/keygen.go | 115 ---- internal/seitask/keygen/keygen_test.go | 171 ----- internal/seitask/provisionnode/provision.go | 415 ------------ .../seitask/provisionnode/provision_test.go | 618 ------------------ internal/seitask/provisionsnd/provision.go | 308 --------- .../seitask/provisionsnd/provision_test.go | 345 ---------- internal/seitask/uploadreport/upload.go | 183 ------ internal/seitask/uploadreport/upload_test.go | 229 ------- internal/taskruntime/cm.go | 84 --- internal/taskruntime/cm_test.go | 110 ---- internal/taskruntime/exit.go | 60 -- internal/taskruntime/exit_test.go | 32 - internal/taskruntime/ownerref.go | 94 --- internal/taskruntime/ownerref_test.go | 102 --- internal/taskruntime/scenarios_test.go | 96 --- internal/taskruntime/vars.go | 64 -- internal/taskruntime/vars_test.go | 46 -- runner/rbac.yaml | 94 --- runner/templates/await-condition.yaml.tmpl | 16 - .../templates/await-nodes-at-height.yaml.tmpl | 12 - .../templates/gov-software-upgrade.yaml.tmpl | 27 - runner/templates/gov-vote.yaml.tmpl | 19 - runner/templates/update-node-image.yaml.tmpl | 15 - scenarios/README.md | 264 -------- scenarios/load-test.yaml | 251 ------- scenarios/load-test/rpc.yaml.tmpl | 23 - scenarios/load-test/validator.yaml.tmpl | 16 - scenarios/major-upgrade.yaml | 575 ---------------- scenarios/major-upgrade/validator.yaml.tmpl | 14 - scenarios/release-test.yaml | 189 ------ scenarios/release-test/rpc.yaml.tmpl | 20 - scenarios/release-test/validator.yaml.tmpl | 17 - scenarios/testnet-deployment.yaml | 33 - sdk/sei/.xreview/sdk-task-surface.md | 46 -- test/integration/.xreview/release-suite.md | 40 -- test/integration/.xreview/upgrade-suite.md | 41 -- 55 files changed, 3 insertions(+), 7083 deletions(-) delete mode 100644 cmd/seitask/Dockerfile delete mode 100644 cmd/seitask/keygen.go delete mode 100644 cmd/seitask/main.go delete mode 100644 cmd/seitask/main_test.go delete mode 100644 cmd/seitask/provision_node.go delete mode 100644 cmd/seitask/provision_snd.go delete mode 100644 cmd/seitask/runner.go delete mode 100644 cmd/seitask/upload_report.go delete mode 100644 internal/runner/apply.go delete mode 100644 internal/runner/fanout.go delete mode 100644 internal/runner/orchestrate.go delete mode 100644 internal/runner/output.go delete mode 100644 internal/runner/patchtype.go delete mode 100644 internal/runner/poll.go delete mode 100644 internal/runner/runner.go delete mode 100644 internal/runner/runner_test.go delete mode 100644 internal/seitask/keygen/keygen.go delete mode 100644 internal/seitask/keygen/keygen_test.go delete mode 100644 internal/seitask/provisionnode/provision.go delete mode 100644 internal/seitask/provisionnode/provision_test.go delete mode 100644 internal/seitask/provisionsnd/provision.go delete mode 100644 internal/seitask/provisionsnd/provision_test.go delete mode 100644 internal/seitask/uploadreport/upload.go delete mode 100644 internal/seitask/uploadreport/upload_test.go delete mode 100644 internal/taskruntime/cm.go delete mode 100644 internal/taskruntime/cm_test.go delete mode 100644 internal/taskruntime/exit.go delete mode 100644 internal/taskruntime/exit_test.go delete mode 100644 internal/taskruntime/ownerref.go delete mode 100644 internal/taskruntime/ownerref_test.go delete mode 100644 internal/taskruntime/scenarios_test.go delete mode 100644 internal/taskruntime/vars.go delete mode 100644 internal/taskruntime/vars_test.go delete mode 100644 runner/rbac.yaml delete mode 100644 runner/templates/await-condition.yaml.tmpl delete mode 100644 runner/templates/await-nodes-at-height.yaml.tmpl delete mode 100644 runner/templates/gov-software-upgrade.yaml.tmpl delete mode 100644 runner/templates/gov-vote.yaml.tmpl delete mode 100644 runner/templates/update-node-image.yaml.tmpl delete mode 100644 scenarios/README.md delete mode 100644 scenarios/load-test.yaml delete mode 100644 scenarios/load-test/rpc.yaml.tmpl delete mode 100644 scenarios/load-test/validator.yaml.tmpl delete mode 100644 scenarios/major-upgrade.yaml delete mode 100644 scenarios/major-upgrade/validator.yaml.tmpl delete mode 100644 scenarios/release-test.yaml delete mode 100644 scenarios/release-test/rpc.yaml.tmpl delete mode 100644 scenarios/release-test/validator.yaml.tmpl delete mode 100644 scenarios/testnet-deployment.yaml delete mode 100644 sdk/sei/.xreview/sdk-task-surface.md delete mode 100644 test/integration/.xreview/release-suite.md delete mode 100644 test/integration/.xreview/upgrade-suite.md diff --git a/.github/workflows/ecr.yml b/.github/workflows/ecr.yml index ece11d73..b2786241 100644 --- a/.github/workflows/ecr.yml +++ b/.github/workflows/ecr.yml @@ -40,21 +40,6 @@ jobs: cache-from: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared cache-to: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared,mode=max - # Monolithic Workflow-Task primitive binary (keygen, provision-snd, - # runner, ...) per https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/test-harness/test-harness-lld.md. Published to - # sei/seitask-runner so scenarios keep the image-name muscle memory; - # the runner capability moves to args: ["runner", ...]. - - name: Build and push seitask image - uses: docker/build-push-action@v6 - with: - context: . - file: cmd/seitask/Dockerfile - push: true - platforms: linux/amd64 - tags: ${{ steps.ecr-login.outputs.registry }}/sei/seitask-runner:${{ inputs.tag || github.sha }} - cache-from: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared - cache-to: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared,mode=max - # The Go-native integration harness (go test -c -tags integration), run by # one CronJob per target (-test.run TestX). Replaces seitask-runner + the # Chaos-Mesh Workflow scenarios once the nightly CronJobs cut over. diff --git a/Makefile b/Makefile index 0ce83ae5..320f7dd7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,4 @@ IMG ?= sei-k8s-controller:latest -RUNNER_IMG ?= seitask-runner:latest GOLANGCI_LINT ?= $(shell which golangci-lint 2>/dev/null || echo $(HOME)/go/bin/golangci-lint) # Pinned tool versions. Bump together: setup-envtest's release branch tracks @@ -18,14 +17,11 @@ CONTROLLER_GEN_VERSION ?= v0.20.1 LOCALBIN ?= $(CURDIR)/bin SETUP_ENVTEST ?= $(LOCALBIN)/setup-envtest -.PHONY: build runner test test-integration test-all lint manifests generate verify-generated setup-envtest ci docker-build docker-push runner-image runner-push +.PHONY: build test test-integration test-all lint manifests generate verify-generated setup-envtest ci docker-build docker-push build: ## Build manager binary. go build -o bin/manager ./cmd/ -runner: ## Build seitask-runner binary. - go build -o bin/seitask-runner ./cmd/runner/ - test: ## Run tests. go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out @@ -69,9 +65,3 @@ docker-build: ## Build docker image. docker-push: ## Push docker image. docker push ${IMG} - -runner-image: ## Build seitask-runner container image. - docker build --platform linux/amd64 -t ${RUNNER_IMG} -f runner/Dockerfile . - -runner-push: ## Push seitask-runner container image. - docker push ${RUNNER_IMG} diff --git a/cmd/seitask/Dockerfile b/cmd/seitask/Dockerfile deleted file mode 100644 index 46a72dd8..00000000 --- a/cmd/seitask/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM golang:1.26 AS builder -ARG TARGETOS -ARG TARGETARCH - -WORKDIR /workspace -COPY go.mod go.mod -COPY go.sum go.sum -RUN go mod download - -COPY . . - -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ - go build -a -ldflags="-s -w" -o seitask ./cmd/seitask/ - -FROM gcr.io/distroless/static-debian12:nonroot -WORKDIR / -COPY --from=builder /workspace/seitask /seitask -# Runner templates (SeiNodeTask CRs) at the conventional mount path. -COPY --from=builder /workspace/runner/templates /templates -# Per-scenario SeiNetwork templates (consumed by provision-snd's --template). -# Add a COPY line per scenario; Workflow YAMLs reference paths under -# /scenarios//. -COPY --from=builder /workspace/scenarios/release-test /scenarios/release-test -COPY --from=builder /workspace/scenarios/load-test /scenarios/load-test -COPY --from=builder /workspace/scenarios/major-upgrade /scenarios/major-upgrade -USER 65532:65532 - -ENTRYPOINT ["/seitask"] diff --git a/cmd/seitask/keygen.go b/cmd/seitask/keygen.go deleted file mode 100644 index 6fcd5dfb..00000000 --- a/cmd/seitask/keygen.go +++ /dev/null @@ -1,54 +0,0 @@ -package main - -import ( - "context" - "log" - - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/keygen" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newKeygenCommand() *cli.Command { - return &cli.Command{ - Name: "keygen", - Usage: "Generate a fresh BIP-39 mnemonic and cosmos secp256k1 keypair, write it to a " + - "per-run Secret, and stamp the address into workflow-vars", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "key-name", - Aliases: []string{"k"}, - Usage: "Logical identity name; Secret is named -", - Sources: cli.EnvVars("KEY_NAME"), - Value: "admin", - Required: false, - }, - }, - Action: runKeygen, - } -} - -func runKeygen(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - res, err := keygen.Run(ctx, c, keygen.Params{ - KeyName: cmd.String("key-name"), - Workflow: wf, - }) - if err != nil { - // Stamp EXIT_REASON so upload-report can recover the failure class. - taskruntime.WriteExitReason(ctx, c, wf, err) - return err - } - taskruntime.WriteExitReason(ctx, c, wf, nil) - log.Printf("keygen: created Secret %q with address %s", res.SecretName, res.Address) - return nil -} diff --git a/cmd/seitask/main.go b/cmd/seitask/main.go deleted file mode 100644 index 6102c2c2..00000000 --- a/cmd/seitask/main.go +++ /dev/null @@ -1,74 +0,0 @@ -// Command seitask is the monolithic Workflow-Task primitive binary: one -// binary, multiple urfave/cli subcommands (keygen, provision-snd, -// provision-node, …) that -// share the internal/taskruntime shared library. See -// https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/test-harness/test-harness-lld.md. -package main - -import ( - "context" - "fmt" - "log" - "os" - "os/signal" - "syscall" - - "github.com/urfave/cli/v3" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// taskScheme is the controller-runtime client scheme for every seitask -// subcommand: builtin K8s types + sei.io/v1alpha1 (SeiNetwork, -// SeiNodeTask, SeiNode) so typed Create/Get round-trips work. Chaos Mesh -// CRs are read via unstructured so they're not registered here. -var taskScheme = func() *runtime.Scheme { - s := runtime.NewScheme() - utilruntime.Must(clientgoscheme.AddToScheme(s)) - utilruntime.Must(seiv1alpha1.AddToScheme(s)) - return s -}() - -func main() { - ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) - defer stop() - - app := &cli.Command{ - Name: "seitask", - Usage: "Workflow Task primitives for the sei-k8s-controller test harness", - Commands: []*cli.Command{ - newKeygenCommand(), - newProvisionSNDCommand(), - newProvisionNodeCommand(), - newRunnerCommand(), - newUploadReportCommand(), - }, - } - - if err := app.Run(ctx, os.Args); err != nil { - // Subcommands wrap with taskruntime.Infra / taskruntime.Task so this - // mapping reaches the right 0/1/2 exit code. - log.Printf("seitask: %v", err) - os.Exit(taskruntime.ExitCodeFor(err)) - } -} - -// kubeClientFromEnv uses standard ctrl.GetConfig discovery (in-cluster SA -// → $KUBECONFIG → ~/.kube/config). -func kubeClientFromEnv() (client.Client, error) { - cfg, err := ctrl.GetConfig() - if err != nil { - return nil, taskruntime.Infra(fmt.Errorf("loading kubeconfig: %w", err)) - } - c, err := client.New(cfg, client.Options{Scheme: taskScheme}) - if err != nil { - return nil, taskruntime.Infra(fmt.Errorf("building client: %w", err)) - } - return c, nil -} diff --git a/cmd/seitask/main_test.go b/cmd/seitask/main_test.go deleted file mode 100644 index 9505a5c7..00000000 --- a/cmd/seitask/main_test.go +++ /dev/null @@ -1,47 +0,0 @@ -package main - -import ( - "testing" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" -) - -const apiGroup = "sei.io" - -// TestTaskScheme_RoundTripsSeiNetwork would have caught the first manual fire's -// `no kind is registered for the type v1alpha1.SeiNetwork in scheme` -// regression at `go test`, not at first cluster fire. Asserts the -// package-level taskScheme has every type provision-snd / keygen / -// upload-report constructs via typed Get/Create. -func TestTaskScheme_RoundTripsSeiNetwork(t *testing.T) { - gvks, _, err := taskScheme.ObjectKinds(&seiv1alpha1.SeiNetwork{}) - if err != nil { - t.Fatalf("SeiNetwork not registered in taskScheme: %v", err) - } - if len(gvks) == 0 { - t.Fatalf("no GVKs returned for SeiNetwork") - } - if gvks[0].Group != apiGroup || gvks[0].Version != "v1alpha1" { - t.Fatalf("SeiNetwork GVK: %+v; want sei.io/v1alpha1", gvks[0]) - } -} - -func TestTaskScheme_RoundTripsSeiNode(t *testing.T) { - gvks, _, err := taskScheme.ObjectKinds(&seiv1alpha1.SeiNode{}) - if err != nil { - t.Fatalf("SeiNode not registered in taskScheme: %v", err) - } - if len(gvks) == 0 || gvks[0].Group != apiGroup || gvks[0].Version != "v1alpha1" { - t.Fatalf("SeiNode GVK wrong: %+v; want sei.io/v1alpha1", gvks) - } -} - -func TestTaskScheme_RoundTripsSeiNodeTask(t *testing.T) { - gvks, _, err := taskScheme.ObjectKinds(&seiv1alpha1.SeiNodeTask{}) - if err != nil { - t.Fatalf("SeiNodeTask not registered in taskScheme: %v", err) - } - if len(gvks) == 0 || gvks[0].Group != apiGroup { - t.Fatalf("SeiNodeTask GVK wrong: %+v", gvks) - } -} diff --git a/cmd/seitask/provision_node.go b/cmd/seitask/provision_node.go deleted file mode 100644 index 7edc1463..00000000 --- a/cmd/seitask/provision_node.go +++ /dev/null @@ -1,121 +0,0 @@ -package main - -import ( - "context" - "log" - "strings" - "time" - - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/provisionnode" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// Flag names shared across the template-rendering subcommands (provision-snd, -// provision-node, runner), declared once so goconst stays green. -const ( - flagTemplate = "template" - flagVar = "var" -) - -func newProvisionNodeCommand() *cli.Command { - return &cli.Command{ - Name: "provision-node", - Usage: "Fan out N standalone SeiNode followers from a template, wait for " + - "Running + per-node TM/EVM readiness, and publish role-scoped endpoints", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "role", - Usage: "Role tag for workflow-vars keys (e.g. rpc); uppercased to RPC_*", - Sources: cli.EnvVars("ROLE"), - Required: true, - }, - &cli.StringFlag{ - Name: "name", - Usage: "Base name; the N followers are -0..-(N-1) (defaults to -)", - Sources: cli.EnvVars("NODE_NAME"), - }, - &cli.StringFlag{ - Name: flagTemplate, - Usage: "Path to the Go text/template producing one kind: SeiNode YAML", - Sources: cli.EnvVars("NODE_TEMPLATE"), - Required: true, - }, - &cli.StringSliceFlag{ - Name: flagVar, - Usage: "KEY=VALUE substitution as .KEY (repeatable); .ORDINAL and .NODE_NAME are runtime-injected", - }, - &cli.IntFlag{ - Name: "replicas", - Usage: "N: number of follower SeiNode CRs to fan out", - Sources: cli.EnvVars("NODE_REPLICAS"), - Value: 1, - }, - &cli.StringFlag{ - Name: "network", - Usage: "Genesis SeiNetwork to follow; drives peer auto-wiring + the sei.io/seinetwork object label", - Sources: cli.EnvVars("NETWORK"), - }, - &cli.StringFlag{ - Name: "network-namespace", - Usage: "Namespace of the genesis SeiNetwork for the synthesized peer selector (defaults to the workflow namespace)", - }, - &cli.DurationFlag{ - Name: "running-timeout", - Usage: "Max wait for all N SeiNodes to reach status.phase=Running", - Value: 15 * time.Minute, - }, - &cli.DurationFlag{ - Name: "first-block-timeout", - Usage: "Per-node post-Running readiness budget (TM /status height>0 and EVM eth_blockNumber 200)", - Value: 5 * time.Minute, - }, - &cli.DurationFlag{ - Name: "poll-interval", - Usage: "Status + RPC poll cadence", - Value: 5 * time.Second, - }, - }, - Action: runProvisionNode, - } -} - -func runProvisionNode(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - vars, err := parseKVPairs(cmd.StringSlice(flagVar)) - if err != nil { - return err - } - - p := provisionnode.Params{ - Role: cmd.String("role"), - Name: cmd.String("name"), - TemplatePath: cmd.String(flagTemplate), - Vars: vars, - Replicas: cmd.Int("replicas"), - Network: cmd.String("network"), - NetworkNamespace: cmd.String("network-namespace"), - RunningTimeout: cmd.Duration("running-timeout"), - FirstBlockTimeout: cmd.Duration("first-block-timeout"), - PollInterval: cmd.Duration("poll-interval"), - Workflow: wf, - } - res, err := provisionnode.Run(ctx, c, p) - if err != nil { - taskruntime.WriteExitReason(ctx, c, wf, err) - return err - } - taskruntime.WriteExitReason(ctx, c, wf, nil) - log.Printf("provision-node: %d SeiNode(s) Running [%s], chainID=%s, EVM_RPC_LIST=%s", - len(res.Names), strings.Join(res.Names, ","), res.ChainID, res.EVMRPCList) - return nil -} diff --git a/cmd/seitask/provision_snd.go b/cmd/seitask/provision_snd.go deleted file mode 100644 index 794f75be..00000000 --- a/cmd/seitask/provision_snd.go +++ /dev/null @@ -1,105 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - "strings" - "time" - - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/provisionsnd" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newProvisionSNDCommand() *cli.Command { - return &cli.Command{ - Name: "provision-snd", - Usage: "Render a SeiNetwork template, apply it, wait for Ready + " + - "first block, and publish role-scoped endpoints to workflow-vars", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "role", - Usage: "Role tag for workflow-vars keys (e.g. validator, rpc)", - Sources: cli.EnvVars("ROLE"), - Required: true, - }, - &cli.StringFlag{ - Name: "name", - Usage: "SeiNetwork metadata.name (defaults to -)", - Sources: cli.EnvVars("SND_NAME"), - }, - &cli.StringFlag{ - Name: flagTemplate, - Usage: "Path to the Go text/template producing a SeiNetwork YAML", - Sources: cli.EnvVars("SND_TEMPLATE"), - Required: true, - }, - &cli.StringSliceFlag{ - Name: flagVar, - Usage: "KEY=VALUE substitution exposed to the template as .KEY (repeatable)", - }, - &cli.DurationFlag{ - Name: "ready-timeout", - Usage: "Max wait for status.phase=Ready", - Value: 15 * time.Minute, - }, - &cli.DurationFlag{ - Name: "first-block-timeout", - Usage: "Max post-Ready wait for the chain to produce its first block", - Value: 5 * time.Minute, - }, - }, - Action: runProvisionSND, - } -} - -func runProvisionSND(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - vars, err := parseKVPairs(cmd.StringSlice(flagVar)) - if err != nil { - return err - } - - p := provisionsnd.Params{ - Role: cmd.String("role"), - Name: cmd.String("name"), - TemplatePath: cmd.String(flagTemplate), - Vars: vars, - ReadyTimeout: cmd.Duration("ready-timeout"), - FirstBlockTimeout: cmd.Duration("first-block-timeout"), - Workflow: wf, - } - res, err := provisionsnd.Run(ctx, c, p) - if err != nil { - taskruntime.WriteExitReason(ctx, c, wf, err) - return err - } - taskruntime.WriteExitReason(ctx, c, wf, nil) - log.Printf("provision-snd: SeiNetwork %q Ready, chainID=%s, TM=%s", res.Name, res.ChainID, res.Endpoints.TendermintRpc) - return nil -} - -func parseKVPairs(pairs []string) (map[string]string, error) { - if len(pairs) == 0 { - return nil, nil - } - out := make(map[string]string, len(pairs)) - for _, kv := range pairs { - idx := strings.IndexByte(kv, '=') - if idx <= 0 { - return nil, fmt.Errorf("--var %q must be KEY=VALUE", kv) - } - out[kv[:idx]] = kv[idx+1:] - } - return out, nil -} diff --git a/cmd/seitask/runner.go b/cmd/seitask/runner.go deleted file mode 100644 index a550c436..00000000 --- a/cmd/seitask/runner.go +++ /dev/null @@ -1,180 +0,0 @@ -package main - -import ( - "context" - "fmt" - "os" - "strings" - "time" - - "github.com/urfave/cli/v3" - "k8s.io/client-go/dynamic" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - - "github.com/sei-protocol/sei-k8s-controller/internal/runner" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// newRunnerCommand wires the legacy seitask-runner CLI as a subcommand of -// the monolithic seitask binary. Flag names and semantics match the old -// standalone binary so scenario YAMLs only need to prepend "runner" to args. -// Implementation delegates to internal/runner unchanged. -func newRunnerCommand() *cli.Command { - return &cli.Command{ - Name: "runner", - Usage: "Apply a SeiNodeTask CR from a template and poll until terminal", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: flagTemplate, - Usage: "Path to the Go text/template producing a SeiNodeTask manifest (required)", - Required: true, - }, - &cli.StringSliceFlag{ - Name: flagVar, - Usage: "KEY=VALUE substitution exposed to the template as .KEY (repeatable)", - }, - &cli.StringSliceFlag{ - Name: "output-jsonpath", - Usage: "JSONPath=ENV_VAR extraction (repeatable)", - }, - &cli.StringFlag{ - Name: "output-env-file", - Usage: "File to append extracted KEY=value pairs to on Complete", - Value: "/workflow/vars/env.sh", - }, - &cli.StringFlag{ - Name: "env-file", - Usage: "Env file to source before render (defaults to /workflow/vars/env.sh when present)", - }, - &cli.DurationFlag{ - Name: "timeout", - Usage: "Total poll timeout per SeiNodeTask", - Value: 10 * time.Minute, - }, - &cli.DurationFlag{ - Name: "poll-interval", - Usage: "Cadence the runner re-reads status.phase", - Value: 5 * time.Second, - }, - &cli.StringFlag{ - Name: "namespace", - Usage: "Namespace to apply into (defaults to the SA's namespace)", - }, - &cli.StringFlag{ - Name: "per-node-selector", - Usage: "Label selector for fan-out over SeiNodes. Empty = single-node mode", - }, - &cli.StringFlag{ - Name: "fanout-mode", - Usage: "all-must-succeed | best-effort | quorum:N", - Value: "all-must-succeed", - }, - &cli.StringFlag{ - Name: "kubeconfig", - Usage: "Path to kubeconfig (defaults to in-cluster config)", - }, - }, - Action: runRunner, - } -} - -func runRunner(ctx context.Context, cmd *cli.Command) error { - varMap, err := parseKVSlice(cmd.StringSlice(flagVar)) - if err != nil { - return err - } - if cmd.String("per-node-selector") != "" { - if _, ok := varMap["NODE"]; ok { - return fmt.Errorf("--per-node-selector is incompatible with --var NODE=...; the runner sets .NODE per match") - } - } - ns, err := resolveRunnerNamespace(cmd.String("namespace")) - if err != nil { - return err - } - cfg, err := loadRunnerKubeConfig(cmd.String("kubeconfig")) - if err != nil { - return fmt.Errorf("load kube config: %w", err) - } - dyn, err := dynamic.NewForConfig(cfg) - if err != nil { - return fmt.Errorf("create dynamic client: %w", err) - } - - // Load the parent Workflow's identity so applied SeiNodeTask CRs - // carry an ownerRef to it — deleting the Workflow then cascades the - // per-step SeiNodeTasks. Matches the keygen / provision-snd pattern. - cliClient, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, cliClient) - if err != nil { - return err - } - ownerRef := wf.OwnerRef() - - r := &runner.Run{ - Opts: runner.Options{ - TemplatePath: cmd.String(flagTemplate), - Vars: varMap, - OutputJSONPaths: cmd.StringSlice("output-jsonpath"), - OutputEnvFile: cmd.String("output-env-file"), - EnvFile: cmd.String("env-file"), - Timeout: cmd.Duration("timeout"), - PollInterval: cmd.Duration("poll-interval"), - Namespace: ns, - PerNodeSelector: cmd.String("per-node-selector"), - FanoutMode: cmd.String("fanout-mode"), - }, - Stdout: os.Stdout, - Stderr: os.Stderr, - Renderer: runner.DefaultRenderer{OwnerRef: &ownerRef}, - Applier: runner.DynamicApplier{Client: dyn}, - Poller: runner.DynamicPoller{Client: dyn}, - Lister: runner.DynamicNodeLister{Client: dyn}, - Sourcer: runner.FileEnvSourcer{}, - Writer: runner.FileEnvWriter{}, - } - return r.Execute(ctx) -} - -func parseKVSlice(in []string) (map[string]string, error) { - out := map[string]string{} - for _, v := range in { - idx := strings.IndexByte(v, '=') - if idx <= 0 { - return nil, fmt.Errorf("--var %q must be KEY=VALUE", v) - } - out[v[:idx]] = v[idx+1:] - } - return out, nil -} - -// resolveRunnerNamespace falls back to the in-pod SA namespace file when -// --namespace is empty, matching kubectl's resolution order. -func resolveRunnerNamespace(flagNS string) (string, error) { - if flagNS != "" { - return flagNS, nil - } - const saNS = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" - b, err := os.ReadFile(saNS) - if err == nil { - return strings.TrimSpace(string(b)), nil - } - return "", fmt.Errorf("--namespace not provided and SA namespace file unreadable: %w", err) -} - -func loadRunnerKubeConfig(kubeconfig string) (*rest.Config, error) { - if kubeconfig != "" { - return clientcmd.BuildConfigFromFlags("", kubeconfig) - } - if cfg, err := rest.InClusterConfig(); err == nil { - return cfg, nil - } - return clientcmd.NewNonInteractiveDeferredLoadingClientConfig( - clientcmd.NewDefaultClientConfigLoadingRules(), - &clientcmd.ConfigOverrides{}, - ).ClientConfig() -} diff --git a/cmd/seitask/upload_report.go b/cmd/seitask/upload_report.go deleted file mode 100644 index f4c4760d..00000000 --- a/cmd/seitask/upload_report.go +++ /dev/null @@ -1,86 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - - "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/urfave/cli/v3" - - "github.com/sei-protocol/sei-k8s-controller/internal/seitask/uploadreport" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newUploadReportCommand() *cli.Command { - return &cli.Command{ - Name: "upload-report", - Usage: "Upload Workflow resource snapshot (workflow-vars, Workflow CR, WorkflowNode tree) " + - "to S3; exit code mirrors the EXIT_REASON workflow-vars key. Pod logs are not uploaded; " + - "Loki already ingests them.", - Flags: []cli.Flag{ - &cli.StringFlag{ - Name: "bucket", - Usage: "S3 bucket to upload to", - Sources: cli.EnvVars("S3_BUCKET"), - Required: true, - }, - &cli.StringFlag{ - Name: "prefix", - Usage: "S3 key prefix (typically ${NAMESPACE}/${SCENARIO}/${RUN_ID})", - Sources: cli.EnvVars("S3_PREFIX"), - Required: true, - }, - &cli.StringFlag{ - Name: "region", - Usage: "AWS region", - Sources: cli.EnvVars("AWS_REGION"), - Value: "eu-central-1", - }, - }, - Action: runUploadReport, - } -} - -func runUploadReport(ctx context.Context, cmd *cli.Command) error { - c, err := kubeClientFromEnv() - if err != nil { - return err - } - wf, err := taskruntime.LoadWorkflowIdentity(ctx, c) - if err != nil { - return err - } - - awsCfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(cmd.String("region"))) - if err != nil { - return taskruntime.Infra(fmt.Errorf("loading AWS config: %w", err)) - } - s3client := s3.NewFromConfig(awsCfg) - - // upload-report is the terminal observer — never writes EXIT_REASON - // itself. An infra-fail in the upload would otherwise overwrite a - // genuine upstream task-fail and lose the underlying classification. - res, err := uploadreport.Run(ctx, c, uploadreport.Params{ - Bucket: cmd.String("bucket"), - Prefix: cmd.String("prefix"), - Workflow: wf, - S3: uploadreport.NewS3Uploader(s3client), - }) - if err != nil { - return err - } - log.Printf("upload-report: uploaded %d artifacts; upstream exit-reason=%s", - len(res.UploadedKeys), res.ExitReason) - - // Mirror upstream verdict so the Workflow's terminal phase reflects - // scenario outcome rather than upload-step success. - switch res.ExitReason { - case taskruntime.ExitReasonInfraFail: - return taskruntime.Infra(fmt.Errorf("upstream task reported infra-fail")) - case taskruntime.ExitReasonTaskFail: - return taskruntime.Task(fmt.Errorf("upstream task reported task-fail")) - } - return nil -} diff --git a/internal/keygen/keygen.go b/internal/keygen/keygen.go index a9c9e250..1c715286 100644 --- a/internal/keygen/keygen.go +++ b/internal/keygen/keygen.go @@ -6,8 +6,8 @@ // keyring. // // This is the general, k8s-free derivation primitive. Callers that need to stamp -// the result into a Secret / workflow-vars layer sit on top of it — see -// internal/seitask/keygen for the seitask-runner's Secret writer. +// the result into a Secret layer it on top (the integration harness writes a +// per-run Secret the release-test pod reads via secretKeyRef). package keygen import ( diff --git a/internal/runner/apply.go b/internal/runner/apply.go deleted file mode 100644 index 417fcd80..00000000 --- a/internal/runner/apply.go +++ /dev/null @@ -1,230 +0,0 @@ -package runner - -import ( - "bytes" - "context" - "crypto/sha256" - "encoding/hex" - "fmt" - "os" - "sort" - "strings" - "text/template" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/dynamic" - "sigs.k8s.io/yaml" -) - -const ( - // FieldOwner is the field-manager string used for server-side apply. - // Distinct from "seinode-task-controller" so apply diffs are attributable - // to the runner versus the reconciler. - FieldOwner = "seitask-runner" - - // shortHashLen is the number of hex chars taken from the SHA-256 of the - // (kind|vars|node) tuple to form the metadata.name suffix. - shortHashLen = 10 -) - -// DefaultRenderer renders Go text/template files. The resulting manifest is -// parsed back to assert it is a SeiNodeTask, and metadata.name is -// rewritten to a deterministic value derived from (kind, vars, NODE) so -// re-applies hit the same CR. When OwnerRef is non-nil, it replaces (not -// merges) ownerReferences so the rendered SeiNodeTask cascades on parent -// Workflow deletion. -type DefaultRenderer struct { - // OwnerRef, when non-nil, is stamped onto the rendered manifest as - // the sole entry of metadata.ownerReferences. The runner subcommand - // populates it from taskruntime.LoadWorkflowIdentity at startup. - OwnerRef *metav1.OwnerReference -} - -// Render parses templatePath as a Go text/template and executes it against -// vars. The template author can use {{ .NODE }}, {{ .PROPOSAL_ID }}, etc. -// All keys from vars are exposed as top-level fields with .KEY. -// -// After rendering, the metadata.name is replaced with -// "--" (NODE omitted if empty), where the hash -// covers the template content + sorted vars. This guarantees re-applies -// with identical inputs target the same CR (Workflow restart idempotency). -func (r DefaultRenderer) Render(templatePath string, vars map[string]string) ([]byte, string, error) { - raw, err := os.ReadFile(templatePath) //nolint:gosec // path is operator-controlled CLI arg - if err != nil { - return nil, "", fmt.Errorf("read template: %w", err) - } - return RenderBytes(templatePath, raw, vars, r.OwnerRef) -} - -// RenderBytes is the byte-input variant of Render, exposed for tests. -// When ownerRef is non-nil, it replaces (not merges) ownerReferences on -// the rendered manifest. -func RenderBytes(name string, raw []byte, vars map[string]string, ownerRef *metav1.OwnerReference) ([]byte, string, error) { - tmpl, err := template.New(name). - Option("missingkey=error"). - Parse(string(raw)) - if err != nil { - return nil, "", fmt.Errorf("parse template: %w", err) - } - var buf bytes.Buffer - if err := tmpl.Execute(&buf, vars); err != nil { - return nil, "", fmt.Errorf("execute template: %w", err) - } - - obj := &unstructured.Unstructured{} - if err := yaml.Unmarshal(buf.Bytes(), &obj.Object); err != nil { - return nil, "", fmt.Errorf("parse rendered manifest: %w", err) - } - if obj.GetKind() != "SeiNodeTask" { - return nil, "", fmt.Errorf("rendered manifest is %s, want SeiNodeTask", obj.GetKind()) - } - - // Discover the spec.kind so the deterministic name carries the - // per-kind prefix. - specKind, _, err := unstructured.NestedString(obj.Object, "spec", "kind") - if err != nil || specKind == "" { - return nil, "", fmt.Errorf("spec.kind missing on rendered manifest") - } - - deterministic := DeterministicName(specKind, vars, raw) - obj.SetName(deterministic) - - // Replace (not merge) ownerReferences so a template that smuggles a - // bogus ref can't leak through. Mirrors provisionsnd.stampMetadata. - if ownerRef != nil { - obj.SetOwnerReferences([]metav1.OwnerReference{*ownerRef}) - } - - out, err := yaml.Marshal(obj.Object) - if err != nil { - return nil, "", fmt.Errorf("re-marshal manifest: %w", err) - } - return out, deterministic, nil -} - -// DeterministicName produces a stable metadata.name from -// (spec.kind, vars, template-content). Format: -// -// [-]-<10-hex> -// -// NODE is included as a human-readable infix when present in vars; the hash -// alone already provides uniqueness (template content + sorted vars), so the -// infix is purely operator ergonomics for `kubectl get snt`. -func DeterministicName(specKind string, vars map[string]string, templateContent []byte) string { - keys := make([]string, 0, len(vars)) - for k := range vars { - keys = append(keys, k) - } - sort.Strings(keys) - - h := sha256.New() - h.Write(templateContent) - for _, k := range keys { - h.Write([]byte{0}) - h.Write([]byte(k)) - h.Write([]byte{'='}) - h.Write([]byte(vars[k])) - } - sum := hex.EncodeToString(h.Sum(nil))[:shortHashLen] - - prefix := kebab(specKind) - parts := []string{prefix} - if node := vars["NODE"]; node != "" { - parts = append(parts, sanitizeForDNS(node)) - } - parts = append(parts, sum) - return strings.Join(parts, "-") -} - -// kebab converts CamelCase to kebab-case (GovSoftwareUpgrade -> -// gov-software-upgrade). Plain ASCII only. -func kebab(s string) string { - var b strings.Builder - for i, r := range s { - switch { - case r >= 'A' && r <= 'Z': - if i > 0 { - b.WriteByte('-') - } - b.WriteRune(r + ('a' - 'A')) - default: - b.WriteRune(r) - } - } - return b.String() -} - -// sanitizeForDNS replaces characters that aren't DNS-1123 label safe with -// '-'. Truncates to 40 chars to keep the final name under K8s' 253-byte -// resource name limit even with a long kind prefix. -func sanitizeForDNS(s string) string { - const maxLen = 40 - b := make([]byte, 0, len(s)) - for i := 0; i < len(s); i++ { - c := s[i] - switch { - case c >= 'a' && c <= 'z', c >= '0' && c <= '9', c == '-': - b = append(b, c) - case c >= 'A' && c <= 'Z': - b = append(b, c+('a'-'A')) - default: - b = append(b, '-') - } - } - if len(b) > maxLen { - b = b[:maxLen] - } - // Trim trailing '-' so the joined name doesn't end with "--". - for len(b) > 0 && b[len(b)-1] == '-' { - b = b[:len(b)-1] - } - return string(b) -} - -// DynamicApplier implements Applier against the K8s dynamic client. -// Server-side apply is used so re-applies are no-ops at the apiserver level. -type DynamicApplier struct { - Client dynamic.Interface -} - -// SeiNodeTaskGVR is the GroupVersionResource for SeiNodeTask. -var SeiNodeTaskGVR = schema.GroupVersionResource{ - Group: "sei.io", - Version: "v1alpha1", - Resource: "seinodetasks", -} - -// Apply performs server-side apply of the rendered manifest. The manifest -// must already carry metadata.name; the namespace is taken from the runner's -// pod namespace. -func (a DynamicApplier) Apply(ctx context.Context, namespace string, manifest []byte) error { - obj := &unstructured.Unstructured{} - if err := yaml.Unmarshal(manifest, &obj.Object); err != nil { - return fmt.Errorf("parse manifest for apply: %w", err) - } - obj.SetNamespace(namespace) - - data, err := obj.MarshalJSON() - if err != nil { - return fmt.Errorf("marshal manifest for apply: %w", err) - } - force := true - _, err = a.Client. - Resource(SeiNodeTaskGVR). - Namespace(namespace). - Patch(ctx, obj.GetName(), apiTypesApplyPatch, data, metav1.PatchOptions{ - FieldManager: FieldOwner, - Force: &force, - }) - if err != nil && !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("apply SeiNodeTask %q: %w", obj.GetName(), err) - } - return nil -} - -// apiTypesApplyPatch is a local alias so we don't pull k8s.io/apimachinery/pkg/types -// in the type signature of Apply (kept on a separate line for grep-ability). -var apiTypesApplyPatch = applyPatchTypeMarker() diff --git a/internal/runner/fanout.go b/internal/runner/fanout.go deleted file mode 100644 index 22aa8ecd..00000000 --- a/internal/runner/fanout.go +++ /dev/null @@ -1,73 +0,0 @@ -package runner - -import ( - "context" - "fmt" - "maps" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/dynamic" -) - -// SeiNodeGVR is the GroupVersionResource for SeiNode. -var SeiNodeGVR = schema.GroupVersionResource{ - Group: "sei.io", - Version: "v1alpha1", - Resource: "seinodes", -} - -// DynamicNodeLister lists SeiNodes by label selector. -type DynamicNodeLister struct { - Client dynamic.Interface -} - -// List returns the names of all SeiNodes in namespace matching selector. -// Empty selector means list-all. Names are returned in apiserver order -// (typically by creation timestamp); the runner does not stably sort because -// the fanout policies are insensitive to ordering. -func (l DynamicNodeLister) List(ctx context.Context, namespace, selector string) ([]string, error) { - opts := metav1.ListOptions{} - if selector != "" { - opts.LabelSelector = selector - } - list, err := l.Client.Resource(SeiNodeGVR).Namespace(namespace).List(ctx, opts) - if err != nil { - return nil, fmt.Errorf("list SeiNodes (selector=%q): %w", selector, err) - } - names := make([]string, 0, len(list.Items)) - for _, item := range list.Items { - names = append(names, item.GetName()) - } - return names, nil -} - -// FanoutTarget is one rendered SeiNodeTask in a fan-out batch. -type FanoutTarget struct { - // Node is the SeiNode name that .NODE was set to during render. - Node string - // Name is the deterministic metadata.name the runner applied. - Name string - // Manifest is the rendered SeiNodeTask manifest. - Manifest []byte -} - -// RenderFanout produces one FanoutTarget per node in nodes, varying only -// the .NODE template variable. baseVars is shared across all renders. -func RenderFanout(r Renderer, templatePath string, baseVars map[string]string, nodes []string) ([]FanoutTarget, error) { - if len(nodes) == 0 { - return nil, fmt.Errorf("fanout: selector matched zero SeiNodes") - } - out := make([]FanoutTarget, 0, len(nodes)) - for _, n := range nodes { - vars := make(map[string]string, len(baseVars)+1) - maps.Copy(vars, baseVars) - vars["NODE"] = n - manifest, name, err := r.Render(templatePath, vars) - if err != nil { - return nil, fmt.Errorf("render for node %q: %w", n, err) - } - out = append(out, FanoutTarget{Node: n, Name: name, Manifest: manifest}) - } - return out, nil -} diff --git a/internal/runner/orchestrate.go b/internal/runner/orchestrate.go deleted file mode 100644 index a02ab0d6..00000000 --- a/internal/runner/orchestrate.go +++ /dev/null @@ -1,256 +0,0 @@ -package runner - -import ( - "context" - "errors" - "fmt" - "sync" - "time" -) - -// Execute runs the full single-or-fanout orchestration for r and returns -// nil on success or an error suitable for printing to stderr + exit 1. -// -//nolint:gocyclo // single linear orchestration; splitting hurts readability -func (r *Run) Execute(ctx context.Context) error { - if r.Now == nil { - r.Now = time.Now - } - - if err := r.sourceEnv(); err != nil { - return fmt.Errorf("source env file: %w", err) - } - - policy, err := ParseFanoutMode(r.Opts.FanoutMode) - if err != nil { - return err - } - - targets, err := r.buildTargets(ctx) - if err != nil { - return err - } - - // Apply all targets up-front so they reconcile in parallel. - for _, t := range targets { - if err := r.Applier.Apply(ctx, r.Opts.Namespace, t.Manifest); err != nil { - return fmt.Errorf("apply %s (node=%s): %w", t.Name, t.Node, err) - } - _, _ = fmt.Fprintf(r.Stdout, "applied SeiNodeTask %s (node=%s)\n", t.Name, t.Node) - } - - // Single-node short-circuit avoids the goroutine-per-target overhead and - // preserves the natural exit-code semantics (failureReason on the single - // task is the runner's exit error). - if len(targets) == 1 { - return r.pollSingle(ctx, targets[0]) - } - return r.pollFanout(ctx, targets, policy) -} - -func (r *Run) sourceEnv() error { - path := r.Opts.EnvFile - if path == "" { - path = "/workflow/vars/env.sh" - } - pairs, err := r.Sourcer.Source(path) - if err != nil { - return err - } - // Sourced values are merged into Vars only if not already set on the CLI; - // CLI --var takes precedence so an explicit override at the Workflow step - // level wins over a stale env-file value. - if r.Opts.Vars == nil { - r.Opts.Vars = map[string]string{} - } - for k, v := range pairs { - if _, set := r.Opts.Vars[k]; !set { - r.Opts.Vars[k] = v - } - } - return nil -} - -func (r *Run) buildTargets(ctx context.Context) ([]FanoutTarget, error) { - // Single-node mode: render once with whatever NODE the operator passed via --var. - if r.Opts.PerNodeSelector == "" { - manifest, name, err := r.Renderer.Render(r.Opts.TemplatePath, r.Opts.Vars) - if err != nil { - return nil, fmt.Errorf("render: %w", err) - } - node := r.Opts.Vars["NODE"] - return []FanoutTarget{{Node: node, Name: name, Manifest: manifest}}, nil - } - // Fan-out: discover SeiNodes by selector, render one CR per match. - nodes, err := r.Lister.List(ctx, r.Opts.Namespace, r.Opts.PerNodeSelector) - if err != nil { - return nil, err - } - return RenderFanout(r.Renderer, r.Opts.TemplatePath, r.Opts.Vars, nodes) -} - -func (r *Run) pollSingle(ctx context.Context, t FanoutTarget) error { - pollCtx, cancel := context.WithTimeout(ctx, r.Opts.Timeout) - defer cancel() - phase, obj, reason, err := r.Poller.Poll(pollCtx, r.Opts.Namespace, t.Name, r.Opts.PollInterval) - if err != nil { - return err - } - if phase == PhaseFailed { - return fmt.Errorf("SeiNodeTask %s failed: %s", t.Name, reason) - } - return r.writeOutputs(obj) -} - -func (r *Run) pollFanout(ctx context.Context, targets []FanoutTarget, policy FanoutPolicy) error { - pollCtx, cancel := context.WithTimeout(ctx, r.Opts.Timeout) - defer cancel() - - type result struct { - idx int - outcome Outcome - obj map[string]any - reason string - err error - } - - // Goroutine-per-target: each loops on Poll until its task is terminal or - // the shared deadline cancels. We aggregate decisions as results arrive. - resultCh := make(chan result, len(targets)) - var wg sync.WaitGroup - for i, t := range targets { - wg.Add(1) - go func(idx int, target FanoutTarget) { - defer wg.Done() - phase, obj, reason, err := r.Poller.Poll(pollCtx, r.Opts.Namespace, target.Name, r.Opts.PollInterval) - out := OutcomeUnknown - switch { - case err != nil: - // Treat poll error (incl. deadline) as failed outcome with the underlying error message. - out = OutcomeFailed - if reason == "" { - reason = err.Error() - } - case phase == PhaseComplete: - out = OutcomeComplete - case phase == PhaseFailed: - out = OutcomeFailed - } - resultCh <- result{idx: idx, outcome: out, obj: obj, reason: reason, err: err} - }(i, t) - } - go func() { wg.Wait(); close(resultCh) }() - - outcomes := make([]Outcome, 0, len(targets)) - objects := make([]map[string]any, len(targets)) - var firstFailure string - - for res := range resultCh { - outcomes = append(outcomes, res.outcome) - if res.obj != nil { - objects[res.idx] = res.obj - } - if res.outcome == OutcomeFailed && firstFailure == "" { - firstFailure = fmt.Sprintf("%s: %s", targets[res.idx].Name, res.reason) - } - _, _ = fmt.Fprintf(r.Stdout, "fanout %s (node=%s): %s\n", targets[res.idx].Name, targets[res.idx].Node, outcomeLabel(res.outcome)) - - done, ok := policy.Decide(outcomes, len(targets)) - if done { - // Early-exit: cancel in-flight pollers; they'll surface as failed/unknown - // and be discarded (we already have a verdict). - cancel() - //nolint:revive // intentionally drain the channel to let the WaitGroup complete cleanly - for range resultCh { - } - if !ok { - if firstFailure == "" { - firstFailure = "fanout policy not satisfied" - } - return errors.New(firstFailure) - } - return r.writeFanoutOutputs(objects) - } - } - - // Channel drained without an early verdict (all outcomes in but Decide - // kept saying "not done"). That is a logic bug — Decide must terminate - // once total entries are seen. - done, ok := policy.Decide(outcomes, len(targets)) - if !done { - return fmt.Errorf("fanout policy %q failed to terminate with %d/%d outcomes (internal bug)", r.Opts.FanoutMode, len(outcomes), len(targets)) - } - if !ok { - if firstFailure == "" { - firstFailure = "fanout policy not satisfied" - } - return errors.New(firstFailure) - } - return r.writeFanoutOutputs(objects) -} - -func outcomeLabel(o Outcome) string { - switch o { - case OutcomeComplete: - return "Complete" - case OutcomeFailed: - return "Failed" - default: - return "Pending" - } -} - -func (r *Run) writeOutputs(obj map[string]any) error { - if obj == nil || len(r.Opts.OutputJSONPaths) == 0 { - return nil - } - kvs, err := ExtractOutputs(r.Opts.OutputJSONPaths, obj) - if err != nil { - return fmt.Errorf("extract outputs: %w", err) - } - if r.Opts.OutputEnvFile == "" { - // Nowhere to write — emit to stdout as a fallback so the runner is - // still useful when invoked standalone. - for _, kv := range kvs { - _, _ = fmt.Fprintf(r.Stdout, "%s=%s\n", kv.Key, kv.Value) - } - return nil - } - return r.Writer.Append(r.Opts.OutputEnvFile, kvs) -} - -// writeFanoutOutputs writes outputs from the *first* Complete object only. -// Fan-out jsonpath extraction across N nodes is undefined in the LLD — the -// canonical use case (UpdateNodeImage fanout) yields the same appliedImage -// per node, so first-Complete is a reasonable convention. -// -// If operators need per-node aggregation later, the right primitive is a -// separate --output-jsonpath-fanout flag with an explicit aggregation policy -// (join, json-array, max). Punted until a scenario demands it. -func (r *Run) writeFanoutOutputs(objects []map[string]any) error { - for _, obj := range objects { - if obj == nil { - continue - } - if nestedString(obj, "status", "phase") == PhaseComplete { - return r.writeOutputs(obj) - } - } - return nil -} - -// nestedString is a small map walker that returns the string value at the -// given key path, or "" if any intermediate node is missing or non-string. -// Kept inline so this file doesn't pull apimachinery just for one helper. -func nestedString(obj map[string]any, fields ...string) string { - var cur any = obj - for _, f := range fields { - m, ok := cur.(map[string]any) - if !ok { - return "" - } - cur = m[f] - } - s, _ := cur.(string) - return s -} diff --git a/internal/runner/output.go b/internal/runner/output.go deleted file mode 100644 index a1bdf448..00000000 --- a/internal/runner/output.go +++ /dev/null @@ -1,133 +0,0 @@ -package runner - -import ( - "bufio" - "bytes" - "fmt" - "os" - "strings" - - "k8s.io/client-go/util/jsonpath" -) - -// ExtractOutputs evaluates each "=" spec against obj and -// returns the env-var assignments. JSONPath syntax is the standard -// k8s.io/client-go/util/jsonpath (the same used by `kubectl get -o jsonpath`). -// -// Missing fields are not errors — the resulting KV is omitted. This is load- -// bearing for sidecar-backed kinds whose status.outputs. is empty in MVP -// (see LLD: "No structured outputs from sidecar-backed kinds"); a runner step -// that targets such an output should not fail just because the field is -// missing — the Workflow author can decide whether downstream steps need it. -func ExtractOutputs(specs []string, obj map[string]any) ([]KV, error) { - if len(specs) == 0 { - return nil, nil - } - out := make([]KV, 0, len(specs)) - for _, spec := range specs { - pathRaw, envVarRaw, ok := strings.Cut(spec, "=") - if !ok { - return nil, fmt.Errorf("output-jsonpath %q missing '=ENV_VAR' suffix", spec) - } - path := strings.TrimSpace(pathRaw) - envVar := strings.TrimSpace(envVarRaw) - if path == "" || envVar == "" { - return nil, fmt.Errorf("output-jsonpath %q has empty path or env var", spec) - } - val, ok, err := evalJSONPath(path, obj) - if err != nil { - return nil, fmt.Errorf("evaluate %q: %w", path, err) - } - if !ok { - continue - } - out = append(out, KV{Key: envVar, Value: val}) - } - return out, nil -} - -func evalJSONPath(path string, obj map[string]any) (string, bool, error) { - jp := jsonpath.New("output").AllowMissingKeys(true) - // kubectl's syntax accepts the leading dot (".status.phase"); the jsonpath - // lib wants "{.status.phase}". Normalize. - expr := path - if !strings.HasPrefix(expr, "{") { - expr = "{" + expr + "}" - } - if err := jp.Parse(expr); err != nil { - return "", false, err - } - var buf bytes.Buffer - if err := jp.Execute(&buf, obj); err != nil { - return "", false, err - } - s := strings.TrimSpace(buf.String()) - if s == "" { - return "", false, nil - } - return s, true, nil -} - -// FileEnvSourcer loads KEY=VALUE pairs from a shell-style env file. Lines -// starting with '#' and blank lines are skipped. Quoted values are not -// unquoted — the runner's writes are unquoted by construction, so any quoting -// the operator authored is preserved verbatim. -type FileEnvSourcer struct{} - -// Source reads KEY=VALUE pairs from path. Returns (nil, nil) when path -// doesn't exist — missing env files are not an error (first Workflow step -// has nothing to source). -func (FileEnvSourcer) Source(path string) (map[string]string, error) { - f, err := os.Open(path) //nolint:gosec // path is operator-controlled CLI arg - if err != nil { - if os.IsNotExist(err) { - return nil, nil - } - return nil, err - } - defer func() { _ = f.Close() }() - - out := map[string]string{} - scanner := bufio.NewScanner(f) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if line == "" || strings.HasPrefix(line, "#") { - continue - } - // Strip a leading "export " so files written by other tools are accepted. - line = strings.TrimPrefix(line, "export ") - idx := strings.IndexByte(line, '=') - if idx <= 0 { - continue - } - out[line[:idx]] = line[idx+1:] - } - return out, scanner.Err() -} - -// FileEnvWriter appends KEY=value lines to an env file. Parent directories -// must already exist (Chaos Mesh Workflow mounts the emptyDir for us). -type FileEnvWriter struct{} - -// Append appends KEY=value lines to path. The file is opened in append mode -// and flushed on close. Values are not quoted; values containing whitespace -// or shell metacharacters survive the runner's `source` because the runner -// reads with a line parser, not a shell. Operators wiring to a real -// `source` should keep values free of shell metacharacters — by convention, -// SeiNodeTask outputs (txHash, height, image) are. -func (FileEnvWriter) Append(path string, kv []KV) error { - if len(kv) == 0 { - return nil - } - f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) //nolint:gosec // path is operator-controlled CLI arg - if err != nil { - return err - } - defer func() { _ = f.Close() }() - for _, p := range kv { - if _, err := fmt.Fprintf(f, "%s=%s\n", p.Key, p.Value); err != nil { - return err - } - } - return nil -} diff --git a/internal/runner/patchtype.go b/internal/runner/patchtype.go deleted file mode 100644 index 464c87d9..00000000 --- a/internal/runner/patchtype.go +++ /dev/null @@ -1,7 +0,0 @@ -package runner - -import "k8s.io/apimachinery/pkg/types" - -// applyPatchTypeMarker returns the patch type used for server-side apply. -// Split out so apply.go's imports stay tight. -func applyPatchTypeMarker() types.PatchType { return types.ApplyPatchType } diff --git a/internal/runner/poll.go b/internal/runner/poll.go deleted file mode 100644 index 04ea0413..00000000 --- a/internal/runner/poll.go +++ /dev/null @@ -1,56 +0,0 @@ -package runner - -import ( - "context" - "fmt" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/client-go/dynamic" -) - -// DynamicPoller polls SeiNodeTask.status.phase via the dynamic client. -type DynamicPoller struct { - Client dynamic.Interface -} - -// Poll re-reads the SeiNodeTask until phase is Complete or Failed, or the -// context is cancelled. The returned obj is the most recent observation, -// suitable for jsonpath extraction. failureReason is populated when phase=Failed -// (from .status.task.err) so callers can surface it on exit-1. -func (p DynamicPoller) Poll(ctx context.Context, namespace, name string, interval time.Duration) (string, map[string]any, string, error) { - ticker := time.NewTicker(interval) - defer ticker.Stop() - // Read once before sleeping so a fast-completing task isn't blocked on the - // first interval. - for { - obj, err := p.Client.Resource(SeiNodeTaskGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - if !apierrors.IsNotFound(err) { - return "", nil, "", fmt.Errorf("get SeiNodeTask %s/%s: %w", namespace, name, err) - } - // Not-found is expected briefly after apply on slow caches; keep polling. - } else { - phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") - switch phase { - case PhaseComplete, PhaseFailed: - reason := "" - if phase == PhaseFailed { - reason, _, _ = unstructured.NestedString(obj.Object, "status", "task", "err") - if reason == "" { - reason = "task reached Failed phase (no error message)" - } - } - return phase, obj.Object, reason, nil - } - } - - select { - case <-ctx.Done(): - return "", nil, "", fmt.Errorf("timeout waiting for %s/%s to reach terminal phase: %w", namespace, name, ctx.Err()) - case <-ticker.C: - } - } -} diff --git a/internal/runner/runner.go b/internal/runner/runner.go deleted file mode 100644 index 99327b42..00000000 --- a/internal/runner/runner.go +++ /dev/null @@ -1,257 +0,0 @@ -// Package runner implements the seitask-runner orchestration container that -// Chaos Mesh Workflow Task steps use to apply SeiNodeTask CRs and wait for -// completion. The runner is intentionally generic — the per-kind shape comes -// from text/template files mounted at /templates, not from CLI subcommands. -// -// Behavior contract (LLD: https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/seinode-task/seinode-task-lld.md "Runner container"): -// -// 1. Source /workflow/vars/env.sh if present (env-file bridge between Workflow -// steps). -// 2. Render the template with --var KEY=VALUE substitutions. Single-node -// mode renders once with .NODE set from --var NODE=; fan-out mode -// lists SeiNodes by selector and renders once per match with .NODE set -// to each SeiNode's name. -// 3. Server-side apply each rendered SeiNodeTask (fieldOwner=seitask-runner). -// 4. Poll .status.phase until terminal (Complete | Failed) or --timeout. -// 5. On Complete, run --output-jsonpath extractions and append KEY=value -// lines to --output-env-file. Exit 0. -// 6. On Failed or timeout, exit 1. -// -// The runner talks to the K8s API only (no pods/exec, no kubectl binary). -package runner - -import ( - "context" - "fmt" - "io" - "strings" - "time" -) - -// Options is the parsed CLI invocation. -type Options struct { - // TemplatePath is the path to the Go text/template file producing a - // SeiNodeTask manifest. - TemplatePath string - - // Vars are KEY=VALUE substitutions exposed to the template as - // .KEY accessors. - Vars map[string]string - - // OutputJSONPaths are extraction expressions of the form - // '.status.outputs.govVote.txHash=TX_HASH'. The left side is a JSONPath - // against the SeiNodeTask object; the right side is the env var name - // written to OutputEnvFile. - OutputJSONPaths []string - - // OutputEnvFile is the path the runner appends KEY=value lines to on - // Complete. Conventionally /workflow/vars/env.sh. - OutputEnvFile string - - // Timeout bounds the total poll duration per apply. - Timeout time.Duration - - // PollInterval is the cadence the runner re-reads .status.phase. - PollInterval time.Duration - - // Namespace overrides the in-cluster namespace (defaults to the SA - // namespace mounted at /var/run/secrets/kubernetes.io/serviceaccount/namespace). - Namespace string - - // PerNodeSelector enables fan-out mode. Empty means single-node. - // Value is a Kubernetes label selector (e.g. "role=validator"). - PerNodeSelector string - - // FanoutMode selects the success policy: all-must-succeed (default), - // best-effort, or quorum:N. - FanoutMode string - - // EnvFile is the env-file the runner sources at startup. When empty, - // /workflow/vars/env.sh is used if it exists. - EnvFile string -} - -// FanoutPolicy modes. -const ( - fanoutModeAll = "all" - fanoutModeBestEffort = "best-effort" - fanoutModeQuorum = "quorum" - - // PhaseComplete and PhaseFailed mirror the SeiNodeTask CRD phase values. - // Centralized here so the runner doesn't import the api types package - // (keeps the runner build closure small). - PhaseComplete = "Complete" - PhaseFailed = "Failed" -) - -// FanoutPolicy is the parsed --fanout-mode value. -type FanoutPolicy struct { - // Mode is one of "all", "best-effort", "quorum". - Mode string - // Quorum is the N from "quorum:N"; zero otherwise. - Quorum int -} - -// ParseFanoutMode parses a --fanout-mode value into a policy. The empty -// string maps to "all-must-succeed" (the default). -// -// Fail-fast semantics: under "all-must-succeed", the runner exits non-zero -// as soon as one target fails — remaining targets are left in-flight, and -// the SeiNodeTaskReconciler keeps reconciling them until they reach a -// terminal phase or are garbage-collected. The runner does NOT delete -// CRs on exit (post-mortem `kubectl describe` would lose context). For -// tx-emitting kinds the broadcast has already happened by the time the -// runner polls, so early-exit changes runner wall-clock but not chain state. -func ParseFanoutMode(s string) (FanoutPolicy, error) { - switch s { - case "", "all-must-succeed": - return FanoutPolicy{Mode: fanoutModeAll}, nil - case fanoutModeBestEffort: - return FanoutPolicy{Mode: fanoutModeBestEffort}, nil - } - if rest, ok := strings.CutPrefix(s, "quorum:"); ok { - n, err := parsePositiveInt(rest) - if err != nil { - return FanoutPolicy{}, fmt.Errorf("invalid quorum value: %w", err) - } - return FanoutPolicy{Mode: fanoutModeQuorum, Quorum: n}, nil - } - return FanoutPolicy{}, fmt.Errorf("unknown fanout-mode %q (want all-must-succeed | best-effort | quorum:N)", s) -} - -// Decide evaluates a set of per-target outcomes against the policy. -// -// - all: every outcome must be true; any false fails. Returns ok=true only -// once every entry is true; ok=false the moment any entry is false. -// - best-effort: ok=true if at least one outcome is true; ok=false only -// when every entry has terminated and zero were true. -// - quorum:N: ok=true once Quorum entries are true; ok=false when so many -// have failed that Quorum can no longer be reached. -// -// done indicates whether the policy can conclude given the outcomes so far. -// When done=false, the caller should keep polling. When done=true, ok is the -// final verdict. -func (p FanoutPolicy) Decide(outcomes []Outcome, total int) (done, ok bool) { - var completed, failed, pending int - for _, o := range outcomes { - switch o { - case OutcomeComplete: - completed++ - case OutcomeFailed: - failed++ - default: - pending++ - } - } - pending += total - len(outcomes) - - switch p.Mode { - case fanoutModeAll: - if failed > 0 { - return true, false - } - if completed == total { - return true, true - } - return false, false - case fanoutModeBestEffort: - if completed > 0 && pending == 0 { - return true, true - } - if pending == 0 { - return true, completed > 0 - } - return false, false - case fanoutModeQuorum: - if completed >= p.Quorum { - return true, true - } - // If remaining successes can no longer reach quorum, fail fast. - if completed+pending < p.Quorum { - return true, false - } - return false, false - } - return true, false -} - -// Outcome is the terminal status of one applied SeiNodeTask. -type Outcome int - -const ( - // OutcomeUnknown means the task has not reached a terminal phase. - OutcomeUnknown Outcome = iota - // OutcomeComplete means .status.phase=Complete. - OutcomeComplete - // OutcomeFailed means .status.phase=Failed. - OutcomeFailed -) - -// Run is the top-level entrypoint. It is split out of main so it can be -// unit tested with stubbed K8s and filesystem dependencies. -type Run struct { - Opts Options - Stdout io.Writer - Stderr io.Writer - Now func() time.Time - Renderer Renderer - Applier Applier - Poller Poller - Lister NodeLister - Sourcer EnvSourcer - Writer EnvWriter -} - -// Renderer renders a template file with vars to a SeiNodeTask manifest. -type Renderer interface { - Render(templatePath string, vars map[string]string) (rendered []byte, name string, err error) -} - -// Applier applies a rendered SeiNodeTask manifest to the cluster. -type Applier interface { - Apply(ctx context.Context, namespace string, manifest []byte) error -} - -// Poller polls a SeiNodeTask's status until terminal or context is done. -// Returns the final phase plus the raw object (for jsonpath extraction). -type Poller interface { - Poll(ctx context.Context, namespace, name string, interval time.Duration) (phase string, obj map[string]any, failureReason string, err error) -} - -// NodeLister lists SeiNode names in a namespace matching a label selector. -type NodeLister interface { - List(ctx context.Context, namespace, selector string) (names []string, err error) -} - -// EnvSourcer reads a shell-style env file into a map. Lines that aren't -// KEY=VALUE are skipped silently (commenting/empty/etc). -type EnvSourcer interface { - Source(path string) (map[string]string, error) -} - -// EnvWriter appends KEY=value lines to an env file. -type EnvWriter interface { - Append(path string, kv []KV) error -} - -// KV is a single env-file pair. -type KV struct { - Key, Value string -} - -func parsePositiveInt(s string) (int, error) { - if s == "" { - return 0, fmt.Errorf("empty value") - } - n := 0 - for _, c := range s { - if c < '0' || c > '9' { - return 0, fmt.Errorf("not an integer: %q", s) - } - n = n*10 + int(c-'0') - } - if n == 0 { - return 0, fmt.Errorf("must be > 0") - } - return n, nil -} diff --git a/internal/runner/runner_test.go b/internal/runner/runner_test.go deleted file mode 100644 index 96b023d3..00000000 --- a/internal/runner/runner_test.go +++ /dev/null @@ -1,564 +0,0 @@ -package runner_test - -import ( - "context" - "errors" - "fmt" - "maps" - "os" - "path/filepath" - "strings" - "sync" - "testing" - "time" - - . "github.com/onsi/gomega" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/yaml" - - "github.com/sei-protocol/sei-k8s-controller/internal/runner" -) - -// Test fixtures kept as constants to satisfy goconst across the file. These -// are pure test inputs — refactoring callers to use them improves nothing -// semantically; the constants exist solely to keep the linter quiet on -// strings that recur >=3 times. -const ( - tChainID = "sei-localnet" - tChainIDKey = "CHAIN_ID" - tStatusKey = "status" - tPhaseKey = "phase" - tNodeKey = "NODE" - tComplete = "Complete" - tTxHashEnv = "TX_HASH" - tTxHashVal = "ABCD" - tStubName = "stub-name" - tImageV2 = "seid:v2" - tIgnored = "ignored" - tPropIDKey = "PROPOSAL_ID" - tValidator0 = "validator-0" -) - -// --------------------------------------------------------------------------- -// Apply / render -// --------------------------------------------------------------------------- - -func TestRenderBytes_DeterministicName(t *testing.T) { - g := NewWithT(t) - tmpl := []byte(`apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: GovVote - target: - nodeRef: - name: {{ .NODE }} - govVote: - chainId: {{ .CHAIN_ID }} - keyName: k - proposalId: {{ .PROPOSAL_ID }} - option: yes - fees: 2000usei - gas: 200000 -`) - vars := map[string]string{tNodeKey: tValidator0, tChainIDKey: tChainID, tPropIDKey: "47"} - - manifest1, name1, err := runner.RenderBytes("t.tmpl", tmpl, vars, nil) - g.Expect(err).NotTo(HaveOccurred()) - manifest2, name2, err := runner.RenderBytes("t.tmpl", tmpl, vars, nil) - g.Expect(err).NotTo(HaveOccurred()) - - g.Expect(name1).To(Equal(name2), "name must be deterministic for identical inputs") - g.Expect(string(manifest1)).To(Equal(string(manifest2))) - g.Expect(name1).To(HavePrefix("gov-vote-validator-0-"), "name should embed kind + NODE for operator ergonomics") - g.Expect(name1).To(MatchRegexp(`^gov-vote-validator-0-[0-9a-f]{10}$`)) - - // Re-render with a different var should change the hash. - vars[tPropIDKey] = "48" - _, name3, err := runner.RenderBytes("t.tmpl", tmpl, vars, nil) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(name3).NotTo(Equal(name1)) -} - -func TestRenderBytes_RejectsNonSeiNodeTask(t *testing.T) { - g := NewWithT(t) - tmpl := []byte("apiVersion: v1\nkind: ConfigMap\nmetadata: {name: PLACEHOLDER}\n") - _, _, err := runner.RenderBytes("t.tmpl", tmpl, nil, nil) - g.Expect(err).To(MatchError(ContainSubstring("rendered manifest is ConfigMap"))) -} - -func TestRenderBytes_StampsOwnerRef(t *testing.T) { - g := NewWithT(t) - tmpl := []byte(`apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER - ownerReferences: - - apiVersion: rogue.example.com/v1 - kind: Impostor - name: smuggled - uid: 00000000-0000-0000-0000-000000000000 -spec: - kind: GovVote - target: - nodeRef: - name: {{ .NODE }} - govVote: - chainId: c - keyName: k - proposalId: "1" - option: yes - fees: 0usei - gas: 0 -`) - ctrlF := false - blockF := false - ownerRef := &metav1.OwnerReference{ - APIVersion: "chaos-mesh.org/v1alpha1", - Kind: "Workflow", - Name: "release-test-20260521", - UID: types.UID("abcd-uid"), - Controller: &ctrlF, - BlockOwnerDeletion: &blockF, - } - - manifest, _, err := runner.RenderBytes("t.tmpl", tmpl, map[string]string{tNodeKey: tValidator0}, ownerRef) - g.Expect(err).NotTo(HaveOccurred()) - - obj := &unstructured.Unstructured{} - g.Expect(yaml.Unmarshal(manifest, &obj.Object)).To(Succeed()) - refs := obj.GetOwnerReferences() - g.Expect(refs).To(HaveLen(1), "render must REPLACE ownerReferences so a template-smuggled ref can't leak through") - g.Expect(refs[0].Kind).To(Equal("Workflow")) - g.Expect(refs[0].Name).To(Equal("release-test-20260521")) - g.Expect(refs[0].UID).To(Equal(types.UID("abcd-uid"))) - - // Nil ownerRef leaves template-declared refs alone (no-stamp path). - manifestNil, _, err := runner.RenderBytes("t.tmpl", tmpl, map[string]string{tNodeKey: tValidator0}, nil) - g.Expect(err).NotTo(HaveOccurred()) - objNil := &unstructured.Unstructured{} - g.Expect(yaml.Unmarshal(manifestNil, &objNil.Object)).To(Succeed()) - g.Expect(objNil.GetOwnerReferences()).To(HaveLen(1)) - g.Expect(objNil.GetOwnerReferences()[0].Kind).To(Equal("Impostor")) -} - -func TestRenderBytes_MissingKeyIsError(t *testing.T) { - g := NewWithT(t) - tmpl := []byte("apiVersion: sei.io/v1alpha1\nkind: SeiNodeTask\nmetadata: {name: PLACEHOLDER}\nspec:\n kind: GovVote\n target: {nodeRef: {name: {{ .NODE }}}}\n") - _, _, err := runner.RenderBytes("t.tmpl", tmpl, map[string]string{}, nil) - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("execute template")) -} - -// --------------------------------------------------------------------------- -// Fanout policy -// --------------------------------------------------------------------------- - -func TestParseFanoutMode(t *testing.T) { - g := NewWithT(t) - - p, err := runner.ParseFanoutMode("") - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(p.Mode).To(Equal("all")) - - p, err = runner.ParseFanoutMode("best-effort") - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(p.Mode).To(Equal("best-effort")) - - p, err = runner.ParseFanoutMode("quorum:3") - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(p.Mode).To(Equal("quorum")) - g.Expect(p.Quorum).To(Equal(3)) - - _, err = runner.ParseFanoutMode("quorum:0") - g.Expect(err).To(HaveOccurred()) - - _, err = runner.ParseFanoutMode("nope") - g.Expect(err).To(HaveOccurred()) -} - -func TestFanoutPolicy_Decide(t *testing.T) { - g := NewWithT(t) - - all := runner.FanoutPolicy{Mode: "all"} - // All complete -> ok. - done, ok := all.Decide([]runner.Outcome{runner.OutcomeComplete, runner.OutcomeComplete}, 2) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeTrue()) - // Any fail -> fail fast. - done, ok = all.Decide([]runner.Outcome{runner.OutcomeFailed}, 3) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeFalse()) - // Partial complete, none failed -> keep going. - done, _ = all.Decide([]runner.Outcome{runner.OutcomeComplete}, 3) - g.Expect(done).To(BeFalse()) - - best := runner.FanoutPolicy{Mode: "best-effort"} - // One complete, others still pending -> wait. - done, _ = best.Decide([]runner.Outcome{runner.OutcomeComplete}, 3) - g.Expect(done).To(BeFalse()) - // All terminated, >=1 complete -> ok. - done, ok = best.Decide([]runner.Outcome{runner.OutcomeComplete, runner.OutcomeFailed, runner.OutcomeFailed}, 3) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeTrue()) - // All failed -> not ok. - done, ok = best.Decide([]runner.Outcome{runner.OutcomeFailed, runner.OutcomeFailed}, 2) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeFalse()) - - quorum := runner.FanoutPolicy{Mode: "quorum", Quorum: 2} - // Hit quorum -> ok early. - done, ok = quorum.Decide([]runner.Outcome{runner.OutcomeComplete, runner.OutcomeComplete}, 4) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeTrue()) - // Too many failed to reach quorum -> fail early. - done, ok = quorum.Decide([]runner.Outcome{runner.OutcomeFailed, runner.OutcomeFailed, runner.OutcomeFailed}, 4) - g.Expect(done).To(BeTrue()) - g.Expect(ok).To(BeFalse()) -} - -// --------------------------------------------------------------------------- -// Output extraction -// --------------------------------------------------------------------------- - -func TestExtractOutputs(t *testing.T) { - g := NewWithT(t) - obj := map[string]any{ - tStatusKey: map[string]any{ - tPhaseKey: tComplete, - "outputs": map[string]any{ - "govVote": map[string]any{ - "txHash": tTxHashVal, - "height": int64(1234), - }, - }, - }, - } - - kvs, err := runner.ExtractOutputs( - []string{".status.outputs.govVote.txHash=TX_HASH", ".status.outputs.govVote.height=HEIGHT"}, - obj, - ) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(kvs).To(HaveLen(2)) - g.Expect(kvs[0]).To(Equal(runner.KV{Key: tTxHashEnv, Value: tTxHashVal})) - g.Expect(kvs[1].Key).To(Equal("HEIGHT")) - g.Expect(kvs[1].Value).To(Equal("1234")) -} - -func TestExtractOutputs_MissingFieldOmitted(t *testing.T) { - g := NewWithT(t) - // Sidecar-backed kinds (govVote/govSoftwareUpgrade in MVP) have empty - // status.outputs. The extractor must drop missing fields, not error. - obj := map[string]any{tStatusKey: map[string]any{tPhaseKey: tComplete}} - kvs, err := runner.ExtractOutputs([]string{".status.outputs.govVote.txHash=TX_HASH"}, obj) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(kvs).To(BeEmpty()) -} - -func TestExtractOutputs_MalformedSpec(t *testing.T) { - g := NewWithT(t) - _, err := runner.ExtractOutputs([]string{"no-equals-sign"}, map[string]any{}) - g.Expect(err).To(MatchError(ContainSubstring("missing '=ENV_VAR'"))) -} - -// --------------------------------------------------------------------------- -// Env sourcer / writer -// --------------------------------------------------------------------------- - -func TestFileEnvSourcer_RoundTrip(t *testing.T) { - g := NewWithT(t) - dir := t.TempDir() - path := filepath.Join(dir, "env.sh") - - w := runner.FileEnvWriter{} - g.Expect(w.Append(path, []runner.KV{{Key: tTxHashEnv, Value: tTxHashVal}, {Key: tPropIDKey, Value: "47"}})).To(Succeed()) - g.Expect(w.Append(path, []runner.KV{{Key: "HEIGHT", Value: "1234"}})).To(Succeed()) - - got, err := runner.FileEnvSourcer{}.Source(path) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(got).To(Equal(map[string]string{ - tTxHashEnv: tTxHashVal, - tPropIDKey: "47", - "HEIGHT": "1234", - })) -} - -func TestFileEnvSourcer_MissingFileIsNotAnError(t *testing.T) { - g := NewWithT(t) - got, err := runner.FileEnvSourcer{}.Source(filepath.Join(t.TempDir(), "absent.sh")) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(got).To(BeNil()) -} - -// --------------------------------------------------------------------------- -// Stub-driven Execute() integration -// --------------------------------------------------------------------------- - -type stubRenderer struct { - calls []map[string]string - name string -} - -func (s *stubRenderer) Render(_ string, vars map[string]string) ([]byte, string, error) { - clone := make(map[string]string, len(vars)) - maps.Copy(clone, vars) - s.calls = append(s.calls, clone) - manifest := fmt.Appendf(nil, "apiVersion: sei.io/v1alpha1\nkind: SeiNodeTask\nmetadata:\n name: %s\nspec:\n kind: GovVote\n", s.name) - return manifest, s.name, nil -} - -type stubApplier struct { - mu sync.Mutex - applied []string -} - -func (s *stubApplier) Apply(_ context.Context, _ string, manifest []byte) error { - s.mu.Lock() - defer s.mu.Unlock() - obj := map[string]any{} - _ = yaml.Unmarshal(manifest, &obj) - if md, ok := obj["metadata"].(map[string]any); ok { - if name, ok := md["name"].(string); ok { - s.applied = append(s.applied, name) - } - } - return nil -} - -type stubPoller struct { - phase string - obj map[string]any - reason string - err error -} - -func (s stubPoller) Poll(_ context.Context, _, _ string, _ time.Duration) (string, map[string]any, string, error) { - return s.phase, s.obj, s.reason, s.err -} - -type stubLister struct{ names []string } - -func (s stubLister) List(_ context.Context, _, _ string) ([]string, error) { return s.names, nil } - -type stubSourcer struct{ env map[string]string } - -func (s stubSourcer) Source(_ string) (map[string]string, error) { return s.env, nil } - -type stubWriter struct { - mu sync.Mutex - written []runner.KV -} - -func (w *stubWriter) Append(_ string, kv []runner.KV) error { - w.mu.Lock() - defer w.mu.Unlock() - w.written = append(w.written, kv...) - return nil -} - -func newRun(opts runner.Options, poller runner.Poller, applier runner.Applier, lister runner.NodeLister) (*runner.Run, *stubWriter) { - w := &stubWriter{} - return &runner.Run{ - Opts: opts, - Stdout: os.Stderr, - Stderr: os.Stderr, - Renderer: &stubRenderer{name: tStubName}, - Applier: applier, - Poller: poller, - Lister: lister, - Sourcer: stubSourcer{}, - Writer: w, - }, w -} - -func TestExecute_SingleNode_CompleteExtractsOutputs(t *testing.T) { - g := NewWithT(t) - app := &stubApplier{} - poller := stubPoller{ - phase: tComplete, - obj: map[string]any{ - tStatusKey: map[string]any{ - tPhaseKey: tComplete, - "outputs": map[string]any{"updateNodeImage": map[string]any{"appliedImage": tImageV2}}, - }, - }, - } - r, w := newRun(runner.Options{ - TemplatePath: tIgnored, - Vars: map[string]string{tNodeKey: tValidator0}, - OutputJSONPaths: []string{".status.outputs.updateNodeImage.appliedImage=APPLIED_IMAGE"}, - OutputEnvFile: filepath.Join(t.TempDir(), "env.sh"), - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, app, nil) - g.Expect(r.Execute(context.Background())).To(Succeed()) - g.Expect(app.applied).To(Equal([]string{tStubName})) - g.Expect(w.written).To(Equal([]runner.KV{{Key: "APPLIED_IMAGE", Value: tImageV2}})) -} - -func TestExecute_SingleNode_FailedReturnsReason(t *testing.T) { - g := NewWithT(t) - poller := stubPoller{phase: "Failed", reason: "deposit too small"} - r, _ := newRun(runner.Options{ - TemplatePath: tIgnored, - Vars: map[string]string{tNodeKey: tValidator0}, - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, &stubApplier{}, nil) - err := r.Execute(context.Background()) - g.Expect(err).To(MatchError(ContainSubstring("deposit too small"))) -} - -func TestExecute_Fanout_AllMustSucceed(t *testing.T) { - g := NewWithT(t) - app := &stubApplier{} - poller := stubPoller{phase: tComplete, obj: map[string]any{tStatusKey: map[string]any{tPhaseKey: tComplete}}} - r, _ := newRun(runner.Options{ - TemplatePath: tIgnored, - Vars: map[string]string{"IMAGE": tImageV2}, - PerNodeSelector: "role=validator", - FanoutMode: "all-must-succeed", - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, app, stubLister{names: []string{"v0", "v1", "v2"}}) - g.Expect(r.Execute(context.Background())).To(Succeed()) - g.Expect(app.applied).To(HaveLen(3)) -} - -func TestExecute_Fanout_BestEffortAllowsFailures(t *testing.T) { - g := NewWithT(t) - // Per-target poller that returns different verdicts based on the task name - // to simulate a partial-fail fan-out. - poller := poller2{ - results: map[string]stubPoller{ - tStubName: {phase: tComplete, obj: map[string]any{tStatusKey: map[string]any{tPhaseKey: tComplete}}}, - }, - def: stubPoller{phase: "Failed", reason: "boom"}, - } - app := &stubApplier{} - r, _ := newRun(runner.Options{ - TemplatePath: tIgnored, - PerNodeSelector: "role=validator", - FanoutMode: "best-effort", - Timeout: time.Second, - PollInterval: 10 * time.Millisecond, - Namespace: "ns", - }, poller, app, stubLister{names: []string{"v0", "v1"}}) - // stubRenderer returns the same name for both renders, so the poller's - // stub-name map applies to both. We just need at least one Complete to succeed. - g.Expect(r.Execute(context.Background())).To(Succeed()) -} - -// poller2 is a per-name dispatcher used by best-effort fanout tests. -type poller2 struct { - results map[string]stubPoller - def stubPoller -} - -func (p poller2) Poll(_ context.Context, _, name string, _ time.Duration) (string, map[string]any, string, error) { - if s, ok := p.results[name]; ok { - return s.phase, s.obj, s.reason, s.err - } - return p.def.phase, p.def.obj, p.def.reason, p.def.err -} - -// --------------------------------------------------------------------------- -// Embedded templates: each renders against representative vars. -// --------------------------------------------------------------------------- - -func TestEmbeddedTemplates_Render(t *testing.T) { - g := NewWithT(t) - repoRoot := findRepoRoot(t) - dir := filepath.Join(repoRoot, "runner", "templates") - - cases := []struct { - file string - vars map[string]string - }{ - { - file: "gov-software-upgrade.yaml.tmpl", - vars: map[string]string{ - tNodeKey: tValidator0, tChainIDKey: tChainID, "KEY_NAME": "admin", - "TITLE": "Upgrade to v2.0.0", "DESCRIPTION": "rollout v2", - "UPGRADE_NAME": "v2.0.0", "UPGRADE_HEIGHT": "1500", - "INITIAL_DEPOSIT": "10000000usei", "FEES": "2000usei", "GAS": "500000", - }, - }, - { - file: "gov-vote.yaml.tmpl", - vars: map[string]string{ - tNodeKey: tValidator0, tChainIDKey: tChainID, "KEY_NAME": "admin", - tPropIDKey: "47", "OPTION": "yes", "FEES": "2000usei", "GAS": "200000", - }, - }, - { - file: "await-condition.yaml.tmpl", - vars: map[string]string{tNodeKey: tValidator0, "TARGET_HEIGHT": "1500"}, - }, - { - file: "update-node-image.yaml.tmpl", - vars: map[string]string{tNodeKey: tValidator0, "IMAGE": "ghcr.io/sei/seid:v2.0.0"}, - }, - { - file: "await-nodes-at-height.yaml.tmpl", - vars: map[string]string{tNodeKey: tValidator0, "TARGET_HEIGHT": "1500"}, - }, - } - - for _, c := range cases { - t.Run(c.file, func(t *testing.T) { - g := NewWithT(t) - raw, err := os.ReadFile(filepath.Join(dir, c.file)) - g.Expect(err).NotTo(HaveOccurred(), "read template") - manifest, name, err := runner.RenderBytes(c.file, raw, c.vars, nil) - g.Expect(err).NotTo(HaveOccurred(), "render template") - g.Expect(name).NotTo(BeEmpty()) - - obj := map[string]any{} - g.Expect(yaml.Unmarshal(manifest, &obj)).To(Succeed()) - g.Expect(obj["apiVersion"]).To(Equal("sei.io/v1alpha1")) - g.Expect(obj["kind"]).To(Equal("SeiNodeTask")) - spec, ok := obj["spec"].(map[string]any) - g.Expect(ok).To(BeTrue()) - g.Expect(spec).To(HaveKey("kind")) - g.Expect(spec).To(HaveKey("target")) - }) - } - _ = g -} - -func findRepoRoot(t *testing.T) string { - t.Helper() - wd, err := os.Getwd() - if err != nil { - t.Fatal(err) - } - for dir := wd; dir != "/" && dir != "."; dir = filepath.Dir(dir) { - if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { - return dir - } - } - t.Fatal("repo root not found") - return "" -} - -// --------------------------------------------------------------------------- -// Sanity -// --------------------------------------------------------------------------- - -func TestNoStrayErrors(t *testing.T) { - // Tripwire so refactors that accidentally remove the deterministic-name - // contract surface as a one-line test failure. - g := NewWithT(t) - name := runner.DeterministicName("GovVote", map[string]string{tNodeKey: "v0", "X": "1"}, []byte("body")) - g.Expect(name).To(HavePrefix("gov-vote-v0-")) - g.Expect(strings.Count(name, "-")).To(BeNumerically(">=", 3)) - g.Expect(errors.New("noop")).To(HaveOccurred()) -} diff --git a/internal/seitask/keygen/keygen.go b/internal/seitask/keygen/keygen.go deleted file mode 100644 index 23ddeb91..00000000 --- a/internal/seitask/keygen/keygen.go +++ /dev/null @@ -1,115 +0,0 @@ -// Package keygen implements `seitask keygen`: derive a Sei account via the -// general internal/keygen primitive, write the mnemonic to a per-run Secret named -// "-", and publish ADMIN_ADDRESS / ADMIN_SECRET_NAME -// to workflow-vars. All created resources carry an ownerRef to the parent -// Workflow CR for cascade GC. The key derivation itself lives in -// internal/keygen (k8s-free, reused by the test harness); this package is the -// seitask-runner's Secret/workflow-vars writer on top of it. -package keygen - -import ( - "context" - "fmt" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - - keyderive "github.com/sei-protocol/sei-k8s-controller/internal/keygen" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const fieldOwner client.FieldOwner = "seitask-keygen" - -// Params carries the typed inputs to Run. -type Params struct { - // KeyName is the logical identity (e.g. "admin"). Secret name is - // "-" to disambiguate concurrent runs. - KeyName string - Workflow taskruntime.WorkflowIdentity -} - -type Result struct { - SecretName string - Address string -} - -// Run generates the keypair, writes the Secret, and stamps workflow-vars. -// Idempotent: re-running on an existing Secret reuses the key. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if p.KeyName == "" { - return Result{}, fmt.Errorf("keygen: empty KeyName") - } - if p.Workflow.Name == "" || p.Workflow.Namespace == "" { - return Result{}, fmt.Errorf("keygen: workflow identity not loaded (downward-API env not projected)") - } - - secretName := p.KeyName + "-" + p.Workflow.Name - - // Check for an existing Secret first — re-running keygen on an already- - // initialized run should be a no-op so manual retries don't rotate the - // key out from under downstream steps. - existing := &corev1.Secret{} - err := c.Get(ctx, client.ObjectKey{Namespace: p.Workflow.Namespace, Name: secretName}, existing) - switch { - case err == nil: - // Re-stamp the workflow-vars CM in case it was cleared, then return. - addr, exists := existing.Data["address"] - if !exists { - return Result{}, taskruntime.Infra(fmt.Errorf("existing Secret %q is missing address data", secretName)) - } - if err := writeWorkflowVars(ctx, c, p.Workflow, string(addr), secretName); err != nil { - return Result{}, err - } - return Result{SecretName: secretName, Address: string(addr)}, nil - case !apierrors.IsNotFound(err): - return Result{}, taskruntime.Infra(fmt.Errorf("reading existing Secret %q: %w", secretName, err)) - } - - id, err := keyderive.Derive() - if err != nil { - return Result{}, taskruntime.Infra(fmt.Errorf("deriving identity: %w", err)) - } - - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: secretName, - Namespace: p.Workflow.Namespace, - OwnerReferences: []metav1.OwnerReference{p.Workflow.OwnerRef()}, - }, - Type: corev1.SecretTypeOpaque, - Data: map[string][]byte{ - keyderive.SecretMnemonicKey: []byte(id.Mnemonic), - // address is duplicated into the Secret so a re-run of keygen - // can reuse the existing identity without re-deriving from the - // mnemonic (the Secret is the source of truth for both). - "address": []byte(id.Address), - }, - } - if err := c.Create(ctx, secret, fieldOwner); err != nil { - // Race: another keygen Pod won. Re-read and fall through to - // idempotent path. - if apierrors.IsAlreadyExists(err) { - return Run(ctx, c, p) - } - return Result{}, taskruntime.Infra(fmt.Errorf("creating Secret %q: %w", secretName, err)) - } - - if err := writeWorkflowVars(ctx, c, p.Workflow, id.Address, secretName); err != nil { - return Result{}, err - } - return Result{SecretName: secretName, Address: id.Address}, nil -} - -func writeWorkflowVars(ctx context.Context, c client.Client, w taskruntime.WorkflowIdentity, address, secretName string) error { - if err := taskruntime.EnsureWorkflowVarsCM(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyRunID: w.Name, - }); err != nil { - return err - } - return taskruntime.SetVars(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyAdminAddress: address, - taskruntime.KeyAdminSecretName: secretName, - }) -} diff --git a/internal/seitask/keygen/keygen_test.go b/internal/seitask/keygen/keygen_test.go deleted file mode 100644 index 4dbe69bc..00000000 --- a/internal/seitask/keygen/keygen_test.go +++ /dev/null @@ -1,171 +0,0 @@ -package keygen - -import ( - "context" - "strings" - "testing" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - keyderive "github.com/sei-protocol/sei-k8s-controller/internal/keygen" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - if err := corev1.AddToScheme(s); err != nil { - t.Fatal(err) - } - return s -} - -const ( - testKeyName = "admin" - testNamespace = "nightly" - testWorkflowName = "wf-test" - testSecretName = "admin-wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" -) - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -// Sanity check: keygen produces a Secret with the right shape + a -// workflow-vars ConfigMap with the right keys + the address is a valid -// "sei1..." bech32. -func TestRun_CreatesSecretAndWorkflowVars(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testWorkflow() - - res, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.SecretName != testSecretName { - t.Fatalf("SecretName: got %q, want admin-wf-test", res.SecretName) - } - if !strings.HasPrefix(res.Address, "sei1") { - t.Fatalf("Address %q does not have sei1 prefix", res.Address) - } - - // Secret must carry the mnemonic + address. - secret := &corev1.Secret{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testSecretName}, secret); err != nil { - t.Fatalf("Get Secret: %v", err) - } - mnemonic, ok := secret.Data[keyderive.SecretMnemonicKey] - if !ok || len(mnemonic) == 0 { - t.Fatalf("mnemonic missing from Secret") - } - // 24 words separated by spaces. - if got := len(strings.Fields(string(mnemonic))); got != 24 { - t.Fatalf("mnemonic word count: got %d, want 24", got) - } - if got := string(secret.Data["address"]); got != res.Address { - t.Fatalf("Secret address %q != Result.Address %q", got, res.Address) - } - - // workflow-vars ConfigMap must carry ADMIN_ADDRESS + ADMIN_SECRET_NAME. - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("Get CM: %v", err) - } - if got := cm.Data[string(taskruntime.KeyAdminAddress)]; got != res.Address { - t.Fatalf("CM ADMIN_ADDRESS = %q, want %q", got, res.Address) - } - if got := cm.Data[string(taskruntime.KeyAdminSecretName)]; got != testSecretName { - t.Fatalf("CM ADMIN_SECRET_NAME = %q, want admin-wf-test", got) - } - if got := cm.Data[string(taskruntime.KeyRunID)]; got != testWorkflowName { - t.Fatalf("CM RUN_ID = %q, want wf-test", got) - } -} - -// Idempotency: re-running keygen with an existing Secret reuses the key -// rather than rotating it. Manual scenario retries shouldn't blow away the -// identity downstream Tasks already consumed. -func TestRun_Idempotent(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testWorkflow() - - first, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}) - if err != nil { - t.Fatalf("first Run: %v", err) - } - - // Drop the workflow-vars CM to simulate it being cleared somewhere. - cm := &corev1.ConfigMap{} - _ = c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm) - if err := c.Delete(context.Background(), cm); err != nil { - t.Fatalf("delete CM: %v", err) - } - - second, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}) - if err != nil { - t.Fatalf("second Run: %v", err) - } - if second.Address != first.Address { - t.Fatalf("identity rotated on second Run: %q -> %q", first.Address, second.Address) - } - // CM should be re-created. - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, &corev1.ConfigMap{}); err != nil { - t.Fatalf("CM not re-created on idempotent run: %v", err) - } -} - -// Reject missing inputs. -func TestRun_RejectsBadInputs(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - for _, tc := range []struct { - name string - p Params - }{ - {"empty key name", Params{KeyName: "", Workflow: testWorkflow()}}, - {"missing workflow name", Params{KeyName: testKeyName, Workflow: taskruntime.WorkflowIdentity{Namespace: "ns"}}}, - {"missing workflow namespace", Params{KeyName: testKeyName, Workflow: taskruntime.WorkflowIdentity{Name: "wf"}}}, - } { - t.Run(tc.name, func(t *testing.T) { - _, err := Run(context.Background(), c, tc.p) - if err == nil { - t.Fatalf("expected error") - } - }) - } -} - -// Secret + ConfigMap carry ownerReferences to the parent Workflow so -// Workflow deletion cascades. Bench-chain scenarios depend on this for -// cleanup (no trap-on-EXIT logic). -func TestRun_StampsOwnerReferences(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testWorkflow() - - if _, err := Run(context.Background(), c, Params{KeyName: testKeyName, Workflow: w}); err != nil { - t.Fatalf("Run: %v", err) - } - - for _, target := range []client.Object{ - &corev1.Secret{}, - &corev1.ConfigMap{}, - } { - var name string - if _, ok := target.(*corev1.Secret); ok { - name = testSecretName - } else { - name = testWorkflowVarsCM - } - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: name}, target); err != nil { - t.Fatalf("Get %s: %v", name, err) - } - refs := target.GetOwnerReferences() - if len(refs) != 1 || refs[0].Kind != "Workflow" || refs[0].Name != testWorkflowName || string(refs[0].UID) != "uid-test" { - t.Fatalf("%s ownerRefs = %+v", name, refs) - } - } -} diff --git a/internal/seitask/provisionnode/provision.go b/internal/seitask/provisionnode/provision.go deleted file mode 100644 index a7d8cfe1..00000000 --- a/internal/seitask/provisionnode/provision.go +++ /dev/null @@ -1,415 +0,0 @@ -// Package provisionnode implements `seitask provision-node`: fan out N -// standalone SeiNode follower CRs from one Go template, stamp an ownerRef -// to the parent Workflow, Create them, await PhaseRunning, run a two-stage -// per-node readiness probe (Tendermint /status height>0, then EVM -// eth_blockNumber 200), then publish role-scoped endpoints to workflow-vars -// (_EVM_RPC_LIST, _EVM_RPC, _TM_RPC, _REST, CHAIN_ID). -// -// Unlike provision-snd (genesis SeiNetwork, waits Ready, reads the fleet -// aggregate), provision-node provisions followers that join an existing chain. -// It assembles every workflow-vars key from the N per-node .status.endpoint -// scalars because a standalone SeiNode has no fleet ClusterIP to aggregate. -// -// The N CRs are named -0..-(N-1); the controller stamps -// sei.io/node= on each pod, preserving the chaos suite's pod -// selectors. provision-node also stamps sei.io/role=node (always) and -// sei.io/seinetwork= (when --network is set) on each CR's -// metadata.labels — the shared object-label producer contract with -// `seictl node apply`, which the follower-discovery query -// (node list -l sei.io/seinetwork=,sei.io/role=node) matches on. -package provisionnode - -import ( - "bytes" - "context" - "fmt" - "maps" - "net/http" - "os" - "reflect" - "strconv" - "strings" - "text/template" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/yaml" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" - "github.com/sei-protocol/sei-k8s-controller/sdk/sei" -) - -const fieldOwner client.FieldOwner = "seitask-provision-node" - -// Object-label producer contract (§2.2a) — MUST stay byte-identical to -// `seictl node apply`. The keys/values mirror the controller's canonical -// constants (noderesource: sei.io/role / "node"; seinetwork: sei.io/seinetwork), -// which are unexported, so we re-declare them. The contract test pins these -// literals so an accidental edit here fails; the controller side is independently -// pinned by noderesource_test.go. -const ( - labelRole = "sei.io/role" - roleValueNode = "node" - labelSeiNetwork = "sei.io/seinetwork" -) - -// Params carries the typed inputs to Run. -type Params struct { - // Role tags the workflow-vars keys this Task writes (e.g. "rpc"). - // Uppercased to compose RPC_EVM_RPC_LIST etc. Required. - Role string - - // Name is the BASE name; the N followers are -0..-(N-1). - // Defaults to "-" (or "-" when no - // CHAIN_ID var) so chaos sei.io/node selectors stay valid. - Name string - - // TemplatePath is the on-disk path to the Go text/template producing - // ONE kind: SeiNode YAML. Rendered once per replica with .ORDINAL and - // .NODE_NAME injected. Required. - TemplatePath string - - // Vars are the template's substitution context (the .KEY map). Missing - // keys referenced by the template fail rendering. The runtime injects - // .ORDINAL and .NODE_NAME per replica; a --var collision on either is - // rejected (mirrors the runner's --var NODE= guard). - Vars map[string]string - - // Replicas is N: the number of follower SeiNode CRs to fan out. >=1. - Replicas int - - // Network is the genesis SeiNetwork to follow. When set, the runtime - // (a) synthesizes a LabelPeerSource selecting sei.io/seinetwork= - // and (b) stamps the sei.io/seinetwork= object label. - Network string - - // NetworkNamespace is the namespace of the genesis SeiNetwork for the - // synthesized peer selector. Defaults to the Workflow namespace. - NetworkNamespace string - - // RunningTimeout bounds the wait for all N SeiNodes to reach PhaseRunning. - RunningTimeout time.Duration - - // FirstBlockTimeout bounds the post-Running readiness probe (the TM caught-up - // stage and the EVM eth_blockNumber stage), per node. - FirstBlockTimeout time.Duration - - // PollInterval is the interval between SeiNode status reads (waitForRunning). - // The readiness RPC probes are paced by the SDK's own cadence. - PollInterval time.Duration - - // HTTPClient overrides the RPC client; nil means http.DefaultClient. - HTTPClient *http.Client - - // Workflow is the parent Chaos Mesh Workflow identity (downward-API). - Workflow taskruntime.WorkflowIdentity -} - -// Result is the post-Run summary, returned so main can log it before exit. -type Result struct { - // Names are the N created SeiNode names, ordinal-ordered. - Names []string - // ChainID is the resolved chain ID published as CHAIN_ID. - ChainID string - // EVMRPCList is the assembled _EVM_RPC_LIST CSV. - EVMRPCList string -} - -// Run renders the template N times, creates N SeiNode followers with an -// ownerRef to the parent Workflow, waits for all to reach PhaseRunning, runs -// the per-node two-stage readiness probe, then publishes role-scoped endpoints. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if err := validateParams(p); err != nil { - return Result{}, err - } - p = withDefaults(p) - - names := make([]string, 0, p.Replicas) - for ordinal := 0; ordinal < p.Replicas; ordinal++ { - node, err := renderNode(p, ordinal) - if err != nil { - return Result{}, taskruntime.Task(fmt.Errorf("rendering template %s (ordinal %d): %w", p.TemplatePath, ordinal, err)) - } - stampMetadata(node, p, ordinal) - - if err := c.Create(ctx, node, fieldOwner); err != nil { - if !apierrors.IsAlreadyExists(err) { - return Result{}, taskruntime.Infra(fmt.Errorf("creating SeiNode %s/%s: %w", node.Namespace, node.Name, err)) - } - // Re-runs land here. Surface drift loudly so an operator who - // edited the template since the original Create knows the cluster - // is still at the original spec — we don't force-apply. - warnIfDrift(ctx, c, node) - } - names = append(names, node.Name) - } - - // Wait for all N to reach Running under one shared deadline. - if err := waitForRunning(ctx, c, p.Workflow.Namespace, names, p.RunningTimeout, p.PollInterval); err != nil { - return Result{}, err - } - - // Re-read each node post-Running for its .status.endpoint, then run the - // two-stage readiness probe before publishing. - nodes := make([]*seiv1alpha1.SeiNode, 0, len(names)) - httpClient := p.HTTPClient - if httpClient == nil { - httpClient = http.DefaultClient - } - for _, name := range names { - node := &seiv1alpha1.SeiNode{} - if err := c.Get(ctx, types.NamespacedName{Namespace: p.Workflow.Namespace, Name: name}, node); err != nil { - return Result{}, taskruntime.Infra(fmt.Errorf("re-reading SeiNode %s post-Running: %w", name, err)) - } - ep := node.Status.Endpoint - if ep == nil || ep.TendermintRpc == "" { - return Result{}, taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.tendermintRpc empty", name)) - } - // Stage 2 — TM readiness: the follower has joined consensus and is caught - // up (height>1 && catching_up==false), via the SDK's shared primitive. - if err := waitReady(ctx, httpClient, sei.WaitCaughtUp, ep.TendermintRpc, name, p.FirstBlockTimeout); err != nil { - return Result{}, err - } - // Stage 3 — EVM readiness: the JSON-RPC listener is bound before its URL - // enters RPC_EVM_RPC_LIST. A caught-up TM does NOT prove the EVM listener serves. - if ep.EvmJsonRpc == "" { - return Result{}, taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.evmJsonRpc empty", name)) - } - if err := waitReady(ctx, httpClient, sei.WaitEVMServing, ep.EvmJsonRpc, name, p.FirstBlockTimeout); err != nil { - return Result{}, err - } - nodes = append(nodes, node) - } - - chainID := p.Vars[string(taskruntime.KeyChainID)] - if chainID == "" && len(nodes) > 0 { - chainID = nodes[0].Spec.ChainID - } - - evmList, err := publishEndpoints(ctx, c, p.Workflow, p.Role, chainID, nodes) - if err != nil { - return Result{}, err - } - return Result{Names: names, ChainID: chainID, EVMRPCList: evmList}, nil -} - -func validateParams(p Params) error { - switch { - case p.Role == "": - return fmt.Errorf("provision-node: --role is required") - case p.TemplatePath == "": - return fmt.Errorf("provision-node: --template is required") - case p.Replicas < 1: - return fmt.Errorf("provision-node: --replicas must be >= 1, got %d", p.Replicas) - case p.Workflow.Name == "" || p.Workflow.Namespace == "": - return fmt.Errorf("provision-node: workflow identity not loaded") - } - // The runtime injects .ORDINAL and .NODE_NAME per replica; a --var on - // either would silently shadow them. Reject, mirroring the runner's - // --var NODE= guard under --per-node-selector. - if _, ok := p.Vars["ORDINAL"]; ok { - return fmt.Errorf("provision-node: --var ORDINAL=... collides with the runtime-injected .ORDINAL") - } - if _, ok := p.Vars["NODE_NAME"]; ok { - return fmt.Errorf("provision-node: --var NODE_NAME=... collides with the runtime-injected .NODE_NAME") - } - return nil -} - -func withDefaults(p Params) Params { - if p.Name == "" { - base := p.Workflow.Name - if cid := p.Vars[string(taskruntime.KeyChainID)]; cid != "" { - base = cid - } - p.Name = base + "-" + p.Role - } - if p.NetworkNamespace == "" { - p.NetworkNamespace = p.Workflow.Namespace - } - if p.RunningTimeout == 0 { - p.RunningTimeout = 15 * time.Minute - } - if p.FirstBlockTimeout == 0 { - p.FirstBlockTimeout = 5 * time.Minute - } - if p.PollInterval == 0 { - p.PollInterval = 5 * time.Second - } - return p -} - -// renderNode parses the template, executes it against the caller's vars plus -// the runtime-injected .ORDINAL and .NODE_NAME, then strict-unmarshals the -// rendered bytes into a SeiNode so field typos fail here, not at Create time. -func renderNode(p Params, ordinal int) (*seiv1alpha1.SeiNode, error) { - raw, err := os.ReadFile(p.TemplatePath) - if err != nil { - return nil, fmt.Errorf("read: %w", err) - } - tmpl, err := template.New(p.TemplatePath).Option("missingkey=error").Parse(string(raw)) - if err != nil { - return nil, fmt.Errorf("parse: %w", err) - } - ctxVars := make(map[string]string, len(p.Vars)+2) - maps.Copy(ctxVars, p.Vars) - ctxVars["ORDINAL"] = strconv.Itoa(ordinal) - ctxVars["NODE_NAME"] = nodeName(p.Name, ordinal) - - var buf bytes.Buffer - if err := tmpl.Execute(&buf, ctxVars); err != nil { - return nil, fmt.Errorf("execute: %w", err) - } - out := &seiv1alpha1.SeiNode{} - if err := yaml.UnmarshalStrict(buf.Bytes(), out); err != nil { - return nil, fmt.Errorf("unmarshal rendered yaml: %w", err) - } - return out, nil -} - -func nodeName(base string, ordinal int) string { - return base + "-" + strconv.Itoa(ordinal) -} - -// stampMetadata overwrites metadata fields the template MUST NOT control, -// stamps the shared object-label producer contract (§2.2a), and appends the -// synthesized peer source (§3). OwnerReferences are assigned (not appended) -// so a template that smuggles a bogus ref can't leak through. -func stampMetadata(node *seiv1alpha1.SeiNode, p Params, ordinal int) { - node.APIVersion = seiv1alpha1.GroupVersion.String() - node.Kind = "SeiNode" - node.Name = nodeName(p.Name, ordinal) - node.Namespace = p.Workflow.Namespace - node.OwnerReferences = []metav1.OwnerReference{p.Workflow.OwnerRef()} - - // Object-label producer contract — identical to `seictl node apply`. - if node.Labels == nil { - node.Labels = map[string]string{} - } - node.Labels[labelRole] = roleValueNode - if p.Network != "" { - node.Labels[labelSeiNetwork] = p.Network - } - - // Peer auto-wiring: synthesize the genesis-pool label source. Appended - // (not assigned) so a template's own static seed peers compose naturally. - if p.Network != "" { - node.Spec.Peers = append(node.Spec.Peers, seiv1alpha1.PeerSource{ - Label: &seiv1alpha1.LabelPeerSource{ - Selector: map[string]string{labelSeiNetwork: p.Network}, - Namespace: p.NetworkNamespace, - }, - }) - } -} - -// warnIfDrift logs when a re-run finds the on-cluster SeiNode.Spec different -// from the freshly-rendered one. Operators who edited the template since the -// original Create need to know the cluster still has the old spec. -func warnIfDrift(ctx context.Context, c client.Client, fresh *seiv1alpha1.SeiNode) { - existing := &seiv1alpha1.SeiNode{} - if err := c.Get(ctx, types.NamespacedName{Namespace: fresh.Namespace, Name: fresh.Name}, existing); err != nil { - return - } - if reflect.DeepEqual(existing.Spec, fresh.Spec) { - return - } - fmt.Fprintf(os.Stderr, "WARN: SeiNode %s/%s exists with spec different from rendered template; reusing on-cluster spec\n", fresh.Namespace, fresh.Name) -} - -// waitForRunning polls each of the named SeiNodes until .status.phase == -// PhaseRunning, failing fast on PhaseFailed. All N share one deadline. -func waitForRunning(ctx context.Context, c client.Client, ns string, names []string, timeout, interval time.Duration) error { - return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - for _, name := range names { - node := &seiv1alpha1.SeiNode{} - if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, node); err != nil { - if apierrors.IsNotFound(err) { - return false, nil - } - return false, taskruntime.Infra(fmt.Errorf("reading SeiNode %s: %w", name, err)) - } - switch node.Status.Phase { - case seiv1alpha1.PhaseRunning: - // this node done; check the rest - case seiv1alpha1.PhaseFailed: - return false, taskruntime.Task(fmt.Errorf("SeiNode %s reached Failed phase", name)) - default: - return false, nil - } - } - return true, nil - }) -} - -// waitReady runs an SDK readiness probe under a per-node deadline, mapping a -// failure into the workflow's Infra error class. The probe logic (TM caught-up, -// EVM serving) is the SDK's shared primitive — provisionnode no longer carries -// its own copy. -func waitReady(ctx context.Context, hc *http.Client, probe func(context.Context, *http.Client, string) error, url, node string, timeout time.Duration) error { - wctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - if err := probe(wctx, hc, url); err != nil { - return taskruntime.Infra(fmt.Errorf("SeiNode %s: %w", node, err)) - } - return nil -} - -// publishEndpoints assembles all five workflow-vars keys from the N per-node -// .status.endpoint scalars (a standalone SeiNode has no fleet aggregate) and -// writes them. Returns the assembled EVM CSV for the Result summary. -// -// Empty-guard (§6.4): every node's evmJsonRpc must be non-empty (a missing -// follower endpoint is a provisioning fault, not a filterable condition), and -// node-0's tendermintRpc must be non-empty before it feeds _TM_RPC / -// _REST (guards a future non-EVM role from emitting a garbage URL the -// chaos wait-for-caught-up probe would curl). -func publishEndpoints(ctx context.Context, c client.Client, w taskruntime.WorkflowIdentity, role, chainID string, nodes []*seiv1alpha1.SeiNode) (string, error) { - if len(nodes) == 0 { - return "", taskruntime.Infra(fmt.Errorf("provision-node: no SeiNodes to publish")) - } - - urls := make([]string, 0, len(nodes)) - for _, n := range nodes { // nodes ordered 0..N-1 - ep := n.Status.Endpoint - if ep == nil || ep.EvmJsonRpc == "" { - return "", taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.evmJsonRpc empty", n.Name)) - } - urls = append(urls, ep.EvmJsonRpc) - } - - node0 := nodes[0].Status.Endpoint - if node0.TendermintRpc == "" { - return "", taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.tendermintRpc empty", nodes[0].Name)) - } - if node0.TendermintRest == "" { - return "", taskruntime.Infra(fmt.Errorf("SeiNode %s Running but .status.endpoint.tendermintRest empty", nodes[0].Name)) - } - - evmList := strings.Join(urls, ",") - - if err := taskruntime.EnsureWorkflowVarsCM(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyRunID: w.Name, - }); err != nil { - return "", err - } - vars := map[taskruntime.VarKey]string{ - // CHAIN_ID lives in SetVars (merge), not the EnsureWorkflowVarsCM seed - // (no-op on AlreadyExists): the genesis provision step creates the CM - // first, so a CHAIN_ID seed here would be silently dropped. - taskruntime.KeyChainID: chainID, - taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPCList): evmList, - taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPC): node0.EvmJsonRpc, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintRPC): node0.TendermintRpc, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintREST): node0.TendermintRest, - } - if err := taskruntime.SetVars(ctx, c, w, vars); err != nil { - return "", err - } - return evmList, nil -} diff --git a/internal/seitask/provisionnode/provision_test.go b/internal/seitask/provisionnode/provision_test.go deleted file mode 100644 index d804bfe9..00000000 --- a/internal/seitask/provisionnode/provision_test.go +++ /dev/null @@ -1,618 +0,0 @@ -package provisionnode - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "os" - "path/filepath" - "sync" - "sync/atomic" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const ( - testNamespace = "nightly" - testWorkflowName = "wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" - testRole = "rpc" - testChainID = "bench-1" - testImage = "ghcr.io/sei/sei-chain:abc123" - testNetwork = "bench-1" - varKeyChainID = "CHAIN_ID" - varKeyImage = "IMAGE" - - testBase = testChainID + "-" + testRole // "bench-1-rpc" - testNode0 = testBase + "-0" // "bench-1-rpc-0" - testNode1 = testBase + "-1" // "bench-1-rpc-1" - - tmSyncInfoField = "sync_info" - tmHeightField = "latest_block_height" -) - -const fullNodeTmpl = `apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER -spec: - chainId: {{ .CHAIN_ID }} - image: {{ .IMAGE }} - fullNode: {} -` - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - for _, add := range []func(*runtime.Scheme) error{ - corev1.AddToScheme, - seiv1alpha1.AddToScheme, - } { - if err := add(s); err != nil { - t.Fatal(err) - } - } - return s -} - -func writeTmpl(t *testing.T, body string) string { - t.Helper() - dir := t.TempDir() - p := filepath.Join(dir, "node.yaml.tmpl") - if err := os.WriteFile(p, []byte(body), 0o600); err != nil { - t.Fatal(err) - } - return p -} - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -func baseParams() Params { - return Params{ - Role: testRole, - Name: testChainID + "-" + testRole, - TemplatePath: "x.yaml.tmpl", - Replicas: 2, - Workflow: testWorkflow(), - } -} - -// --- renderNode ----------------------------------------------------------- - -func TestRenderNode_SubstitutesVarsAndInjectsOrdinal(t *testing.T) { - path := writeTmpl(t, `apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: {{ .NODE_NAME }} -spec: - chainId: {{ .CHAIN_ID }} - image: {{ .IMAGE }} - fullNode: {} - overrides: - ordinal: "{{ .ORDINAL }}" -`) - p := baseParams() - p.Name = testBase - p.TemplatePath = path - p.Vars = map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - - node, err := renderNode(p, 1) - if err != nil { - t.Fatalf("renderNode: %v", err) - } - if node.Spec.ChainID != testChainID { - t.Errorf("ChainID = %q", node.Spec.ChainID) - } - if node.Spec.Image != testImage { - t.Errorf("Image = %q", node.Spec.Image) - } - // .NODE_NAME and .ORDINAL injected for ordinal 1. - if node.Name != testNode1 { - t.Errorf("NODE_NAME injection: name = %q, want bench-1-rpc-1", node.Name) - } - if got := node.Spec.Overrides["ordinal"]; got != "1" { - t.Errorf("ORDINAL injection: overrides[ordinal] = %q, want 1", got) - } -} - -func TestRenderNode_MissingVarFailsRender(t *testing.T) { - path := writeTmpl(t, fullNodeTmpl) - p := baseParams() - p.TemplatePath = path - p.Vars = map[string]string{varKeyChainID: testChainID} // IMAGE missing - if _, err := renderNode(p, 0); err == nil { - t.Fatalf("expected error: IMAGE not provided") - } -} - -func TestRenderNode_StrictUnmarshalCatchesTypos(t *testing.T) { - path := writeTmpl(t, `apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER -spec: - chainId: {{ .CHAIN_ID }} - imagge: {{ .IMAGE }} - fullNode: {} -`) - p := baseParams() - p.TemplatePath = path - p.Vars = map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - if _, err := renderNode(p, 0); err == nil { - t.Fatalf("expected strict-unmarshal error on `imagge` typo") - } -} - -// --- validateParams (collision guards) ------------------------------------ - -func TestValidateParams(t *testing.T) { - full := Params{ - Role: testRole, - TemplatePath: "x.yaml.tmpl", - Replicas: 2, - Workflow: testWorkflow(), - } - cases := []struct { - name string - mut func(*Params) - want bool - }{ - {"complete", func(*Params) {}, false}, - {"missing role", func(p *Params) { p.Role = "" }, true}, - {"missing template", func(p *Params) { p.TemplatePath = "" }, true}, - {"replicas zero", func(p *Params) { p.Replicas = 0 }, true}, - {"missing workflow.Name", func(p *Params) { p.Workflow.Name = "" }, true}, - {"ORDINAL collision", func(p *Params) { p.Vars = map[string]string{"ORDINAL": "x"} }, true}, - {"NODE_NAME collision", func(p *Params) { p.Vars = map[string]string{"NODE_NAME": "x"} }, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - p := full - tc.mut(&p) - err := validateParams(p) - if (err != nil) != tc.want { - t.Fatalf("validateParams err=%v wantErr=%v", err, tc.want) - } - }) - } -} - -// --- stampMetadata (object labels + peer wiring) -------------------------- - -func TestStampMetadata_NamingAndOwnerRef(t *testing.T) { - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{ - // Template author smuggling a bogus ownerRef: must be overwritten. - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "evil/v1", Kind: "Bogus", Name: "smuggled", UID: "bad", - }}, - }, - } - p := baseParams() - p.Name = testBase - stampMetadata(node, p, 1) - - if node.Name != testNode1 { - t.Fatalf("name = %q, want bench-1-rpc-1", node.Name) - } - if node.Namespace != testNamespace { - t.Fatalf("namespace = %q", node.Namespace) - } - if len(node.OwnerReferences) != 1 || node.OwnerReferences[0].Kind != "Workflow" { - t.Fatalf("ownerRef not replaced: %+v", node.OwnerReferences) - } -} - -func TestStampMetadata_ObjectLabels_WithNetwork(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = testNetwork - stampMetadata(node, p, 0) - - if got := node.Labels[labelRole]; got != roleValueNode { - t.Errorf("label %s = %q, want %q", labelRole, got, roleValueNode) - } - if got := node.Labels[labelSeiNetwork]; got != testNetwork { - t.Errorf("label %s = %q, want %q", labelSeiNetwork, got, testNetwork) - } - // Producer-contract literals — must match WS-A's seictl node apply. - if labelRole != "sei.io/role" || roleValueNode != "node" || labelSeiNetwork != "sei.io/seinetwork" { - t.Fatalf("object-label producer contract drifted: %s=%s, %s", labelRole, roleValueNode, labelSeiNetwork) - } -} - -func TestStampMetadata_ObjectLabels_NoNetwork_OmitsNetworkLabel(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = "" // no --network - stampMetadata(node, p, 0) - - if got := node.Labels[labelRole]; got != roleValueNode { - t.Errorf("label %s = %q, want %q (unconditional)", labelRole, got, roleValueNode) - } - if _, ok := node.Labels[labelSeiNetwork]; ok { - t.Errorf("label %s present without --network; must be OMITTED, not stamped empty", labelSeiNetwork) - } -} - -func TestStampMetadata_PeerWiring_WithNetwork(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = testNetwork - p.NetworkNamespace = "genesis-ns" - stampMetadata(node, p, 0) - - if len(node.Spec.Peers) != 1 { - t.Fatalf("peers = %d, want 1 synthesized", len(node.Spec.Peers)) - } - lbl := node.Spec.Peers[0].Label - if lbl == nil { - t.Fatalf("synthesized peer is not a LabelPeerSource: %+v", node.Spec.Peers[0]) - } - if got := lbl.Selector[labelSeiNetwork]; got != testNetwork { - t.Errorf("peer selector %s = %q, want %q", labelSeiNetwork, got, testNetwork) - } - if lbl.Namespace != "genesis-ns" { - t.Errorf("peer namespace = %q, want genesis-ns", lbl.Namespace) - } -} - -func TestStampMetadata_NoNetwork_NoSynthesizedPeer(t *testing.T) { - node := &seiv1alpha1.SeiNode{} - p := baseParams() - p.Network = "" - stampMetadata(node, p, 0) - if len(node.Spec.Peers) != 0 { - t.Fatalf("peers = %d, want 0 (no --network)", len(node.Spec.Peers)) - } -} - -func TestStampMetadata_PreservesTemplatePeer_Appends(t *testing.T) { - node := &seiv1alpha1.SeiNode{ - Spec: seiv1alpha1.SeiNodeSpec{ - Peers: []seiv1alpha1.PeerSource{{ - Static: &seiv1alpha1.StaticPeerSource{Addresses: []string{"id@1.2.3.4:26656"}}, - }}, - }, - } - p := baseParams() - p.Network = testNetwork - stampMetadata(node, p, 0) - - if len(node.Spec.Peers) != 2 { - t.Fatalf("peers = %d, want 2 (template static + synthesized label)", len(node.Spec.Peers)) - } - if node.Spec.Peers[0].Static == nil { - t.Errorf("template static peer not preserved as first element: %+v", node.Spec.Peers[0]) - } - if node.Spec.Peers[1].Label == nil { - t.Errorf("synthesized label peer not appended as second element: %+v", node.Spec.Peers[1]) - } -} - -// --- waitForRunning ------------------------------------------------------- - -func TestWaitForRunning(t *testing.T) { - cases := []struct { - name string - phase seiv1alpha1.SeiNodePhase - wantErr bool - taskErr bool // expect a task-class (terminal) error - }{ - {"running", seiv1alpha1.PhaseRunning, false, false}, - {"failed", seiv1alpha1.PhaseFailed, true, true}, - {"pending times out", seiv1alpha1.PhasePending, true, false}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: testNode0, Namespace: testNamespace}, - Status: seiv1alpha1.SeiNodeStatus{Phase: tc.phase}, - } - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(node). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - Build() - - err := waitForRunning(context.Background(), c, testNamespace, - []string{testNode0}, 200*time.Millisecond, 20*time.Millisecond) - if (err != nil) != tc.wantErr { - t.Fatalf("err=%v wantErr=%v", err, tc.wantErr) - } - if tc.taskErr && taskruntime.ExitCodeFor(err) != taskruntime.ExitTaskFailure { - t.Fatalf("Failed phase should yield a task-class error, got exit code %d", taskruntime.ExitCodeFor(err)) - } - }) - } -} - -// --- readiness probe (stage 2 TM, stage 3 EVM) ---------------------------- - -// Readiness probes (TM caught-up, EVM serving) moved to the SDK -// (sdk/sei/readiness_test.go); the Run-level tests below exercise them in situ. - -// TestRun_PublishBlockedWhileEVMDialFails is the finding-2 gate: TM reports -// height>0 but the EVM listener never binds, so publish must NOT proceed and -// no workflow-vars are written. -func TestRun_PublishBlockedWhileEVMDialFails(t *testing.T) { - w := testWorkflow() - tmplPath := writeTmpl(t, fullNodeTmpl) - vars := map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - - // TM /status answers height>0; EVM POST always 503 (never bound). - var tmHits atomic.Int32 - srv := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { - if r.Method == http.MethodGet { // TM /status - tmHits.Add(1) - _ = json.NewEncoder(rw).Encode(map[string]any{ - tmSyncInfoField: map[string]any{tmHeightField: "9"}, - }) - return - } - rw.WriteHeader(http.StatusServiceUnavailable) // EVM eth_blockNumber - })) - defer srv.Close() - - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: testChainID + "-" + testRole + "-0", Namespace: testNamespace}, - Spec: seiv1alpha1.SeiNodeSpec{ChainID: testChainID, FullNode: &seiv1alpha1.FullNodeSpec{}}, - Status: seiv1alpha1.SeiNodeStatus{ - Phase: seiv1alpha1.PhaseRunning, - Endpoint: &seiv1alpha1.NodeEndpointStatus{ - TendermintRpc: srv.URL, - TendermintRest: "http://rest.svc:1317", - EvmJsonRpc: srv.URL, // 503 path - }, - }, - } - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(node). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - Build() - - _, err := Run(context.Background(), c, Params{ - Role: testRole, - TemplatePath: tmplPath, - Vars: vars, - Replicas: 1, - RunningTimeout: time.Second, - FirstBlockTimeout: 150 * time.Millisecond, - PollInterval: 20 * time.Millisecond, - HTTPClient: srv.Client(), - Workflow: w, - }) - if err == nil { - t.Fatalf("Run should fail when EVM never binds even at TM height>0") - } - if tmHits.Load() == 0 { - t.Fatalf("TM stage was never reached") - } - // No workflow-vars CM must be written — publish was blocked. - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err == nil { - t.Fatalf("workflow-vars CM was written despite blocked publish: %+v", cm.Data) - } -} - -// --- publish assembly (the contract test) --------------------------------- - -func fakeNode(name, evm, tmRPC, tmREST string) *seiv1alpha1.SeiNode { - return &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: testNamespace}, - Status: seiv1alpha1.SeiNodeStatus{ - Phase: seiv1alpha1.PhaseRunning, - Endpoint: &seiv1alpha1.NodeEndpointStatus{ - EvmJsonRpc: evm, - TendermintRpc: tmRPC, - TendermintRest: tmREST, - }, - }, - } -} - -func TestPublishEndpoints_AssemblesAllFiveKeys(t *testing.T) { - w := testWorkflow() - nodes := []*seiv1alpha1.SeiNode{ - fakeNode("bench-1-rpc-0", "http://bench-1-rpc-0.nightly.svc:8545", "http://bench-1-rpc-0.nightly.svc:26657", "http://bench-1-rpc-0.nightly.svc:1317"), - fakeNode("bench-1-rpc-1", "http://bench-1-rpc-1.nightly.svc:8545", "http://bench-1-rpc-1.nightly.svc:26657", "http://bench-1-rpc-1.nightly.svc:1317"), - } - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - - evmList, err := publishEndpoints(context.Background(), c, w, testRole, testChainID, nodes) - if err != nil { - t.Fatalf("publishEndpoints: %v", err) - } - wantList := "http://bench-1-rpc-0.nightly.svc:8545,http://bench-1-rpc-1.nightly.svc:8545" - if evmList != wantList { - t.Fatalf("returned EVM list = %q, want %q", evmList, wantList) - } - - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("get CM: %v", err) - } - want := map[string]string{ - "CHAIN_ID": testChainID, - "RPC_EVM_RPC_LIST": wantList, - "RPC_EVM_RPC": "http://bench-1-rpc-0.nightly.svc:8545", // node-0 scalar - "RPC_TM_RPC": "http://bench-1-rpc-0.nightly.svc:26657", - "RPC_REST": "http://bench-1-rpc-0.nightly.svc:1317", - } - for k, v := range want { - if cm.Data[k] != v { - t.Errorf("CM[%s] = %q, want %q", k, cm.Data[k], v) - } - } -} - -func TestPublishEndpoints_EmptyGuards(t *testing.T) { - w := testWorkflow() - cases := []struct { - name string - nodes []*seiv1alpha1.SeiNode - }{ - { - "nil endpoint", - []*seiv1alpha1.SeiNode{{ObjectMeta: metav1.ObjectMeta{Name: "n0"}, Status: seiv1alpha1.SeiNodeStatus{Phase: seiv1alpha1.PhaseRunning}}}, - }, - { - "empty evmJsonRpc on node-1", - []*seiv1alpha1.SeiNode{ - fakeNode("n0", "http://n0:8545", "http://n0:26657", "http://n0:1317"), - fakeNode("n1", "", "http://n1:26657", "http://n1:1317"), - }, - }, - { - "empty tendermintRpc on node-0 (finding 6c)", - []*seiv1alpha1.SeiNode{fakeNode("n0", "http://n0:8545", "", "http://n0:1317")}, - }, - { - "empty tendermintRest on node-0", - []*seiv1alpha1.SeiNode{fakeNode("n0", "http://n0:8545", "http://n0:26657", "")}, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - _, err := publishEndpoints(context.Background(), c, w, testRole, testChainID, tc.nodes) - if err == nil { - t.Fatalf("expected infra-fail empty-guard error") - } - if taskruntime.ExitCodeFor(err) != taskruntime.ExitInfraError { - t.Fatalf("empty-guard should be infra-class, got exit code %d", taskruntime.ExitCodeFor(err)) - } - }) - } -} - -// --- Run end-to-end fan-out (naming + happy publish) ---------------------- - -func TestRun_FanOutNamingAndPublish(t *testing.T) { - w := testWorkflow() - tmplPath := writeTmpl(t, fullNodeTmpl) - vars := map[string]string{varKeyChainID: testChainID, varKeyImage: testImage} - - // Healthy TM + EVM for every node. - srv := healthyRPCServer(t) - defer srv.Close() - - // Pre-stage N=2 SeiNodes already Running with endpoints (the controller's - // job; we test seitask's wait+probe+publish, not reconcile). - objs := make([]*seiv1alpha1.SeiNode, 0, 2) - for i := range 2 { - n := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: nodeName(testChainID+"-"+testRole, i), Namespace: testNamespace}, - Spec: seiv1alpha1.SeiNodeSpec{ChainID: testChainID, FullNode: &seiv1alpha1.FullNodeSpec{}}, - Status: seiv1alpha1.SeiNodeStatus{ - Phase: seiv1alpha1.PhaseRunning, - Endpoint: &seiv1alpha1.NodeEndpointStatus{ - EvmJsonRpc: srv.URL, - TendermintRpc: srv.URL, - TendermintRest: "http://rest.svc:1317", - }, - }, - } - objs = append(objs, n) - } - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(objs[0], objs[1]). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - Build() - - res, err := Run(context.Background(), c, Params{ - Role: testRole, - Name: testChainID + "-" + testRole, - TemplatePath: tmplPath, - Vars: vars, - Replicas: 2, - Network: testNetwork, - RunningTimeout: time.Second, - FirstBlockTimeout: time.Second, - PollInterval: 10 * time.Millisecond, - HTTPClient: srv.Client(), - Workflow: w, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - wantNames := []string{testNode0, testNode1} - if len(res.Names) != 2 || res.Names[0] != wantNames[0] || res.Names[1] != wantNames[1] { - t.Fatalf("fan-out names = %v, want %v", res.Names, wantNames) - } - - // Pre-staged objects already exist (the AlreadyExists path), so the - // object-label producer contract is exercised by the stampMetadata unit - // tests; here we assert fan-out naming (above) and the publish CM (below). - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("get CM: %v", err) - } - if cm.Data["CHAIN_ID"] != testChainID { - t.Errorf("CHAIN_ID = %q", cm.Data["CHAIN_ID"]) - } - if cm.Data["RPC_EVM_RPC_LIST"] == "" { - t.Errorf("RPC_EVM_RPC_LIST empty") - } -} - -func healthyRPCServer(t *testing.T) *httptest.Server { - t.Helper() - var mu sync.Mutex - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - mu.Lock() - defer mu.Unlock() - if r.Method == http.MethodGet { // TM /status - _ = json.NewEncoder(w).Encode(map[string]any{ - tmSyncInfoField: map[string]any{tmHeightField: "12"}, - }) - return - } - _ = json.NewEncoder(w).Encode(map[string]any{"jsonrpc": "2.0", "id": 1, "result": "0x10"}) - })) - t.Cleanup(srv.Close) - return srv -} - -// TestBundledTemplates_RenderClean ensures every bundled SeiNode (rpc follower) -// scenario template renders + strict-unmarshals against a representative --var -// set. Guards against template-vs-schema drift after a CRD field rename. The -// SeiNetwork genesis/validator templates are validated in provisionsnd. -func TestBundledTemplates_RenderClean(t *testing.T) { - repoRoot, err := filepath.Abs("../../../") - if err != nil { - t.Fatal(err) - } - for _, scenario := range []string{"load-test", "release-test"} { - t.Run(scenario+"/rpc.yaml.tmpl", func(t *testing.T) { - p := Params{ - TemplatePath: filepath.Join(repoRoot, "scenarios", scenario, "rpc.yaml.tmpl"), - Vars: map[string]string{varKeyChainID: testChainID, varKeyImage: testImage}, - } - node, err := renderNode(p, 0) - if err != nil { - t.Fatalf("render: %v", err) - } - if node.Spec.ChainID != testChainID { - t.Fatalf("chainId = %q, want %q", node.Spec.ChainID, testChainID) - } - if node.Spec.FullNode == nil { - t.Fatalf("rpc template must render a fullNode SeiNode; spec = %+v", node.Spec) - } - }) - } -} diff --git a/internal/seitask/provisionsnd/provision.go b/internal/seitask/provisionsnd/provision.go deleted file mode 100644 index e1340180..00000000 --- a/internal/seitask/provisionsnd/provision.go +++ /dev/null @@ -1,308 +0,0 @@ -// Package provisionsnd implements `seitask provision-snd`: render a Go -// template to a SeiNetwork YAML, stamp an ownerRef to the parent -// Workflow, Create it, await Ready, poll the chain RPC for first block, -// then publish endpoints to workflow-vars under role-scoped keys -// (VALIDATOR_TM_RPC, RPC_EVM_RPC, etc.). -// -// Templates are scenario-intrinsic: the full SeiNetwork shape (mode, overrides, -// peers, genesis ceremony) lives in the template body as proper YAML. -// Per-run scalars (CHAIN_ID, IMAGE, ADMIN_ADDRESS, ...) flow in via --var -// and resolve at render time. Same `--template + --var` contract as the -// runner subcommand. -package provisionsnd - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "os" - "reflect" - "strings" - "text/template" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/yaml" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const fieldOwner client.FieldOwner = "seitask-provision-snd" - -// Params carries the typed inputs to Run. -type Params struct { - // Role tags the workflow-vars keys this Task writes (e.g. "validator", - // "rpc"). Required for scenarios with multiple provision-snd Tasks; - // values get uppercased to compose VALIDATOR_TM_RPC etc. - Role string - - // Name is the SeiNetwork metadata.name. Defaults to - // "-" when empty. - Name string - - // TemplatePath is the on-disk path to the Go text/template producing - // a SeiNetwork YAML. Required. - TemplatePath string - - // Vars are the template's substitution context (the .KEY map in - // template syntax). Missing keys referenced by the template fail - // rendering rather than silently expanding to empty strings. - Vars map[string]string - - // ReadyTimeout bounds the wait for status.phase=Ready. - ReadyTimeout time.Duration - - // FirstBlockTimeout bounds the post-Ready wait for the chain to produce - // its first block. - FirstBlockTimeout time.Duration - - // PollInterval is the interval between status reads and chain RPC reads. - PollInterval time.Duration - - // HTTPClient overrides the chain-RPC client; nil means http.DefaultClient. - // Tests use this seam. - HTTPClient *http.Client - - // Workflow is the parent Chaos Mesh Workflow identity (downward-API). - Workflow taskruntime.WorkflowIdentity -} - -// Result is the post-Run summary, returned so main can log it before exit. -type Result struct { - Name string - ChainID string - Endpoints seiv1alpha1.Endpoints -} - -// Run renders the template, creates the SeiNetwork with an ownerRef to the -// parent Workflow, waits for Ready, polls the chain RPC for first block, and -// writes role-scoped endpoints to workflow-vars. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if err := validateParams(p); err != nil { - return Result{}, err - } - p = withDefaults(p) - - snd, err := renderTemplate(p.TemplatePath, p.Vars) - if err != nil { - return Result{}, taskruntime.Task(fmt.Errorf("rendering template %s: %w", p.TemplatePath, err)) - } - stampMetadata(snd, p) - - if err := c.Create(ctx, snd, fieldOwner); err != nil { - if !apierrors.IsAlreadyExists(err) { - return Result{}, taskruntime.Infra(fmt.Errorf("creating SeiNetwork %s/%s: %w", snd.Namespace, snd.Name, err)) - } - // Re-runs land here. Surface drift loudly so an operator who edited - // the template since the original Create knows the cluster is still - // at the original spec — we don't force-apply to avoid clobbering - // hand-edits or in-flight reconciliation. - warnIfDrift(ctx, c, snd) - } - - if err := waitForReady(ctx, c, types.NamespacedName{Namespace: snd.Namespace, Name: snd.Name}, p.ReadyTimeout, p.PollInterval); err != nil { - return Result{}, err - } - - current := &seiv1alpha1.SeiNetwork{} - if err := c.Get(ctx, types.NamespacedName{Namespace: snd.Namespace, Name: snd.Name}, current); err != nil { - return Result{}, taskruntime.Infra(fmt.Errorf("re-reading SeiNetwork post-Ready: %w", err)) - } - if current.Status.Endpoints == nil || current.Status.Endpoints.TendermintRpc == "" { - return Result{}, taskruntime.Infra(fmt.Errorf("SeiNetwork %s reached Ready but .status.endpoints.tendermintRpc is empty", current.Name)) - } - endpoints := *current.Status.Endpoints - chainID := current.Spec.Genesis.ChainID - - httpClient := p.HTTPClient - if httpClient == nil { - httpClient = http.DefaultClient - } - if err := waitForFirstBlock(ctx, httpClient, endpoints.TendermintRpc, p.FirstBlockTimeout, p.PollInterval); err != nil { - return Result{}, err - } - - if err := publishEndpoints(ctx, c, p.Workflow, p.Role, chainID, endpoints); err != nil { - return Result{}, err - } - return Result{Name: snd.Name, ChainID: chainID, Endpoints: endpoints}, nil -} - -func validateParams(p Params) error { - switch { - case p.Role == "": - return fmt.Errorf("provision-snd: --role is required") - case p.TemplatePath == "": - return fmt.Errorf("provision-snd: --template is required") - case p.Workflow.Name == "" || p.Workflow.Namespace == "": - return fmt.Errorf("provision-snd: workflow identity not loaded") - } - return nil -} - -func withDefaults(p Params) Params { - if p.Name == "" { - p.Name = p.Workflow.Name + "-" + p.Role - } - if p.ReadyTimeout == 0 { - p.ReadyTimeout = 15 * time.Minute - } - if p.FirstBlockTimeout == 0 { - p.FirstBlockTimeout = 5 * time.Minute - } - if p.PollInterval == 0 { - p.PollInterval = 5 * time.Second - } - return p -} - -// renderTemplate parses the file at path as a Go text/template, executes it -// against vars (missing keys fail the render — `missingkey=error` option), -// then strict-unmarshals the rendered bytes into a SeiNetwork so -// typos in field names fail here, not at apiserver-Create time. -func renderTemplate(path string, vars map[string]string) (*seiv1alpha1.SeiNetwork, error) { - raw, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("read: %w", err) - } - tmpl, err := template.New(path).Option("missingkey=error").Parse(string(raw)) - if err != nil { - return nil, fmt.Errorf("parse: %w", err) - } - var buf bytes.Buffer - if err := tmpl.Execute(&buf, vars); err != nil { - return nil, fmt.Errorf("execute: %w", err) - } - out := &seiv1alpha1.SeiNetwork{} - if err := yaml.UnmarshalStrict(buf.Bytes(), out); err != nil { - return nil, fmt.Errorf("unmarshal rendered yaml: %w", err) - } - return out, nil -} - -// stampMetadata overwrites metadata fields the template MUST NOT control. -// OwnerReferences are assigned (not appended) so a template that smuggles -// a bogus ref can't leak through. -func stampMetadata(snd *seiv1alpha1.SeiNetwork, p Params) { - snd.APIVersion = seiv1alpha1.GroupVersion.String() - snd.Kind = "SeiNetwork" - snd.Name = p.Name - snd.Namespace = p.Workflow.Namespace - snd.OwnerReferences = []metav1.OwnerReference{p.Workflow.OwnerRef()} -} - -// warnIfDrift logs when a re-run finds the on-cluster SeiNetwork.Spec different -// from the freshly-rendered one. Operators who edited the template since -// the original Create need to know the cluster still has the old spec. -func warnIfDrift(ctx context.Context, c client.Client, fresh *seiv1alpha1.SeiNetwork) { - existing := &seiv1alpha1.SeiNetwork{} - if err := c.Get(ctx, types.NamespacedName{Namespace: fresh.Namespace, Name: fresh.Name}, existing); err != nil { - return - } - if reflect.DeepEqual(existing.Spec, fresh.Spec) { - return - } - fmt.Fprintf(os.Stderr, "WARN: SeiNetwork %s/%s exists with spec different from rendered template; reusing on-cluster spec\n", fresh.Namespace, fresh.Name) -} - -func waitForReady(ctx context.Context, c client.Client, key types.NamespacedName, timeout, interval time.Duration) error { - return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - snd := &seiv1alpha1.SeiNetwork{} - if err := c.Get(ctx, key, snd); err != nil { - if apierrors.IsNotFound(err) { - return false, nil - } - return false, taskruntime.Infra(fmt.Errorf("reading SeiNetwork %s: %w", key, err)) - } - switch snd.Status.Phase { - case seiv1alpha1.GroupPhaseReady: - return true, nil - case seiv1alpha1.GroupPhaseFailed: - return false, taskruntime.Task(fmt.Errorf("SeiNetwork %s reached Failed phase", key)) - } - return false, nil - }) -} - -// tendermintStatusResponse models the subset of Tendermint /status we need. -// Sei's CometBFT fork sometimes returns the body unwrapped (no JSON-RPC -// envelope), so we accept both shapes and fall back via Result/SyncInfo. -type tendermintStatusResponse struct { - Result *struct { - SyncInfo struct { - LatestBlockHeight string `json:"latest_block_height"` - } `json:"sync_info"` - } `json:"result,omitempty"` - SyncInfo struct { - LatestBlockHeight string `json:"latest_block_height"` - } `json:"sync_info"` -} - -func (r *tendermintStatusResponse) latestHeight() string { - if r.Result != nil && r.Result.SyncInfo.LatestBlockHeight != "" { - return r.Result.SyncInfo.LatestBlockHeight - } - return r.SyncInfo.LatestBlockHeight -} - -func waitForFirstBlock(ctx context.Context, hc *http.Client, tmRPC string, timeout, interval time.Duration) error { - return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, tmRPC+"/status", nil) - if err != nil { - return false, taskruntime.Infra(fmt.Errorf("status req: %w", err)) - } - resp, err := hc.Do(req) - if err != nil { - return false, nil - } - defer func() { _ = resp.Body.Close() }() - if resp.StatusCode != http.StatusOK { - return false, nil - } - var parsed tendermintStatusResponse - if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil { - return false, nil - } - h := parsed.latestHeight() - if h == "" || h == "0" { - return false, nil - } - return true, nil - }) -} - -// publishEndpoints assumes one chain-id per Workflow. CHAIN_ID is written via -// SetVars (a merge patch) — running provision-snd twice with the same -// --var=CHAIN_ID is idempotent; running it against two distinct chains -// silently overwrites and needs an explicit conflict check here. -func publishEndpoints(ctx context.Context, c client.Client, w taskruntime.WorkflowIdentity, role, chainID string, ep seiv1alpha1.Endpoints) error { - if err := taskruntime.EnsureWorkflowVarsCM(ctx, c, w, map[taskruntime.VarKey]string{ - taskruntime.KeyRunID: w.Name, - }); err != nil { - return err - } - vars := map[taskruntime.VarKey]string{ - // CHAIN_ID lives in SetVars (merge), not the EnsureWorkflowVarsCM seed - // (no-op on AlreadyExists): keygen-admin runs first and creates the - // CM, so a CHAIN_ID seed here would be silently dropped. - taskruntime.KeyChainID: chainID, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintRPC): ep.TendermintRpc, - taskruntime.RoleScoped(role, taskruntime.KeyTendermintREST): ep.TendermintRest, - } - if len(ep.Nodes) > 0 { - vars[taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPC)] = ep.Nodes[0].EvmJsonRpc - urls := make([]string, 0, len(ep.Nodes)) - for _, n := range ep.Nodes { - urls = append(urls, n.EvmJsonRpc) - } - vars[taskruntime.RoleScoped(role, taskruntime.KeyEVMJSONRPCList)] = strings.Join(urls, ",") - } - return taskruntime.SetVars(ctx, c, w, vars) -} diff --git a/internal/seitask/provisionsnd/provision_test.go b/internal/seitask/provisionsnd/provision_test.go deleted file mode 100644 index 6b97fde1..00000000 --- a/internal/seitask/provisionsnd/provision_test.go +++ /dev/null @@ -1,345 +0,0 @@ -package provisionsnd - -import ( - "context" - "encoding/json" - "net/http" - "net/http/httptest" - "os" - "path/filepath" - "sync" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const ( - testNamespace = "nightly" - testWorkflowName = "wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" - testRole = "validator" - testChainID = "bench-1" - testImage = "ghcr.io/sei/sei-chain:abc123" - testAdminAddress = "sei1admin" - varKeyChainID = "CHAIN_ID" - varKeyImage = "IMAGE" - varKeyAdminAddress = "ADMIN_ADDRESS" -) - -const validatorTmpl = `apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: {{ .IMAGE }} - replicas: 4 - genesis: - chainId: {{ .CHAIN_ID }} - accounts: - - address: {{ .ADMIN_ADDRESS }} - balance: 1000000000000usei -` - -const configOverridesTmpl = `apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: {{ .IMAGE }} - replicas: 2 - genesis: - chainId: {{ .CHAIN_ID }} - configOverrides: - evm.http_port: "8545" -` - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - for _, add := range []func(*runtime.Scheme) error{ - corev1.AddToScheme, - seiv1alpha1.AddToScheme, - } { - if err := add(s); err != nil { - t.Fatal(err) - } - } - return s -} - -func writeTmpl(t *testing.T, body string) string { - t.Helper() - dir := t.TempDir() - p := filepath.Join(dir, "snd.yaml.tmpl") - if err := os.WriteFile(p, []byte(body), 0o600); err != nil { - t.Fatal(err) - } - return p -} - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -func TestRenderTemplate_SubstitutesVars(t *testing.T) { - path := writeTmpl(t, validatorTmpl) - snd, err := renderTemplate(path, map[string]string{ - varKeyChainID: testChainID, - varKeyImage: testImage, - varKeyAdminAddress: testAdminAddress, - }) - if err != nil { - t.Fatalf("renderTemplate: %v", err) - } - if snd.Spec.Image != testImage { - t.Errorf("Image: %q", snd.Spec.Image) - } - if snd.Spec.Genesis.ChainID != testChainID { - t.Errorf("Genesis.ChainID: %q", snd.Spec.Genesis.ChainID) - } - if len(snd.Spec.Genesis.Accounts) != 1 || snd.Spec.Genesis.Accounts[0].Address != testAdminAddress { - t.Errorf("Genesis.Accounts: %+v", snd.Spec.Genesis.Accounts) - } -} - -func TestRenderTemplate_MissingVarFailsRender(t *testing.T) { - path := writeTmpl(t, validatorTmpl) - if _, err := renderTemplate(path, map[string]string{varKeyChainID: testChainID}); err == nil { - t.Fatalf("expected error: IMAGE and ADMIN_ADDRESS not provided") - } -} - -func TestRenderTemplate_StrictUnmarshalCatchesTypos(t *testing.T) { - tmpl := `apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - replcas: 4 - image: {{ .IMAGE }} - genesis: - chainId: {{ .CHAIN_ID }} -` - path := writeTmpl(t, tmpl) - if _, err := renderTemplate(path, map[string]string{varKeyChainID: testChainID, varKeyImage: testImage}); err == nil { - t.Fatalf("expected strict-unmarshal error on `replcas` typo") - } -} - -func TestRenderTemplate_ConfigOverridesSubstitution(t *testing.T) { - path := writeTmpl(t, configOverridesTmpl) - snd, err := renderTemplate(path, map[string]string{ - varKeyChainID: testChainID, - varKeyImage: testImage, - }) - if err != nil { - t.Fatalf("renderTemplate: %v", err) - } - if got := snd.Spec.ConfigOverrides["evm.http_port"]; got != "8545" { - t.Errorf("configOverrides[evm.http_port] = %q; want %q", got, "8545") - } - if snd.Spec.Genesis.ChainID != testChainID { - t.Errorf("Genesis.ChainID = %q; want %q", snd.Spec.Genesis.ChainID, testChainID) - } -} - -// TestBundledTemplates_RenderClean ensures every bundled SeiNetwork scenario -// template (genesis/validator) renders + strict-unmarshals against a -// representative --var set. Guards against template-vs-schema drift after a -// CRD field rename. The SeiNode rpc.yaml.tmpl is validated in provisionnode. -func TestBundledTemplates_RenderClean(t *testing.T) { - repoRoot, err := filepath.Abs("../../../") - if err != nil { - t.Fatal(err) - } - cases := []struct { - path string - vars map[string]string - }{ - { - path: filepath.Join(repoRoot, "scenarios", "release-test", "validator.yaml.tmpl"), - vars: map[string]string{varKeyChainID: "rel-test", varKeyImage: "img:1", varKeyAdminAddress: testAdminAddress}, - }, - } - for _, tc := range cases { - t.Run(filepath.Base(tc.path), func(t *testing.T) { - snd, err := renderTemplate(tc.path, tc.vars) - if err != nil { - t.Fatalf("render: %v", err) - } - if snd.Spec.Genesis.ChainID == "" { - t.Fatalf("genesis.chainId empty after render: %+v", snd.Spec) - } - }) - } -} - -func TestStampMetadata_AssignsOwnerRefsNotAppend(t *testing.T) { - snd := &seiv1alpha1.SeiNetwork{ - // Template author smuggling a bogus ownerRef: stampMetadata MUST - // overwrite, not append. - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "evil/v1", Kind: "Bogus", Name: "smuggled", UID: "bad", - }}, - }, - } - stampMetadata(snd, Params{Role: testRole, Name: testRole, Workflow: testWorkflow()}) - if len(snd.OwnerReferences) != 1 { - t.Fatalf("ownerReferences: want 1, got %d (%+v)", len(snd.OwnerReferences), snd.OwnerReferences) - } - if snd.OwnerReferences[0].Kind != "Workflow" { - t.Fatalf("ownerRef not replaced: %+v", snd.OwnerReferences[0]) - } -} - -func TestValidateParams(t *testing.T) { - full := Params{ - Role: testRole, - TemplatePath: "x.yaml.tmpl", - Workflow: testWorkflow(), - } - cases := []struct { - name string - mut func(*Params) - want bool - }{ - {"complete", func(*Params) {}, false}, - {"missing role", func(p *Params) { p.Role = "" }, true}, - {"missing template", func(p *Params) { p.TemplatePath = "" }, true}, - {"missing workflow.Name", func(p *Params) { p.Workflow.Name = "" }, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - p := full - tc.mut(&p) - err := validateParams(p) - if (err != nil) != tc.want { - t.Fatalf("validateParams err=%v wantErr=%v", err, tc.want) - } - }) - } -} - -func TestRun_EndToEnd_FakeClient(t *testing.T) { - w := testWorkflow() - tmplPath := writeTmpl(t, validatorTmpl) - vars := map[string]string{varKeyChainID: testChainID, varKeyImage: testImage, varKeyAdminAddress: testAdminAddress} - - prestaged, err := renderTemplate(tmplPath, vars) - if err != nil { - t.Fatal(err) - } - stampMetadata(prestaged, Params{Role: testRole, Name: testRole, Workflow: w}) - prestaged.Status.Phase = seiv1alpha1.GroupPhaseReady - prestaged.Status.Endpoints = &seiv1alpha1.Endpoints{ - TendermintRpc: "http://tm.svc:26657", - TendermintRest: "http://rest.svc:1317", - Nodes: []seiv1alpha1.NodeEndpoint{ - {Name: "validator-0", EvmJsonRpc: "http://evm-0.svc:8545"}, - {Name: "validator-1", EvmJsonRpc: "http://evm-1.svc:8545"}, - }, - } - - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(prestaged). - WithStatusSubresource(&seiv1alpha1.SeiNetwork{}). - Build() - - srv := fakeStatusServer(t, "42") - defer srv.Close() - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testRole}, prestaged); err != nil { - t.Fatal(err) - } - prestaged.Status.Endpoints.TendermintRpc = srv.URL - if err := c.Status().Update(context.Background(), prestaged); err != nil { - t.Fatal(err) - } - - res, err := Run(context.Background(), c, Params{ - Role: testRole, - Name: testRole, - TemplatePath: tmplPath, - Vars: vars, - ReadyTimeout: 2 * time.Second, - FirstBlockTimeout: 2 * time.Second, - PollInterval: 10 * time.Millisecond, - HTTPClient: srv.Client(), - Workflow: w, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.Name != testRole || res.ChainID != testChainID { - t.Fatalf("Result: %+v", res) - } - - cm := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, cm); err != nil { - t.Fatalf("get CM: %v", err) - } - if got := cm.Data["CHAIN_ID"]; got != testChainID { - t.Fatalf("CHAIN_ID = %q", got) - } - if cm.Data["VALIDATOR_TM_RPC"] == "" { - t.Fatalf("VALIDATOR_TM_RPC empty") - } - if cm.Data["VALIDATOR_EVM_RPC"] != "http://evm-0.svc:8545" { - t.Fatalf("VALIDATOR_EVM_RPC = %q (want pod-0 only)", cm.Data["VALIDATOR_EVM_RPC"]) - } - if cm.Data["VALIDATOR_EVM_RPC_LIST"] != "http://evm-0.svc:8545,http://evm-1.svc:8545" { - t.Fatalf("VALIDATOR_EVM_RPC_LIST = %q (want comma-separated all-pod URLs)", cm.Data["VALIDATOR_EVM_RPC_LIST"]) - } -} - -func fakeStatusServer(t *testing.T, height string) *httptest.Server { - t.Helper() - var ( - mu sync.Mutex - calls int - ) - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - mu.Lock() - calls++ - mu.Unlock() - _ = json.NewEncoder(w).Encode(map[string]any{ - "sync_info": map[string]any{ - "latest_block_height": height, - }, - }) - })) - t.Cleanup(srv.Close) - return srv -} - -func TestTendermintStatusResponse_LatestHeight(t *testing.T) { - cases := []struct { - name string - body string - want string - }{ - {"jsonrpc envelope", `{"result":{"sync_info":{"latest_block_height":"42"}}}`, "42"}, - {"bare", `{"sync_info":{"latest_block_height":"7"}}`, "7"}, - {"empty", `{}`, ""}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - var r tendermintStatusResponse - if err := json.Unmarshal([]byte(tc.body), &r); err != nil { - t.Fatal(err) - } - if got := r.latestHeight(); got != tc.want { - t.Fatalf("got %q, want %q", got, tc.want) - } - }) - } -} diff --git a/internal/seitask/uploadreport/upload.go b/internal/seitask/uploadreport/upload.go deleted file mode 100644 index d43e0c94..00000000 --- a/internal/seitask/uploadreport/upload.go +++ /dev/null @@ -1,183 +0,0 @@ -// Package uploadreport implements `seitask upload-report`: collect Workflow -// observability artifacts that Loki doesn't index — workflow-vars CM, the -// parent Workflow CR, the WorkflowNode tree — and upload them to S3 under -// a per-run prefix. -// -// Pod stdout/stderr is NOT uploaded; Alloy + Loki on the cluster already -// ingest every Task pod's logs indexed by chaos-mesh.org/workflow. -// upload-report's job is the K8s resource snapshot Loki can't give you: -// the structural record of what fired and how each step terminated. -// -// Runs as the final step of a scenario. The subcommand's exit code mirrors -// the EXIT_REASON workflow-vars value so the Workflow's terminal phase -// reflects scenario outcome rather than upload-step success. Does NOT -// write EXIT_REASON — upload-report is the terminal observer, not a -// producer of the upstream verdict. -// -// +kubebuilder:rbac:groups=chaos-mesh.org,resources=workflows;workflownodes,verbs=get;list -package uploadreport - -import ( - "bytes" - "context" - "fmt" - "strings" - - "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/service/s3" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/yaml" - - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -// S3Uploader is the seam tests inject. Production wires *s3.Client. -type S3Uploader interface { - Put(ctx context.Context, bucket, key string, body []byte) error -} - -// Workflow + WorkflowNode are chaos-mesh.org/v1alpha1 CRs. unstructured so -// the binary doesn't depend on the chaos-mesh Go types for read-only -// artifact collection. -const ( - chaosMeshGroup = "chaos-mesh.org" - chaosMeshVersion = "v1alpha1" - chaosMeshWFLabel = chaosMeshGroup + "/workflow" -) - -var ( - workflowGVK = schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "Workflow"} - workflowNodeGVK = schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "WorkflowNode"} -) - -// Params carries the typed inputs to Run. -type Params struct { - Bucket string - Prefix string // S3 key prefix; leading/trailing slashes are trimmed. - Workflow taskruntime.WorkflowIdentity - - S3 S3Uploader -} - -type Result struct { - UploadedKeys []string - ExitReason taskruntime.ExitReason -} - -// Run uploads workflow-vars + Workflow CR + WorkflowNode tree under -// s3:////. Reads EXIT_REASON from workflow-vars so the -// caller can propagate it into the process's exit code. -func Run(ctx context.Context, c client.Client, p Params) (Result, error) { - if err := validate(p); err != nil { - return Result{}, err - } - prefix := strings.Trim(p.Prefix, "/") - res := Result{ExitReason: taskruntime.ExitReasonPass} - - if err := uploadWorkflowVars(ctx, c, p, prefix, &res); err != nil { - return res, err - } - if err := uploadWorkflowResources(ctx, c, p, prefix, &res); err != nil { - return res, err - } - return res, nil -} - -func validate(p Params) error { - switch { - case p.Bucket == "": - return fmt.Errorf("upload-report: --bucket is required") - case p.Prefix == "": - return fmt.Errorf("upload-report: --prefix is required") - case p.Workflow.Name == "" || p.Workflow.Namespace == "": - return fmt.Errorf("upload-report: workflow identity not loaded") - case p.S3 == nil: - return fmt.Errorf("upload-report: S3Uploader seam is required") - } - return nil -} - -func uploadWorkflowVars(ctx context.Context, c client.Client, p Params, prefix string, res *Result) error { - cm := &corev1.ConfigMap{} - err := c.Get(ctx, types.NamespacedName{Namespace: p.Workflow.Namespace, Name: taskruntime.WorkflowVarsName(p.Workflow.Name)}, cm) - switch { - case apierrors.IsNotFound(err): - // No prior Task initialized the CM — pass, nothing to upload. - return nil - case err != nil: - return taskruntime.Infra(fmt.Errorf("reading workflow-vars: %w", err)) - } - if reason := taskruntime.ExitReason(cm.Data[string(taskruntime.KeyExitReason)]); reason != "" { - res.ExitReason = reason - } - body, err := yaml.Marshal(cm.Data) - if err != nil { - return taskruntime.Infra(fmt.Errorf("marshal workflow-vars: %w", err)) - } - return putAt(ctx, p, prefix+"/workflow-vars.yaml", body, res) -} - -// uploadWorkflowResources serializes the parent Workflow CR + every -// WorkflowNode owned by it. The WorkflowNode tree is the canonical record -// of which Task fired, in what order, and how each terminated — the -// structural artifact Loki can't give you from log queries alone. -func uploadWorkflowResources(ctx context.Context, c client.Client, p Params, prefix string, res *Result) error { - wf := &unstructured.Unstructured{} - wf.SetGroupVersionKind(workflowGVK) - if err := c.Get(ctx, types.NamespacedName{Namespace: p.Workflow.Namespace, Name: p.Workflow.Name}, wf); err != nil { - if apierrors.IsNotFound(err) { - return nil // workflow was deleted before our turn; nothing to upload - } - return taskruntime.Infra(fmt.Errorf("reading Workflow CR: %w", err)) - } - body, err := yaml.Marshal(wf.Object) - if err != nil { - return taskruntime.Infra(fmt.Errorf("marshal Workflow CR: %w", err)) - } - if err := putAt(ctx, p, prefix+"/workflow.yaml", body, res); err != nil { - return err - } - - nodes := &unstructured.UnstructuredList{} - nodes.SetGroupVersionKind(workflowNodeGVK) - if err := c.List(ctx, nodes, - client.InNamespace(p.Workflow.Namespace), - client.MatchingLabels{chaosMeshWFLabel: p.Workflow.Name}, - ); err != nil { - return taskruntime.Infra(fmt.Errorf("listing WorkflowNodes: %w", err)) - } - body, err = yaml.Marshal(nodes.Object) - if err != nil { - return taskruntime.Infra(fmt.Errorf("marshal WorkflowNodes: %w", err)) - } - return putAt(ctx, p, prefix+"/workflownodes.yaml", body, res) -} - -func putAt(ctx context.Context, p Params, key string, body []byte, res *Result) error { - if err := p.S3.Put(ctx, p.Bucket, key, body); err != nil { - return taskruntime.Infra(fmt.Errorf("upload %s: %w", key, err)) - } - res.UploadedKeys = append(res.UploadedKeys, key) - return nil -} - -// NewS3Uploader wraps an *s3.Client as an S3Uploader. -func NewS3Uploader(s3c *s3.Client) S3Uploader { - return &s3Uploader{client: s3c} -} - -type s3Uploader struct{ client *s3.Client } - -func (u *s3Uploader) Put(ctx context.Context, bucket, key string, body []byte) error { - _, err := u.client.PutObject(ctx, &s3.PutObjectInput{ - Bucket: aws.String(bucket), - Key: aws.String(key), - Body: bytes.NewReader(body), - }) - return err -} diff --git a/internal/seitask/uploadreport/upload_test.go b/internal/seitask/uploadreport/upload_test.go deleted file mode 100644 index 1fb4f888..00000000 --- a/internal/seitask/uploadreport/upload_test.go +++ /dev/null @@ -1,229 +0,0 @@ -package uploadreport - -import ( - "context" - "errors" - "strings" - "sync" - "testing" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - "github.com/sei-protocol/sei-k8s-controller/internal/taskruntime" -) - -const ( - testBucket = "harbor-validation-results" - testPrefix = "nightly/release-test/wf-test" - testNamespace = "nightly" - testWorkflowName = "wf-test" -) - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - if err := corev1.AddToScheme(s); err != nil { - t.Fatal(err) - } - return s -} - -func testWorkflow() taskruntime.WorkflowIdentity { - return taskruntime.WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -// fakeS3 records every PutObject so tests can assert keys + bodies. -type fakeS3 struct { - mu sync.Mutex - objects map[string][]byte - failOn string // bucket+"/"+key prefix to fail on -} - -func newFakeS3() *fakeS3 { return &fakeS3{objects: map[string][]byte{}} } - -func (s *fakeS3) Put(_ context.Context, bucket, key string, body []byte) error { - s.mu.Lock() - defer s.mu.Unlock() - full := bucket + "/" + key - if s.failOn != "" && strings.HasPrefix(full, s.failOn) { - return errors.New("s3 simulated failure") - } - s.objects[full] = append([]byte(nil), body...) - return nil -} - -func workflowVarsCM(data map[string]string) *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: taskruntime.WorkflowVarsName(testWorkflowName), - Namespace: testNamespace, - }, - Data: data, - } -} - -func workflowCR() *unstructured.Unstructured { - u := &unstructured.Unstructured{} - u.SetGroupVersionKind(schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "Workflow"}) - u.SetName(testWorkflowName) - u.SetNamespace(testNamespace) - u.Object["status"] = map[string]any{"phase": "Succeed"} - return u -} - -func workflowNode(name string) *unstructured.Unstructured { - u := &unstructured.Unstructured{} - u.SetGroupVersionKind(schema.GroupVersionKind{Group: chaosMeshGroup, Version: chaosMeshVersion, Kind: "WorkflowNode"}) - u.SetName(name) - u.SetNamespace(testNamespace) - u.SetLabels(map[string]string{chaosMeshWFLabel: testWorkflowName}) - return u -} - -func TestRun_UploadsAllArtifacts(t *testing.T) { - c := fake.NewClientBuilder(). - WithScheme(newScheme(t)). - WithObjects(workflowVarsCM(map[string]string{ - string(taskruntime.KeyRunID): testWorkflowName, - string(taskruntime.KeyExitReason): string(taskruntime.ExitReasonPass), - })). - WithObjects(workflowCR(), workflowNode("step-1"), workflowNode("step-2")). - Build() - s3 := newFakeS3() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: s3, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonPass { - t.Fatalf("ExitReason = %q", res.ExitReason) - } - for _, k := range []string{ - testBucket + "/" + testPrefix + "/workflow-vars.yaml", - testBucket + "/" + testPrefix + "/workflow.yaml", - testBucket + "/" + testPrefix + "/workflownodes.yaml", - } { - if _, ok := s3.objects[k]; !ok { - t.Fatalf("expected S3 key %q, got %v", k, keysOf(s3)) - } - } -} - -func TestRun_PropagatesEXITReasonInfraFail(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects( - workflowVarsCM(map[string]string{string(taskruntime.KeyExitReason): string(taskruntime.ExitReasonInfraFail)}), - ).Build() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3(), - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonInfraFail { - t.Fatalf("ExitReason = %q", res.ExitReason) - } -} - -func TestRun_PropagatesEXITReasonTaskFail(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects( - workflowVarsCM(map[string]string{string(taskruntime.KeyExitReason): string(taskruntime.ExitReasonTaskFail)}), - ).Build() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3(), - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonTaskFail { - t.Fatalf("ExitReason = %q", res.ExitReason) - } -} - -func TestRun_NoWorkflowVarsCMTreatsAsPass(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - - res, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3(), - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - if res.ExitReason != taskruntime.ExitReasonPass { - t.Fatalf("ExitReason = %q", res.ExitReason) - } -} - -func TestRun_NormalizesLeadingAndTrailingSlashInPrefix(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(workflowCR()).Build() - s3 := newFakeS3() - - _, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: "/" + testPrefix + "/", Workflow: testWorkflow(), S3: s3, - }) - if err != nil { - t.Fatalf("Run: %v", err) - } - want := testBucket + "/" + testPrefix + "/workflow.yaml" - if _, ok := s3.objects[want]; !ok { - t.Fatalf("expected normalized key %q, got %v", want, keysOf(s3)) - } -} - -func TestRun_S3UploadFailureIsInfraError(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(workflowCR()).Build() - s3 := newFakeS3() - s3.failOn = testBucket + "/" + testPrefix - - _, err := Run(context.Background(), c, Params{ - Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: s3, - }) - if err == nil { - t.Fatalf("expected error") - } - var infra *taskruntime.InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T: %v", err, err) - } -} - -func TestValidate(t *testing.T) { - full := Params{Bucket: testBucket, Prefix: testPrefix, Workflow: testWorkflow(), S3: newFakeS3()} - cases := []struct { - name string - mut func(*Params) - want bool - }{ - {"complete", func(*Params) {}, false}, - {"missing bucket", func(p *Params) { p.Bucket = "" }, true}, - {"missing prefix", func(p *Params) { p.Prefix = "" }, true}, - {"missing workflow", func(p *Params) { p.Workflow.Name = "" }, true}, - {"missing s3 seam", func(p *Params) { p.S3 = nil }, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - p := full - tc.mut(&p) - err := validate(p) - if (err != nil) != tc.want { - t.Fatalf("validate err=%v wantErr=%v", err, tc.want) - } - }) - } -} - -func keysOf(s *fakeS3) []string { - out := make([]string, 0, len(s.objects)) - for k := range s.objects { - out = append(out, k) - } - return out -} diff --git a/internal/taskruntime/cm.go b/internal/taskruntime/cm.go deleted file mode 100644 index ee0d4678..00000000 --- a/internal/taskruntime/cm.go +++ /dev/null @@ -1,84 +0,0 @@ -package taskruntime - -import ( - "context" - "fmt" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// cmFieldOwner identifies this library to server-side-apply conflict detection. -const cmFieldOwner client.FieldOwner = "seitask" - -// EnsureWorkflowVarsCM creates the per-run workflow-vars ConfigMap with an -// ownerRef + optional seed entries. AlreadyExists is treated as success -// (idempotent). Called before any SetVar. -func EnsureWorkflowVarsCM(ctx context.Context, c client.Client, w WorkflowIdentity, seed map[VarKey]string) error { - cm := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: WorkflowVarsName(w.Name), - Namespace: w.Namespace, - OwnerReferences: []metav1.OwnerReference{w.OwnerRef()}, - }, - Data: stringifyKeys(seed), - } - err := c.Create(ctx, cm, cmFieldOwner) - if err == nil { - return nil - } - if apierrors.IsAlreadyExists(err) { - return nil - } - return Infra(fmt.Errorf("creating workflow-vars ConfigMap: %w", err)) -} - -// SetVar writes one key via SetVars. -func SetVar(ctx context.Context, c client.Client, w WorkflowIdentity, key VarKey, value string) error { - return SetVars(ctx, c, w, map[VarKey]string{key: value}) -} - -// SetVars merges multiple keys with MergeFromWithOptimisticLock (matches the -// status-patch discipline in CLAUDE.md). Caller retries on IsConflict if the -// flow is known-racy. -func SetVars(ctx context.Context, c client.Client, w WorkflowIdentity, kv map[VarKey]string) error { - if len(kv) == 0 { - return nil - } - current := &corev1.ConfigMap{} - if err := c.Get(ctx, types.NamespacedName{Namespace: w.Namespace, Name: WorkflowVarsName(w.Name)}, current); err != nil { - return Infra(fmt.Errorf("reading workflow-vars ConfigMap: %w", err)) - } - patch := client.MergeFromWithOptions(current.DeepCopy(), client.MergeFromWithOptimisticLock{}) - if current.Data == nil { - current.Data = map[string]string{} - } - for k, v := range kv { - current.Data[string(k)] = v - } - if err := c.Patch(ctx, current, patch, cmFieldOwner); err != nil { - return Infra(fmt.Errorf("patching workflow-vars ConfigMap: %w", err)) - } - return nil -} - -// WriteExitReason stamps EXIT_REASON from err's classification. Write errors -// are intentionally swallowed — the underlying failure already determined -// the exit code and shouldn't be masked by a CM-write failure. -func WriteExitReason(ctx context.Context, c client.Client, w WorkflowIdentity, err error) { - _ = SetVar(ctx, c, w, KeyExitReason, string(ExitReasonFor(err))) -} - -func stringifyKeys(in map[VarKey]string) map[string]string { - if len(in) == 0 { - return nil - } - out := make(map[string]string, len(in)) - for k, v := range in { - out[string(k)] = v - } - return out -} diff --git a/internal/taskruntime/cm_test.go b/internal/taskruntime/cm_test.go deleted file mode 100644 index a83c8478..00000000 --- a/internal/taskruntime/cm_test.go +++ /dev/null @@ -1,110 +0,0 @@ -package taskruntime - -import ( - "context" - "errors" - "testing" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func newScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - if err := corev1.AddToScheme(s); err != nil { - t.Fatal(err) - } - return s -} - -const ( - testNamespace = "nightly" - testWorkflowName = "wf-test" - testWorkflowVarsCM = "workflow-vars-wf-test" -) - -func testIdentity() WorkflowIdentity { - return WorkflowIdentity{Name: testWorkflowName, UID: "uid-test", Namespace: testNamespace} -} - -func TestEnsureWorkflowVarsCM_CreatesWithOwnerRef(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - w := testIdentity() - - if err := EnsureWorkflowVarsCM(context.Background(), c, w, map[VarKey]string{KeyRunID: testWorkflowName}); err != nil { - t.Fatalf("Ensure: %v", err) - } - - got := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, got); err != nil { - t.Fatalf("Get: %v", err) - } - if got.Data[string(KeyRunID)] != testWorkflowName { - t.Fatalf("seed not written: %v", got.Data) - } - if len(got.OwnerReferences) != 1 || got.OwnerReferences[0].Kind != workflowKind { - t.Fatalf("ownerRef not stamped: %v", got.OwnerReferences) - } -} - -func TestEnsureWorkflowVarsCM_AlreadyExistsIsNoError(t *testing.T) { - existing := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: testWorkflowVarsCM, Namespace: testNamespace}, - Data: map[string]string{string(KeyRunID): testWorkflowName}, - } - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(existing).Build() - - if err := EnsureWorkflowVarsCM(context.Background(), c, testIdentity(), nil); err != nil { - t.Fatalf("Ensure on existing: %v", err) - } -} - -func TestSetVars_MergesIntoExisting(t *testing.T) { - existing := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: testWorkflowVarsCM, Namespace: testNamespace}, - Data: map[string]string{string(KeyRunID): testWorkflowName}, - } - c := fake.NewClientBuilder().WithScheme(newScheme(t)).WithObjects(existing).Build() - w := testIdentity() - - if err := SetVars(context.Background(), c, w, map[VarKey]string{ - KeyAdminAddress: "sei1abc", - KeyAdminSecretName: "admin-" + testWorkflowName, - }); err != nil { - t.Fatalf("SetVars: %v", err) - } - - got := &corev1.ConfigMap{} - if err := c.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: testWorkflowVarsCM}, got); err != nil { - t.Fatalf("Get: %v", err) - } - if got.Data[string(KeyRunID)] != testWorkflowName { - t.Fatalf("existing key clobbered: %v", got.Data) - } - if got.Data[string(KeyAdminAddress)] != "sei1abc" || got.Data[string(KeyAdminSecretName)] != "admin-wf-test" { - t.Fatalf("new keys not merged: %v", got.Data) - } -} - -func TestSetVar_ConfigMapMissingIsInfraError(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - err := SetVar(context.Background(), c, testIdentity(), KeyChainID, "bench-1") - if err == nil { - t.Fatalf("expected error when CM missing") - } - var infra *InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T", err) - } -} - -func TestSetVars_Empty(t *testing.T) { - c := fake.NewClientBuilder().WithScheme(newScheme(t)).Build() - if err := SetVars(context.Background(), c, testIdentity(), nil); err != nil { - t.Fatalf("SetVars(nil): %v", err) - } -} diff --git a/internal/taskruntime/exit.go b/internal/taskruntime/exit.go deleted file mode 100644 index 0b3e3903..00000000 --- a/internal/taskruntime/exit.go +++ /dev/null @@ -1,60 +0,0 @@ -// Package taskruntime is the shared library for seitask subcommands: typed -// exit codes, ownerReference stamping, and workflow-vars ConfigMap helpers. -package taskruntime - -import ( - "errors" - "fmt" -) - -// Exit codes match qa-testing/release-test.ts: 0=pass, 1=task-fail (the work -// failed), 2=infra-fail (couldn't reach a verdict). Chaos Mesh collapses 1 -// and 2 to "Failed"; downstream readers use the EXIT_REASON workflow-vars -// key to recover the distinction. -const ( - ExitPass = 0 - ExitTaskFailure = 1 - ExitInfraError = 2 -) - -// InfraError marks the failure as non-deterministic (API unreachable, -// timeout, malformed input). Bare errors and TaskError map to exit 1. -type InfraError struct{ Err error } - -func (e *InfraError) Error() string { return fmt.Sprintf("infra: %v", e.Err) } -func (e *InfraError) Unwrap() error { return e.Err } - -// Infra wraps err as an InfraError. nil in → nil out. -func Infra(err error) error { - if err == nil { - return nil - } - return &InfraError{Err: err} -} - -// TaskError marks the failure as work-correctness. Wrapping is optional; -// bare errors map to exit 1 too. -type TaskError struct{ Err error } - -func (e *TaskError) Error() string { return fmt.Sprintf("task: %v", e.Err) } -func (e *TaskError) Unwrap() error { return e.Err } - -// Task wraps err as a TaskError. nil in → nil out. -func Task(err error) error { - if err == nil { - return nil - } - return &TaskError{Err: err} -} - -// ExitCodeFor: nil → 0, InfraError → 2, everything else → 1. -func ExitCodeFor(err error) int { - if err == nil { - return ExitPass - } - var infra *InfraError - if errors.As(err, &infra) { - return ExitInfraError - } - return ExitTaskFailure -} diff --git a/internal/taskruntime/exit_test.go b/internal/taskruntime/exit_test.go deleted file mode 100644 index f1e20454..00000000 --- a/internal/taskruntime/exit_test.go +++ /dev/null @@ -1,32 +0,0 @@ -package taskruntime - -import ( - "errors" - "fmt" - "testing" -) - -func TestExitCodeFor(t *testing.T) { - plain := errors.New("plain") - cases := []struct { - name string - err error - want int - }{ - {"nil", nil, ExitPass}, - {"plain", plain, ExitTaskFailure}, - {"task", Task(plain), ExitTaskFailure}, - {"infra", Infra(plain), ExitInfraError}, - {"infra wrapped twice", fmt.Errorf("outer: %w", Infra(plain)), ExitInfraError}, - {"task wrapped twice", fmt.Errorf("outer: %w", Task(plain)), ExitTaskFailure}, - {"Task(nil)", Task(nil), ExitPass}, - {"Infra(nil)", Infra(nil), ExitPass}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - if got := ExitCodeFor(tc.err); got != tc.want { - t.Fatalf("ExitCodeFor(%v) = %d, want %d", tc.err, got, tc.want) - } - }) - } -} diff --git a/internal/taskruntime/ownerref.go b/internal/taskruntime/ownerref.go deleted file mode 100644 index dff3b83a..00000000 --- a/internal/taskruntime/ownerref.go +++ /dev/null @@ -1,94 +0,0 @@ -package taskruntime - -import ( - "context" - "fmt" - "os" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// Identity env contract for Task pods (downward API, projected by the -// scenario YAML's container env block): -// -// SEI_WORKFLOW_NAME ← fieldRef metadata.labels['chaos-mesh.org/workflow'] -// SEI_NAMESPACE ← fieldRef metadata.namespace -// -// The Workflow CR's UID is NOT projectable via downward API — Chaos Mesh -// does not stamp it on Task pods — so we fetch it from the apiserver at -// subcommand startup using NAME + NAMESPACE. SEI_WORKFLOW_UID env, when -// set, short-circuits the lookup; tests use it. -const ( - EnvWorkflowName = "SEI_WORKFLOW_NAME" - EnvWorkflowUID = "SEI_WORKFLOW_UID" - EnvNamespace = "SEI_NAMESPACE" - - workflowAPIVersion = "chaos-mesh.org/v1alpha1" - workflowKind = "Workflow" -) - -var workflowGVK = schema.GroupVersionKind{Group: "chaos-mesh.org", Version: "v1alpha1", Kind: workflowKind} - -// WorkflowIdentity is the parent Workflow CR's identity, read once at -// subcommand startup. -type WorkflowIdentity struct { - Name string - UID types.UID - Namespace string -} - -// LoadWorkflowIdentity reads NAME + NAMESPACE from env (downward-API -// projected on each Task pod), then fetches the Workflow CR's UID from -// the apiserver. SEI_WORKFLOW_UID env short-circuits the round-trip. -func LoadWorkflowIdentity(ctx context.Context, c client.Client) (WorkflowIdentity, error) { - name := os.Getenv(EnvWorkflowName) - ns := os.Getenv(EnvNamespace) - missing := []string{} - if name == "" { - missing = append(missing, EnvWorkflowName) - } - if ns == "" { - missing = append(missing, EnvNamespace) - } - if len(missing) > 0 { - return WorkflowIdentity{}, Infra(fmt.Errorf("downward-API env not projected: %v", missing)) - } - if uid := os.Getenv(EnvWorkflowUID); uid != "" { - return WorkflowIdentity{Name: name, UID: types.UID(uid), Namespace: ns}, nil - } - wf := &unstructured.Unstructured{} - wf.SetGroupVersionKind(workflowGVK) - if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, wf); err != nil { - return WorkflowIdentity{}, Infra(fmt.Errorf("fetching Workflow %s/%s for UID: %w", ns, name, err)) - } - uid := wf.GetUID() - if uid == "" { - return WorkflowIdentity{}, Infra(fmt.Errorf("workflow %s/%s exists but has no UID", ns, name)) - } - return WorkflowIdentity{Name: name, UID: uid, Namespace: ns}, nil -} - -// OwnerRef returns an ownerReference to the parent Workflow CR. Controller -// is explicit-false (Chaos Mesh manages Workflow children only via -// WorkflowNodes); BlockOwnerDeletion is explicit-false so cleanup doesn't -// stall on slow Task children. -func (w WorkflowIdentity) OwnerRef() metav1.OwnerReference { - return metav1.OwnerReference{ - APIVersion: workflowAPIVersion, - Kind: workflowKind, - Name: w.Name, - UID: w.UID, - Controller: new(bool), - BlockOwnerDeletion: new(bool), - } -} - -// WorkflowVarsName returns the per-run workflow-vars ConfigMap name. -// Single-sourced so producers and consumers don't drift. -func WorkflowVarsName(workflowName string) string { - return "workflow-vars-" + workflowName -} diff --git a/internal/taskruntime/ownerref_test.go b/internal/taskruntime/ownerref_test.go deleted file mode 100644 index d54df7f4..00000000 --- a/internal/taskruntime/ownerref_test.go +++ /dev/null @@ -1,102 +0,0 @@ -package taskruntime - -import ( - "context" - "errors" - "testing" - - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestLoadWorkflowIdentity(t *testing.T) { - t.Run("env-short-circuit (UID set)", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "release-test-abc") - t.Setenv(EnvWorkflowUID, "uid-xyz") - t.Setenv(EnvNamespace, testNamespace) - - // Fake client w/ no Workflow CR — env UID should short-circuit - // the apiserver lookup, so this should still succeed. - c := fake.NewClientBuilder().Build() - w, err := LoadWorkflowIdentity(context.Background(), c) - if err != nil { - t.Fatalf("LoadWorkflowIdentity: %v", err) - } - if w.Name != "release-test-abc" || string(w.UID) != "uid-xyz" || w.Namespace != testNamespace { - t.Fatalf("got %+v", w) - } - }) - - t.Run("apiserver-lookup (UID env empty)", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "release-test-abc") - t.Setenv(EnvWorkflowUID, "") - t.Setenv(EnvNamespace, testNamespace) - - wf := &unstructured.Unstructured{} - wf.SetGroupVersionKind(workflowGVK) - wf.SetName("release-test-abc") - wf.SetNamespace(testNamespace) - wf.SetUID("uid-from-apiserver") - - scheme := runtime.NewScheme() - c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(wf).Build() - w, err := LoadWorkflowIdentity(context.Background(), c) - if err != nil { - t.Fatalf("LoadWorkflowIdentity: %v", err) - } - if string(w.UID) != "uid-from-apiserver" { - t.Fatalf("expected UID from apiserver lookup, got %q", w.UID) - } - }) - - t.Run("missing name", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "") - t.Setenv(EnvNamespace, testNamespace) - c := fake.NewClientBuilder().Build() - _, err := LoadWorkflowIdentity(context.Background(), c) - var infra *InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T: %v", err, err) - } - }) - - t.Run("workflow CR not found", func(t *testing.T) { - t.Setenv(EnvWorkflowName, "missing-workflow") - t.Setenv(EnvWorkflowUID, "") - t.Setenv(EnvNamespace, testNamespace) - - scheme := runtime.NewScheme() - c := fake.NewClientBuilder().WithScheme(scheme).Build() - _, err := LoadWorkflowIdentity(context.Background(), c) - var infra *InfraError - if !errors.As(err, &infra) { - t.Fatalf("expected InfraError, got %T: %v", err, err) - } - }) -} - -func TestOwnerRef(t *testing.T) { - w := WorkflowIdentity{Name: "wf", UID: "uid", Namespace: "ns"} - ref := w.OwnerRef() - if ref.APIVersion != "chaos-mesh.org/v1alpha1" || ref.Kind != workflowKind { - t.Fatalf("wrong target: %+v", ref) - } - if ref.Name != "wf" || string(ref.UID) != "uid" { - t.Fatalf("wrong identity: %+v", ref) - } - if ref.Controller == nil || *ref.Controller { - t.Fatalf("Controller should be explicit false; got %+v", ref.Controller) - } - if ref.BlockOwnerDeletion == nil || *ref.BlockOwnerDeletion { - t.Fatalf("BlockOwnerDeletion should be explicit false; got %+v", ref.BlockOwnerDeletion) - } -} - -func TestWorkflowVarsName(t *testing.T) { - got := WorkflowVarsName("major-upgrade-20260520-184443") - want := "workflow-vars-major-upgrade-20260520-184443" - if got != want { - t.Fatalf("got %q, want %q", got, want) - } -} diff --git a/internal/taskruntime/scenarios_test.go b/internal/taskruntime/scenarios_test.go deleted file mode 100644 index 58a7e252..00000000 --- a/internal/taskruntime/scenarios_test.go +++ /dev/null @@ -1,96 +0,0 @@ -package taskruntime - -import ( - "os" - "path/filepath" - "strings" - "testing" - - "sigs.k8s.io/yaml" -) - -// TestScenarioYAMLs_CMNameMatchesWorkflowVarsName guards the contract -// surfaced by sei-protocol/sei-k8s-controller#337: scenario YAML's -// envFrom configMapRef.name MUST match what WorkflowVarsName produces -// for the scenario's own Workflow CR name. The first manual fire of -// release-test stuck a Task pod in CreateContainerConfigError for ~8m -// because the scenario referenced workflow-vars- but keygen -// created workflow-vars-. -// -// Scenarios using the seitask binary's typed CM helpers must opt in to -// this test by appearing in `scenariosToCheck` below. Bash-driven -// scenarios (e.g., major-upgrade.yaml today) are excluded — they -// create the CM via kubectl with their own naming convention. -func TestScenarioYAMLs_CMNameMatchesWorkflowVarsName(t *testing.T) { - scenariosDir, err := filepath.Abs("../../scenarios") - if err != nil { - t.Fatal(err) - } - scenariosToCheck := []string{"release-test.yaml", "load-test.yaml"} - - for _, name := range scenariosToCheck { - t.Run(name, func(t *testing.T) { - raw, err := os.ReadFile(filepath.Join(scenariosDir, name)) - if err != nil { - t.Fatal(err) - } - workflowName := workflowMetaName(t, raw) - wantCMName := WorkflowVarsName(workflowName) - cmRefs := configMapRefNames(t, raw) - if len(cmRefs) == 0 { - t.Fatalf("no envFrom configMapRef names found — scenario isn't exercising the bridge") - } - for i, got := range cmRefs { - if got != wantCMName { - t.Errorf("configMapRef[%d].name = %q; want %q (workflow %q)", i, got, wantCMName, workflowName) - } - } - }) - } -} - -// workflowMetaName extracts the first `kind: Workflow` document's -// metadata.name from a multi-doc scenario YAML. -func workflowMetaName(t *testing.T, raw []byte) string { - t.Helper() - for doc := range strings.SplitSeq(string(raw), "\n---\n") { - var head struct { - Kind string `json:"kind"` - Metadata struct { - Name string `json:"name"` - } `json:"metadata"` - } - if err := yaml.Unmarshal([]byte(doc), &head); err != nil { - continue - } - if head.Kind == "Workflow" && head.Metadata.Name != "" { - return head.Metadata.Name - } - } - t.Fatal("no kind=Workflow document with metadata.name found") - return "" -} - -// configMapRefNames pulls every envFrom configMapRef.name in the scenario. -// Walks the Workflow.spec.templates[].task.container.envFrom path; using a -// regex over the raw YAML keeps the test independent of the chaos-mesh -// Go types. -func configMapRefNames(t *testing.T, raw []byte) []string { - t.Helper() - var names []string - lines := strings.Split(string(raw), "\n") - for i, line := range lines { - if !strings.Contains(line, "configMapRef:") { - continue - } - // Next non-blank line should be ` name: ` at deeper indent. - for j := i + 1; j < len(lines) && j < i+4; j++ { - trimmed := strings.TrimSpace(lines[j]) - if rest, ok := strings.CutPrefix(trimmed, "name:"); ok { - names = append(names, strings.TrimSpace(rest)) - break - } - } - } - return names -} diff --git a/internal/taskruntime/vars.go b/internal/taskruntime/vars.go deleted file mode 100644 index e1766d80..00000000 --- a/internal/taskruntime/vars.go +++ /dev/null @@ -1,64 +0,0 @@ -package taskruntime - -import "strings" - -// VarKey is a typed key for the workflow-vars ConfigMap. Producers and -// consumers reference these constants so renames are compile errors. Schema + -// stability discipline: https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/test-harness/test-harness-lld.md. -type VarKey string - -const ( - // KeyRunID — Workflow CR's metadata.name. Written by the initializing Task. - KeyRunID VarKey = "RUN_ID" - - // KeyChainID — the SeiNetwork's chainId. One-way door. - KeyChainID VarKey = "CHAIN_ID" - - // Endpoints — written by provision-snd after the SeiNetwork is Ready. - // KeyEVMJSONRPC is pod-0 only (release-test pins stateful EVM - // sequences to one pod). KeyEVMJSONRPCList is comma-separated - // per-pod URLs for seiload, whose stateful EVM workload needs to - // hit all RPC pods. - KeyTendermintRPC VarKey = "TM_RPC" - KeyTendermintREST VarKey = "REST" - KeyEVMJSONRPC VarKey = "EVM_RPC" - KeyEVMJSONRPCList VarKey = "EVM_RPC_LIST" - - // Admin identity — written by keygen. Mnemonic itself lives in the - // referenced Secret, not the ConfigMap. - KeyAdminAddress VarKey = "ADMIN_ADDRESS" - KeyAdminSecretName VarKey = "ADMIN_SECRET_NAME" - - // KeyExitReason — written by the failing Task pre-exit. upload-report - // reads this to recover the exit-code class Chaos Mesh collapses. - KeyExitReason VarKey = "EXIT_REASON" -) - -// ExitReason is the string mirror of ExitCodeFor for the EXIT_REASON CM value. -type ExitReason string - -const ( - ExitReasonPass ExitReason = "pass" - ExitReasonTaskFail ExitReason = "task-fail" - ExitReasonInfraFail ExitReason = "infra-fail" -) - -// ExitReasonFor mirrors ExitCodeFor: nil → pass, InfraError → infra-fail, -// otherwise → task-fail. -func ExitReasonFor(err error) ExitReason { - switch ExitCodeFor(err) { - case ExitPass: - return ExitReasonPass - case ExitInfraError: - return ExitReasonInfraFail - default: - return ExitReasonTaskFail - } -} - -// RoleScoped prefixes a key with an upper-cased role tag so scenarios with -// multiple SeiNetworks (validator + rpc) write disjoint workflow-vars keys. -// RoleScoped("validator", KeyTendermintRPC) → "VALIDATOR_TM_RPC". -func RoleScoped(role string, key VarKey) VarKey { - return VarKey(strings.ToUpper(role) + "_" + string(key)) -} diff --git a/internal/taskruntime/vars_test.go b/internal/taskruntime/vars_test.go deleted file mode 100644 index 2a090217..00000000 --- a/internal/taskruntime/vars_test.go +++ /dev/null @@ -1,46 +0,0 @@ -package taskruntime - -import ( - "errors" - "testing" -) - -func TestRoleScoped(t *testing.T) { - cases := []struct { - role string - key VarKey - want VarKey - }{ - {"validator", KeyTendermintRPC, "VALIDATOR_TM_RPC"}, - {"rpc", KeyEVMJSONRPC, "RPC_EVM_RPC"}, - {"Validator", KeyChainID, "VALIDATOR_CHAIN_ID"}, - } - for _, tc := range cases { - t.Run(string(tc.want), func(t *testing.T) { - if got := RoleScoped(tc.role, tc.key); got != tc.want { - t.Fatalf("RoleScoped(%q, %q) = %q, want %q", tc.role, tc.key, got, tc.want) - } - }) - } -} - -func TestExitReasonFor(t *testing.T) { - plain := errors.New("plain") - cases := []struct { - name string - err error - want ExitReason - }{ - {"nil", nil, ExitReasonPass}, - {"plain", plain, ExitReasonTaskFail}, - {"task", Task(plain), ExitReasonTaskFail}, - {"infra", Infra(plain), ExitReasonInfraFail}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - if got := ExitReasonFor(tc.err); got != tc.want { - t.Fatalf("ExitReasonFor(%v) = %s, want %s", tc.err, got, tc.want) - } - }) - } -} diff --git a/runner/rbac.yaml b/runner/rbac.yaml deleted file mode 100644 index ee6709ef..00000000 --- a/runner/rbac.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# RBAC for seitask-runner — apply alongside each Chaos Mesh Workflow that uses -# the runner image. The Workflow's `task.container.serviceAccountName` must -# reference this ServiceAccount. -# -# Namespaced (Role, not ClusterRole) — the runner only operates on resources -# in the same namespace as the Workflow. -# -# The `configmaps` verbs cover the PR 6 cross-step variable bridge: bash -# steps (compute-target-height, resolve-proposal-id) upsert -# `workflow-vars-` via `kubectl apply`, and every other step reads -# values via `envFrom: configMapRef`. The kubelet itself needs no extra -# RBAC for the envFrom read — it uses kubelet credentials, not the -# workload SA — but the producer steps run `kubectl get/create/apply` -# under the workload SA and therefore require these verbs. ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: seitask-runner ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: seitask-runner -rules: -# SeiNodeTask: create + read + SSA-patch (apply uses PATCH with -# application/apply-patch+yaml, which requires the `patch` verb in addition -# to `create` — `create` alone 403s on the second invocation). -- apiGroups: ["sei.io"] - resources: ["seinodetasks"] - verbs: ["create", "get", "list", "watch", "patch", "update"] -- apiGroups: ["sei.io"] - resources: ["seinodetasks/status"] - verbs: ["get"] -# SeiNode: create + read. create is required by provision-node, whose first -# act is c.Create(SeiNode) to fan out N standalone follower CRs; without it -# every provision-node step 403s. get/list/watch back the --per-node-selector -# fan-out mode and the post-create wait-for-Running loop (.status.phase / -# .status.endpoint are read off the main object, gated by seinodes get). -- apiGroups: ["sei.io"] - resources: ["seinodes"] - verbs: ["create", "get", "list", "watch"] -# seinodes/status get is a forward-compat over-grant, NOT currently exercised: -# provision-node reads .status off the main object (seinodes get), not the -# subresource. Kept to match the /status pattern above and a future -# c.Status() read. -- apiGroups: ["sei.io"] - resources: ["seinodes/status"] - verbs: ["get"] -# SeiNetwork: create + read for provision-snd. Polls .status.phase -# until Ready and reads .status.endpoints to publish role-scoped TM/REST/ -# EVM URLs into workflow-vars. patch covers the major-upgrade bump-snd-image -# step, which `kubectl patch --type=merge`es spec.image to roll -# all validators onto the post-upgrade binary in a single write. -- apiGroups: ["sei.io"] - resources: ["seinetworks"] - verbs: ["create", "get", "list", "watch", "patch"] -- apiGroups: ["sei.io"] - resources: ["seinetworks/status"] - verbs: ["get"] -# Secrets: get + create for keygen, which writes the per-run admin -# mnemonic Secret. Downstream Tasks consume it via secretKeyRef on the -# Pod's env, which the kubelet handles under its own credentials. -- apiGroups: [""] - resources: ["secrets"] - verbs: ["get", "create"] -# ConfigMaps: workflow-vars- upsert by producer bash steps. -# `create`+`patch` are the apply-path verbs; `get`+`list`+`watch` support -# read-modify-write (resolve-proposal-id merges PROPOSAL_ID into the -# existing object); `update` covers the non-SSA apply path some kubectl -# versions emit. -- apiGroups: [""] - resources: ["configmaps"] - verbs: ["get", "list", "watch", "create", "patch", "update"] -# Chaos Mesh Workflow + WorkflowNode: read-only. -# workflows.get: LoadWorkflowIdentity fetches the Workflow CR's UID for -# ownerRef stamping (Chaos Mesh doesn't project UID via downward API). -# workflownodes.{get,list}: upload-report enumerates the per-step -# WorkflowNode tree as part of the S3 snapshot. -- apiGroups: ["chaos-mesh.org"] - resources: ["workflows", "workflownodes"] - verbs: ["get", "list"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: seitask-runner -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: seitask-runner -subjects: -- kind: ServiceAccount - name: seitask-runner diff --git a/runner/templates/await-condition.yaml.tmpl b/runner/templates/await-condition.yaml.tmpl deleted file mode 100644 index 2d936112..00000000 --- a/runner/templates/await-condition.yaml.tmpl +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: AwaitCondition - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}900{{ end }} - awaitCondition: - height: - targetHeight: {{ .TARGET_HEIGHT }} - {{- with index . "ACTION" }} - action: {{ . }} - {{- end }} diff --git a/runner/templates/await-nodes-at-height.yaml.tmpl b/runner/templates/await-nodes-at-height.yaml.tmpl deleted file mode 100644 index b541d694..00000000 --- a/runner/templates/await-nodes-at-height.yaml.tmpl +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: AwaitNodesAtHeight - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}1800{{ end }} - awaitNodesAtHeight: - targetHeight: {{ .TARGET_HEIGHT }} diff --git a/runner/templates/gov-software-upgrade.yaml.tmpl b/runner/templates/gov-software-upgrade.yaml.tmpl deleted file mode 100644 index a1e581a7..00000000 --- a/runner/templates/gov-software-upgrade.yaml.tmpl +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - # name is rewritten to a deterministic value by the runner; this placeholder - # is here only to keep the manifest schema-valid pre-rewrite. - name: PLACEHOLDER -spec: - kind: GovSoftwareUpgrade - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}600{{ end }} - govSoftwareUpgrade: - chainId: {{ .CHAIN_ID }} - {{- with index . "KEY_NAME" }} - keyName: {{ . }} - {{- end }} - title: {{ .TITLE | printf "%q" }} - description: {{ .DESCRIPTION | printf "%q" }} - upgradeName: {{ .UPGRADE_NAME }} - upgradeHeight: {{ .UPGRADE_HEIGHT }} - {{- with index . "UPGRADE_INFO" }} - upgradeInfo: {{ . | printf "%q" }} - {{- end }} - initialDeposit: {{ .INITIAL_DEPOSIT }} - fees: {{ .FEES }} - gas: {{ .GAS }} diff --git a/runner/templates/gov-vote.yaml.tmpl b/runner/templates/gov-vote.yaml.tmpl deleted file mode 100644 index 3111b977..00000000 --- a/runner/templates/gov-vote.yaml.tmpl +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: GovVote - target: - nodeRef: - name: {{ .NODE }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}300{{ end }} - govVote: - chainId: {{ .CHAIN_ID }} - {{- with index . "KEY_NAME" }} - keyName: {{ . }} - {{- end }} - proposalId: {{ .PROPOSAL_ID }} - option: "{{ .OPTION }}" - fees: {{ .FEES }} - gas: {{ .GAS }} diff --git a/runner/templates/update-node-image.yaml.tmpl b/runner/templates/update-node-image.yaml.tmpl deleted file mode 100644 index e6ce3ec9..00000000 --- a/runner/templates/update-node-image.yaml.tmpl +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNodeTask -metadata: - name: PLACEHOLDER -spec: - kind: UpdateNodeImage - target: - nodeRef: - name: {{ .NODE }} - # Pre-upgrade nodes may be Pending or CrashLooping; relax the default - # Running-only requirement unless the operator overrides REQUIRE_PHASE. - requirePhase: {{ with index . "REQUIRE_PHASE" }}{{ . }}{{ else }}Running{{ end }} - timeoutSeconds: {{ with index . "TIMEOUT_SECONDS" }}{{ . }}{{ else }}1800{{ end }} - updateNodeImage: - image: {{ .IMAGE }} diff --git a/scenarios/README.md b/scenarios/README.md deleted file mode 100644 index 8cf2ea65..00000000 --- a/scenarios/README.md +++ /dev/null @@ -1,264 +0,0 @@ -# Scenarios - -End-to-end Chaos Mesh Workflows that compose `SeiNodeTask` CRs to exercise -chain-lifecycle behavior against real Kubernetes clusters. Each scenario is -the acceptance test for one capability surface. - -> **Status: runnable, gated on runner image publishing.** PR 6 closed the -> cross-step variable bridge gap via a per-Workflow-run ConfigMap -> (`workflow-vars-`). The bash steps that compute `TARGET_HEIGHT` -> and resolve `PROPOSAL_ID` `kubectl apply` the ConfigMap; every other -> step reads values via `envFrom`. End-to-end runs require a published -> `seitask-runner` image (see Prerequisites, item 4) -- everything else -> is in-tree. - -## Index - -| File | Mirrors | Purpose | -|---|---|---| -| `major-upgrade.yaml` | `sei-chain/integration_test/upgrade_module/major_upgrade_test.yaml` | 4-validator software-upgrade flow: gov proposal, vote, then a single SeiNetwork image bump that rolls all validators onto the new binary at the upgrade height. MVP acceptance for the SeiNodeTask CRD. | -| `testnet-deployment.yaml` | n/a | Reference 4-validator `SeiNetwork` the Workflow can target. | - -## Where this runs - -These scenarios are **destructive**. They submit governance proposals, mutate -SeiNode images, and drive validators through CrashLoop states. They are -designed for: - -- The **harbor dev cluster** (`harbor-dev` EKS). Ephemeral testnets only. -- Local `kind`/`minikube` clusters with the controller installed. -- **Not** any cluster carrying a chain you care about. - -The Workflow does not provision the chain. It assumes a 4-validator -`SeiNetwork` exists in the target namespace before the Workflow -applies. See "Run" below. - -## Prerequisites - -1. **CRDs installed** in the target cluster: - - `seinetworks.sei.io` - - `seinodes.sei.io` - - `seinodetasks.sei.io` - - ```bash - kubectl apply -f config/crd/ - ``` - -2. **Controller running** in `sei-k8s-controller-system` (or wherever the - platform repo installs it) and watching the target namespace. - -3. **Chaos Mesh installed** in the cluster (2.5+ verified). The dev cluster - already ships this via `platform/clusters/dev/chaos-mesh`. - -4. **`seitask-runner` image published** to a registry the cluster can pull - from. As of PR 5 the runner image is **not yet auto-published** by the - `ecr.yml` GitHub Action (which only builds the controller). Until that - workflow is extended, the operator must build and push manually: - - ```bash - make runner-image RUNNER_IMG=/sei/seitask-runner: - make runner-push RUNNER_IMG=/sei/seitask-runner: - ``` - - The image bundles the per-kind templates at `/templates/`; no ConfigMap - override is required for the in-tree scenarios. - -5. **RBAC for the runner ServiceAccount.** `runner/rbac.yaml` defines the - `seitask-runner` ServiceAccount + Role + RoleBinding. Apply it to the - namespace where the Workflow will run. - - Chaos Mesh `Task.container` does NOT expose `serviceAccountName` -- the - synthesized Workflow pod uses the namespace's `default` ServiceAccount. - You therefore need to either (a) bind the `seitask-runner` Role to the - `default` SA in the test namespace, or (b) use the - `chaos-mesh.org/inject-serviceaccount` annotation if your Chaos Mesh - build includes the SA-injection webhook (2.6+ optional component). - - Recommended: bind to `default` SA in the ephemeral namespace. - - ```bash - kubectl apply -n -f runner/rbac.yaml - kubectl create rolebinding seitask-runner-default \ - --role=seitask-runner --serviceaccount=:default \ - -n - ``` - -## Run: major-upgrade - -### 1. Apply the reference testnet - -```bash -kubectl apply -f scenarios/testnet-deployment.yaml -# wait for status.replicas == status.readyReplicas == 4 -kubectl -n majorupgrade wait --for=condition=NodesReady=true \ - --timeout=15m seinetwork/majorupgrade -# spot-check the validator phases -kubectl -n majorupgrade get seinodes -# expected: majorupgrade-0 .. majorupgrade-3, all Running. -``` - -### 2. Render and apply the Workflow - -The Workflow YAML uses `envsubst`-style placeholders so the same file works -across upgrade targets. Substitute and apply: - -```bash -export SEI_DEPLOYMENT=majorupgrade -export SEI_NAMESPACE=majorupgrade -export SEI_CHAIN_ID=majorupgrade-1 -export SEI_PRE_UPGRADE_IMG=ghcr.io/sei-protocol/sei:v6.3.0 -export SEI_POST_UPGRADE_IMG=ghcr.io/sei-protocol/sei:v6.4.0 -export SEI_UPGRADE_NAME=v6.4.0 -export SEITASK_RUNNER_IMG=/sei/seitask-runner: -# Unique per Workflow run -- drives the `workflow-vars-` ConfigMap -# name. Two concurrent runs of the same Workflow must not collide. -export SEI_WORKFLOW_RUN_ID="$(date +%s)-$(openssl rand -hex 3)" - -envsubst < scenarios/major-upgrade.yaml \ - | kubectl apply -n "${SEI_NAMESPACE}" -f - -``` - -`envsubst` is from the `gettext` package (preinstalled on most Linux distros; -`brew install gettext` on macOS). For Flux/ArgoCD-managed deployments, replace -the envsubst step with a kustomize patch or `flux create kustomization ---substitute-from=secret/...`. - -### 3. Watch progress - -```bash -# Workflow node states -kubectl -n majorupgrade get workflownodes -l workflow=major-upgrade -# top-level -kubectl -n majorupgrade describe workflow major-upgrade -# tail step containers as they spin up -kubectl -n majorupgrade get pods -l chaos-mesh.org/workflow=major-upgrade -w -``` - -### 4. Interpret results - -Each step exits 0 (PASS) or 1 (FAIL). Chaos Mesh records terminal status on -the corresponding `WorkflowNode`. The Workflow itself is `Succeeded` when -every step in the entry Serial path completed; `Failed` when any required -step (no `conditionalBranches` override) failed. - -Per-step interpretation: - -| Step | What success means | -|---|---| -| `compute-target-height` | Created `workflow-vars-${SEI_WORKFLOW_RUN_ID}` ConfigMap with `TARGET_HEIGHT` / `UPGRADE_HEIGHT` / `POST_UPGRADE_HEIGHT`. | -| `submit-upgrade-proposal` | SeiNodeTask `.status.phase=Complete`. proposalId is NOT extracted here (sidecar structured outputs are intentionally empty post-PR 3); `resolve-proposal-id` derives it from the chain. | -| `resolve-proposal-id` | Polled gov REST for a voting-period proposal whose plan name matches `$SEI_UPGRADE_NAME`, merged `PROPOSAL_ID` into the workflow-vars ConfigMap. | -| `vote-yes-all-validators` | All 4 vote tasks Complete. | -| `wait-for-proposal-to-pass` | Proposal observed `PROPOSAL_STATUS_PASSED`. | -| `bump-snd-image` | `kubectl patch seinetwork` set `spec.image` to the post-upgrade build. The SeiNetwork controller re-asserts the image onto every child and rolls all validators onto the new binary. | -| `await-post-upgrade-progress` | Post-upgrade height-advance check: each of nodes 0/1/2/3 advanced past `POST_UPGRADE_HEIGHT` (= `TARGET_HEIGHT + 10`) via AwaitCondition. This is the liveness assertion -- a node that crosses the boundary has survived the upgrade. | - -### 5. Cleanup - -```bash -kubectl delete workflow -n majorupgrade major-upgrade -# Workflow does NOT delete the SeiNodeTask CRs it created (intentional -- -# you want them visible for post-mortem). Remove them explicitly: -kubectl delete seinodetasks -n majorupgrade --all -# Per-run ConfigMaps (labeled with sei.io/workflow-run) accumulate across -# runs. The Workflow does not garbage-collect them; an operator clears -# them out by label: -kubectl delete configmap -n majorupgrade -l sei.io/workflow-run -# or for a single run: -kubectl delete configmap -n majorupgrade "workflow-vars-${SEI_WORKFLOW_RUN_ID}" -# Tear down the testnet: -kubectl delete -f scenarios/testnet-deployment.yaml -``` - -## Cross-step variable bridge (PR 6) - -Chaos Mesh Workflow Task steps are each their own Pod, so emptyDir -volumes cannot carry state across steps. The bridge is a per-run -ConfigMap named `workflow-vars-${SEI_WORKFLOW_RUN_ID}` in the same -namespace as the Workflow: - -- **Producer steps** (`compute-target-height`, `resolve-proposal-id`) - use `alpine/k8s` (curl + kubectl + jq) to compute or query values and - `kubectl apply` the ConfigMap. `compute-target-height` creates it with - `--from-literal` x4 and labels it `sei.io/workflow-run`; later - producers read-modify-write via `kubectl get -o json | jq | apply`. - -- **Consumer steps** receive every key as a container env var via - `envFrom: configMapRef: name: workflow-vars-$SEI_WORKFLOW_RUN_ID`. The - runner's `--var KEY=$(KEY)` arguments use the Kubernetes container env - expansion (`$(VAR)`), which kubelet resolves against the env at - container start. The runner sees concrete `--var KEY=` strings - and no longer needs to source any file. - -- **Concurrency:** the ConfigMap name is parameterized on - `$SEI_WORKFLOW_RUN_ID`, which the operator generates at apply time - (see the `export SEI_WORKFLOW_RUN_ID=...` line above). Two concurrent - runs of the same Workflow get distinct ConfigMaps. - **Caveat:** `resolve-proposal-id` filters voting-period proposals by - `content.plan.name` (= `$SEI_UPGRADE_NAME`). Running two concurrent - scenarios on the **same chain** with the **same upgrade name** lets - either run resolve to whichever proposal sorts first. Use a distinct - `$SEI_UPGRADE_NAME` per concurrent run, or treat the chain as serially - owned by one scenario at a time. - -- **Cleanup:** the ConfigMap carries an `ownerReference` pointing at the - parent Workflow CR (`major-upgrade-$SEI_WORKFLOW_RUN_ID`). Deleting the - Workflow cascades garbage-collection of the ConfigMap automatically - via kube-controller-manager. Operators can still clean up by label - (`-l sei.io/workflow-run`) if multiple Workflows are torn down at once. - -## Known limitations / deferred capability - -1. **Liveness via post-upgrade height-advance check only.** This Workflow - does not assert pre-upgrade running state, does not detect the panic - directly (no RPC-down / stuck-at-`TARGET_HEIGHT-1` polling), and does - not assert the `UPGRADE "" NEEDED` log line that the source - `major_upgrade_test.yaml` greps for. The post-upgrade height-advance - check (each upgraded node advances past `TARGET_HEIGHT + 10` via - AwaitCondition) is the actual liveness signal -- a node that crosses - that boundary has by construction survived the upgrade. Explicit panic - detection and log-line assertions are future SeiNodeTask kinds - (`AssertLogContains`, `AwaitCondition` with a `panicked` predicate) - that no current scenario actually requires. - -2. **No chain-query task kind for proposals.** `compute-target-height`, - `resolve-proposal-id`, and `wait-for-proposal-to-pass` are bash + - curl against the per-pod headless Service RPC/REST. The right - primitive is an `AwaitCondition` extension with `proposalStatus` / - `proposalIdByPlanName` / `heightAdvancing` predicates that emits the - resolved value to the standard outputs path. Migrating those three - steps to a structured kind also lets us delete the `configmaps` RBAC - verbs (only the runner's outputs ConfigMap-write would remain). - -3. **Upgrade rolls the whole fleet, not staggered per-node.** This - Workflow bumps the SeiNetwork spec.image once and lets the SeiNetwork controller - roll all validators together. It does NOT exercise the staggered - early-upgrade-one-node-then-the-rest path the source - `major_upgrade_test.yaml` does. Per-child `UpdateNodeImage` against a - SeiNetwork-owned node fights the controller's spec.image re-assertion (the child - image flip-flops, the StatefulSet churns, `observe-image` never settles), - so staggered rollout needs a different primitive (e.g. SeiNetwork-level - partition/maxUnavailable) before it can return. - -4. **The runner image is not yet auto-published.** Add a `runner` step to - `.github/workflows/ecr.yml` once this scenario is wired into a CI job. - -5. **Argo Workflows migration is still on the long-term roadmap.** The - ConfigMap bridge is the MVP. Argo's `outputs.parameters` / - `inputs.parameters` is more ergonomic and avoids the per-run - ConfigMap garbage. Plan that migration once we have more than one - scenario worth porting. - -6. **No fan-out from a single step.** The 4-vote step is hard-coded to - 4 children rather than `--per-node-selector=role=validator`. We could - collapse the four `vote-node-*` templates into one fan-out runner if - the SeiNodes carry a consistent label, but the explicit per-node form - is easier to diagnose in `kubectl describe workflownode` output. - -## References - -- `https://github.com/sei-protocol/bdchatham-designs/blob/main/designs/seinode-task/seinode-task-lld.md` -- the canonical interface contract. -- `runner/rbac.yaml` -- RBAC the workflow expects on its ServiceAccount. -- `runner/templates/*.yaml.tmpl` -- the templates the runner ships. -- `sei-chain/integration_test/upgrade_module/major_upgrade_test.yaml` -- - the north-star scenario this Workflow replicates. diff --git a/scenarios/load-test.yaml b/scenarios/load-test.yaml deleted file mode 100644 index d2cdcfe4..00000000 --- a/scenarios/load-test.yaml +++ /dev/null @@ -1,251 +0,0 @@ -# Chaos Mesh Workflow: load-test scenario. -# -# Provisions a 4-validator chain + 2-RPC fleet, renders the seiload -# profile JSON with per-run chain id + per-pod EVM endpoints, runs -# seiload against the fleet for DURATION_MINUTES, uploads the report. -# Replaces the bash orchestrate.sh at -# clusters/harbor/nightly/load/orchestrate.sh. -# -# Second scenario authored on the seitask primitives (release-test is -# N=1). Follows the same shape: thin wrapper CronJob fires this -# Workflow; ownerRefs cascade-clean SNDs / profile CM on Workflow -# deletion; trap-side state capture lives in the wrapper. -# -# Open prereq for first fire: pods/exec verb on the seitask-runner -# Role (wait-rpc-caught-up Task uses kubectl exec on the RPC pod to -# poll seid status). Track in the platform-repo wrapper PR. -# -# Placeholders (the wrapper envsubst's at apply time): -# $SEI_NAMESPACE namespace of workflow + provisioned SNDs -# $SEI_CHAIN_ID chain id (e.g. "bench-$SEI_WORKFLOW_RUN_ID") -# $SEI_WORKFLOW_RUN_ID unique per-run id -# $SEID_IMAGE seid container image -# $SEITASK_IMAGE seitask monolith image (SeiNetwork templates baked in) -# $SEILOAD_IMAGE sei-load benchmark image (ghcr.io/sei-protocol/sei-load:…) -# $SEILOAD_COMMIT_ID sei-chain commit SHA (parsed by wrapper from $SEID_IMAGE -# tag); flows into SEILOAD_COMMIT_ID metric/report label -# $SEILOAD_PROFILE profile name in the source CM (e.g., "nightly_evm_transfer") -# $DURATION_MINUTES seiload run length ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: Workflow -metadata: - name: load-test-$SEI_WORKFLOW_RUN_ID - labels: - sei.io/scenario: load-test - sei.io/workflow-run: "$SEI_WORKFLOW_RUN_ID" -spec: - entry: load-test - templates: - - name: load-test - templateType: Serial - deadline: 90m - children: - - provision-validator-chain - - provision-rpc-fleet - - wait-rpc-caught-up - - render-seiload-profile - - run-seiload - - upload-report - - # Every seitask container projects Workflow identity via downward API: - # NAME from chaos-mesh.org/workflow label, NAMESPACE from pod metadata. - # UID is fetched at runtime via taskruntime.LoadWorkflowIdentity. - - name: provision-validator-chain - templateType: Task - deadline: 25m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-snd - - --role=validator - - --name=$SEI_CHAIN_ID - - --template=/scenarios/load-test/validator.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --ready-timeout=18m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - - name: provision-rpc-fleet - # 42m covers worst-case sequential readiness: running-timeout 18m + - # N×(WaitCaughtUp + WaitEVMServing) at first-block-timeout 5m each - # (N=2 → 18m + 4×5m = 38m), plus 4m headroom. The Chaos-Mesh deadline - # clock starts at Task admission, so the headroom must also absorb pod - # scheduling + a cold $SEITASK_IMAGE pull (both outside the inner 38m) — - # else a slow-but-genuine readiness is killed opaquely before its typed - # exit lands, inverting the very invariant this budget protects. - templateType: Task - deadline: 42m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-node # rpc followers are standalone SeiNodes, not a SeiNetwork - - --role=rpc - - --name=$SEI_CHAIN_ID-rpc # BASE name; followers are -0..N-1 - - --replicas=2 # seiload drives all N via RPC_EVM_RPC_LIST - - --network=$SEI_CHAIN_ID # peer auto-wire to the genesis SeiNetwork (sei.io/seinetwork=) - - --template=/scenarios/load-test/rpc.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --running-timeout=18m # was --ready-timeout; SeiNode has no Ready phase (default --first-block-timeout=5m) - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - # Redundant secondary gate: provision-node already waited every follower - # caught-up (height>1 && catching_up==false) before publishing endpoints — - # this re-confirms node-0 (RPC_TM_RPC is node-0's URL, not an aggregate) once - # more right before pointing seiload at the fleet. The sed extract tolerates - # Sei CometBFT's envelope-or-bare /status shape. - - name: wait-rpc-caught-up - templateType: Task - deadline: 10m - task: - container: - name: wait-rpc - image: curlimages/curl:8.10.1 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - for i in $(seq 1 120); do - state=$(curl -fsS --max-time 5 "${RPC_TM_RPC}/status" 2>/dev/null \ - | sed -n 's/.*"catching_up":\([^,}]*\).*/\1/p' \ - | head -1 || true) - if [ "${state}" = "false" ]; then - echo "catching_up=false on ${RPC_TM_RPC}" - exit 0 - fi - sleep 5 - done - echo "timed out waiting for catching_up=false" >&2 - exit 1 - envFrom: - - configMapRef: - name: workflow-vars-load-test-$SEI_WORKFLOW_RUN_ID - - # Sed-substitute the source profile (mounted via volume from the - # seiload-profiles CM in this namespace) and create the per-run - # rendered ConfigMap with an ownerRef to the parent Workflow. - # $RPC_EVM_RPC_LIST is published by provision-node as comma- - # separated bare URLs; the profile expects `[...]` with JSON-quoted - # entries, so we quote+join here. - - name: render-seiload-profile - templateType: Task - deadline: 5m - task: - container: - name: render - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - # Fast-fail on missing source CM rather than letting the - # pod hang in ContainerCreating against a missing mount. - kubectl -n "${SEI_NAMESPACE}" get configmap seiload-profiles >/dev/null - - QUOTED=$(printf '%s' "${RPC_EVM_RPC_LIST}" | sed 's|\([^,][^,]*\)|"\1"|g') - sed \ - -e "s|__SEI_CHAIN_ID__|${CHAIN_ID}|g" \ - -e "s|__RPC_ENDPOINTS__|${QUOTED}|g" \ - "/profiles/${SEILOAD_PROFILE}.json" > /tmp/profile.json - - WORKFLOW_UID=$(kubectl -n "${SEI_NAMESPACE}" get \ - "workflow.chaos-mesh.org/load-test-${SEI_WORKFLOW_RUN_ID}" \ - -o jsonpath='{.metadata.uid}') - [ -n "${WORKFLOW_UID}" ] || { echo "no Workflow UID"; exit 1; } - - kubectl -n "${SEI_NAMESPACE}" create configmap "${PROFILE_CM}" \ - --from-file=profile.json=/tmp/profile.json \ - --dry-run=client -o yaml \ - | kubectl label -f - --local -o yaml \ - "sei.io/chain-id=${CHAIN_ID}" \ - "sei.io/workflow-run=${SEI_WORKFLOW_RUN_ID}" \ - | kubectl patch -f - --local --type=merge --patch \ - "{\"metadata\":{\"ownerReferences\":[{\"apiVersion\":\"chaos-mesh.org/v1alpha1\",\"kind\":\"Workflow\",\"name\":\"load-test-${SEI_WORKFLOW_RUN_ID}\",\"uid\":\"${WORKFLOW_UID}\",\"controller\":false,\"blockOwnerDeletion\":false}]}}" \ - -o yaml \ - | kubectl apply -f - - env: - - {name: SEI_NAMESPACE, valueFrom: {fieldRef: {fieldPath: metadata.namespace}}} - - {name: SEILOAD_PROFILE, value: "$SEILOAD_PROFILE"} - - {name: PROFILE_CM, value: "seiload-profile-$SEI_WORKFLOW_RUN_ID"} - envFrom: - - configMapRef: - name: workflow-vars-load-test-$SEI_WORKFLOW_RUN_ID - volumeMounts: - - {name: profiles, mountPath: /profiles, readOnly: true} - volumes: - - name: profiles - configMap: - name: seiload-profiles - - - name: run-seiload - templateType: Task - deadline: 30m - task: - container: - name: seiload - image: $SEILOAD_IMAGE - args: - - --config - - /etc/seiload/profile.json - - --duration=$(DURATION_MINUTES)m - - --post-summary-flush-delay=45s - - --track-receipts=true - ports: - - {name: metrics, containerPort: 9090, protocol: TCP} - env: - - {name: DURATION_MINUTES, value: "$DURATION_MINUTES"} - - {name: SEILOAD_RUN_ID, value: "$SEI_WORKFLOW_RUN_ID"} - - {name: SEILOAD_CHAIN_ID, value: "$SEI_CHAIN_ID"} - - {name: SEILOAD_COMMIT_ID, value: "$SEILOAD_COMMIT_ID"} - - {name: SEILOAD_WORKLOAD, value: nightly} - volumeMounts: - - {name: profile, mountPath: /etc/seiload, readOnly: true} - resources: - requests: {cpu: "2", memory: "4Gi"} - limits: {cpu: "4", memory: "8Gi"} - volumes: - - name: profile - configMap: - name: seiload-profile-$SEI_WORKFLOW_RUN_ID - - - name: upload-report - templateType: Task - deadline: 5m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - upload-report - - --bucket=harbor-validation-results - - --prefix=nightly/load-test/$SEI_WORKFLOW_RUN_ID - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace diff --git a/scenarios/load-test/rpc.yaml.tmpl b/scenarios/load-test/rpc.yaml.tmpl deleted file mode 100644 index 6a8fec4d..00000000 --- a/scenarios/load-test/rpc.yaml.tmpl +++ /dev/null @@ -1,23 +0,0 @@ -# Standalone follower SeiNode. provision-node renders this once per replica, -# stamping metadata.name=- (e.g. -rpc-0), namespace, -# ownerRef->Workflow, the sei.io/role=node + sei.io/seinetwork= object -# labels, and a synthesized peers[].label.selector{sei.io/seinetwork:}. -# This template describes the node only -- never its name, peering, or topology. -apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER # overwritten to - by provision-node -spec: - chainId: "{{ .CHAIN_ID }}" - image: "{{ .IMAGE }}" - fullNode: {} # the rpc role = EVM-serving full node - overrides: - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - evm.worker_pool_size: "32" - evm.worker_queue_size: "4000" - evm.max_tx_pool_txs: "10000" - {{- with (index . "MEMPOOL_SIZE") }} - mempool.size: "{{ . }}" - mempool.pending_size: "{{ . }}" - {{- end }} diff --git a/scenarios/load-test/validator.yaml.tmpl b/scenarios/load-test/validator.yaml.tmpl deleted file mode 100644 index 0abf81f2..00000000 --- a/scenarios/load-test/validator.yaml.tmpl +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: "{{ .IMAGE }}" - replicas: 4 - configOverrides: - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - {{- with (index . "MEMPOOL_SIZE") }} - mempool.size: "{{ . }}" - mempool.pending_size: "{{ . }}" - {{- end }} - genesis: - chainId: "{{ .CHAIN_ID }}" diff --git a/scenarios/major-upgrade.yaml b/scenarios/major-upgrade.yaml deleted file mode 100644 index 7f1812ef..00000000 --- a/scenarios/major-upgrade.yaml +++ /dev/null @@ -1,575 +0,0 @@ -# Chaos Mesh Workflow: major-upgrade scenario. -# -# Acceptance test for the SeiNodeTask MVP. Expresses -# sei-chain/integration_test/upgrade_module/major_upgrade_test.yaml as a -# composition of SeiNodeTask CRs driven by the seitask runner. -# -# Provisions a 4-validator chain in-workflow via provision-snd, runs the -# upgrade pipeline against it, uploads the run snapshot to S3. Matches the -# release-test/load-test pattern: SeiNetwork lifecycle and workflow-vars ConfigMap -# all carry ownerRef to this Workflow CR, so the wrapper's only cleanup duty -# is `kubectl delete workflow`. -# -# Upgrade mechanism: a single bump-snd-image step patches the SeiNetwork -# spec.image; the SeiNetwork controller rolls all validators onto the new binary. -# The SeiNetwork spec.image is the one source of truth for child image -- per-child -# UpdateNodeImage would fight the controller's spec.image re-assertion and -# churn the StatefulSet so the rollout never settles. -# -# Workflow-vars producers/consumers -# --------------------------------- -# provision-validator-chain seeds CHAIN_ID + VALIDATOR_TM_RPC + VALIDATOR_REST. -# compute-target-height patches TARGET_HEIGHT/UPGRADE_HEIGHT/POST_UPGRADE_HEIGHT. -# resolve-proposal-id patches PROPOSAL_ID. Every downstream step consumes via -# `envFrom: configMapRef`; runner steps use `$(VAR)` (K8s container env -# interpolation) inside --var args. -# -# PROPOSAL_ID resolution (chain-as-medium) -# ---------------------------------------- -# .status.outputs.govSoftwareUpgrade.proposalId is empty by design (no -# sidecar-derived structured outputs in MVP). The resolve-proposal-id step -# polls /cosmos/gov/v1beta1/proposals?proposal_status=2 (voting period) until -# a proposal matching $SEI_UPGRADE_NAME appears, then patches PROPOSAL_ID. -# -# Placeholders (wrapper envsubst's at apply time -- see scenarios/README.md): -# $SEI_NAMESPACE namespace of workflow + provisioned SeiNetwork -# $SEI_CHAIN_ID chain id; also the SeiNetwork name -# $SEI_PRE_UPGRADE_IMG seid image the validators boot on -# $SEI_POST_UPGRADE_IMG seid image the upgrade rolls out to -# $SEI_UPGRADE_NAME upgrade plan name registered in seid -# $SEITASK_IMAGE seitask monolith image (SeiNetwork templates baked in) -# $SEI_WORKFLOW_RUN_ID unique per-run id; suffixes Workflow + CM names ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: Workflow -metadata: - name: major-upgrade-$SEI_WORKFLOW_RUN_ID - labels: - sei.io/scenario: major-upgrade - sei.io/workflow-run: "$SEI_WORKFLOW_RUN_ID" -spec: - entry: major-upgrade - templates: - - name: major-upgrade - templateType: Serial - deadline: 90m - children: - - provision-validator-chain - - compute-target-height - - submit-upgrade-proposal - - resolve-proposal-id - - vote-yes-all-validators - - wait-for-proposal-to-pass - - settle-into-halt - - bump-snd-image - - await-post-upgrade-progress - - upload-report - - # Every seitask container projects Workflow identity via downward API: - # NAME from the chaos-mesh.org/workflow label chaos-mesh stamps on each - # Task pod, NAMESPACE from the pod's own metadata. UID isn't projectable - # so taskruntime.LoadWorkflowIdentity fetches it via the apiserver using - # NAME + NAMESPACE. - - name: provision-validator-chain - templateType: Task - deadline: 25m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-snd - - --role=validator - - --name=$SEI_CHAIN_ID - - --template=/scenarios/major-upgrade/validator.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEI_PRE_UPGRADE_IMG - - --ready-timeout=18m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - # Sets upgrade height = current + 200 blocks (~120s at Sei's ~600ms block - # time) to outlast the 60s gov voting_period plus tally + plan-execution - # slack. Patches the workflow-vars ConfigMap (seeded by provision-snd) with: - # TARGET_HEIGHT -- upgrade height - # UPGRADE_HEIGHT -- consumed by gov-software-upgrade.yaml.tmpl - # POST_UPGRADE_HEIGHT -- TARGET_HEIGHT + 10; liveness check threshold - - name: compute-target-height - templateType: Task - deadline: 5m - task: - container: - name: compute-target-height - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - CUR="" - for i in $(seq 1 30); do - CUR=$(curl -fsS --connect-timeout 3 "${VALIDATOR_TM_RPC}/status" 2>/dev/null \ - | sed -n 's/.*"latest_block_height":"\([0-9]*\)".*/\1/p' || true) - if [ -n "${CUR}" ]; then - echo "got height=${CUR} on attempt=${i}" - break - fi - echo "attempt=${i} RPC not ready yet; retrying in 3s" - sleep 3 - done - if [ -z "${CUR}" ]; then - echo "failed to parse latest_block_height from ${VALIDATOR_TM_RPC}/status after 30 attempts" >&2 - exit 1 - fi - TARGET=$((CUR + 200)) - POST=$((TARGET + 10)) - echo "current=${CUR} target=${TARGET} post=${POST}" - kubectl patch configmap "workflow-vars-major-upgrade-${SEI_WORKFLOW_RUN_ID}" \ - --type=merge \ - --patch "{\"data\":{\"TARGET_HEIGHT\":\"${TARGET}\",\"UPGRADE_HEIGHT\":\"${TARGET}\",\"POST_UPGRADE_HEIGHT\":\"${POST}\"}}" - env: - - name: SEI_WORKFLOW_RUN_ID - value: "$SEI_WORKFLOW_RUN_ID" - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Submits software-upgrade proposal at UPGRADE_HEIGHT via node-0's sidecar. - - name: submit-upgrade-proposal - templateType: Task - deadline: 10m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-software-upgrade.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-0 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=TITLE=major-upgrade scenario - - --var=DESCRIPTION=software-upgrade to $SEI_UPGRADE_NAME - - --var=UPGRADE_NAME=$SEI_UPGRADE_NAME - - --var=UPGRADE_HEIGHT=$(UPGRADE_HEIGHT) - - --var=INITIAL_DEPOSIT=20000000usei - - --var=FEES=10000usei - - --var=GAS=500000 - - --timeout=8m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Polls gov REST for a voting-period proposal whose content.plan.name - # matches $SEI_UPGRADE_NAME (legacy shape) OR messages[].content.plan.name - # (v1 shape). Writes PROPOSAL_ID to workflow-vars. 150 * 2s = 300s window. - - name: resolve-proposal-id - templateType: Task - deadline: 5m - task: - container: - name: resolve-proposal-id - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - for i in $(seq 1 150); do - BODY=$(curl -fsS "${VALIDATOR_REST}/cosmos/gov/v1beta1/proposals?proposal_status=2" || true) - PID=$(printf '%s' "${BODY}" | jq -r --arg n "${SEI_UPGRADE_NAME}" ' - .proposals // [] - | map(select( - (.content.plan.name? == $n) - or (.messages? // [] | map(.content.plan.name? // empty) | index($n)) - )) - | .[0].proposal_id // empty - ') - if [ -n "${PID}" ] && [ "${PID}" != "null" ]; then - echo "resolved proposal_id=${PID} for upgrade=${SEI_UPGRADE_NAME}" - kubectl patch configmap "workflow-vars-major-upgrade-${SEI_WORKFLOW_RUN_ID}" \ - --type=merge \ - --patch "{\"data\":{\"PROPOSAL_ID\":\"${PID}\"}}" - exit 0 - fi - echo "attempt=${i} no voting-period proposal matching ${SEI_UPGRADE_NAME} yet" - sleep 2 - done - echo "timed out resolving PROPOSAL_ID for upgrade=${SEI_UPGRADE_NAME}" >&2 - exit 1 - env: - - name: SEI_UPGRADE_NAME - value: "$SEI_UPGRADE_NAME" - - name: SEI_WORKFLOW_RUN_ID - value: "$SEI_WORKFLOW_RUN_ID" - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # vote-yes-all-validators -- parallel, one CR per validator. - - name: vote-yes-all-validators - templateType: Parallel - deadline: 10m - children: - - vote-node-0 - - vote-node-1 - - vote-node-2 - - vote-node-3 - - - name: vote-node-0 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-0 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: vote-node-1 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-1 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: vote-node-2 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-2 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: vote-node-3 - templateType: Task - deadline: 8m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/gov-vote.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-3 - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=PROPOSAL_ID=$(PROPOSAL_ID) - - --var=OPTION=yes - - --var=FEES=10000usei - - --var=GAS=200000 - - --timeout=5m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Polls REST gov endpoint until status=PROPOSAL_STATUS_PASSED. - - name: wait-for-proposal-to-pass - templateType: Task - deadline: 10m - task: - container: - name: wait-for-pass - image: curlimages/curl:8.10.1 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - for i in $(seq 1 300); do - STATUS=$(curl -fsS "${VALIDATOR_REST}/cosmos/gov/v1beta1/proposals/${PROPOSAL_ID}" \ - | sed -n 's/.*"status":[[:space:]]*"\([A-Z_]*\)".*/\1/p' | head -1) - echo "attempt=${i} proposal=${PROPOSAL_ID} status=${STATUS:-unknown}" - [ "${STATUS}" = "PROPOSAL_STATUS_PASSED" ] && exit 0 - sleep 1 - done - echo "proposal ${PROPOSAL_ID} did not pass within timeout" >&2 - exit 1 - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - # Waits for the chain to reach UPGRADE_HEIGHT and halt before the binary - # swap. The old binary panics ("UPGRADE NEEDED") at UPGRADE_HEIGHT; the new - # binary panics ("BINARY UPDATED BEFORE TRIGGER", sei-cosmos x/upgrade - # abci.go) if it processes ANY block below UPGRADE_HEIGHT. So bump-snd-image - # must land only after every validator has committed UPGRADE_HEIGHT-1 and - # halted. The height can't be polled at that point -- all validators halt - # together and stop serving RPC exactly when the predicate would be true -- - # so this is a fixed wait, not an AwaitCondition. UPGRADE_HEIGHT is current - # + 200 blocks measured at compute-target-height, but the proposal flow - # (~60s voting period + tally) burns most of that budget first, so only - # ~100 blocks (~60s at ~600ms blocks) remain once the proposal has passed. - # Over-waiting is free (the chain just sits halted until the swap); the only - # failure mode is waiting too short. The full wall-clock from height - # measurement to swap (~60s voting + 150s here) must exceed 200 x block_time, - # so block time above ~1s would break it -- raise this if a cold chain's - # early blocks run slow. - - name: settle-into-halt - templateType: Task - deadline: 8m - task: - container: - name: settle-into-halt - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - echo "waiting 150s for the chain to reach UPGRADE_HEIGHT and halt before swapping the binary" - sleep 150 - echo "settle window elapsed; proceeding to bump-snd-image" - - # Bumps the SeiNetwork image to the post-upgrade build in - # a single patch. The SeiNetwork controller re-asserts the new - # image onto every child SeiNode and drives each node's NodeUpdate plan; - # the validators roll together onto the new binary at the upgrade height. - # - # Patches spec.image only -- a merge patch leaves the rest of the spec - # untouched. Per-child UpdateNodeImage is NOT used here: the SeiNetwork - # controller would re-assert spec.image every reconcile, flip-flopping the - # child spec.image and churning the StatefulSet so the rollout never settles - # (observe-image never completes). The SeiNetwork spec.image is the single - # source of truth for child image. - - name: bump-snd-image - templateType: Task - deadline: 5m - task: - container: - name: bump-snd-image - image: alpine/k8s:1.31.0 - command: ["/bin/sh", "-c"] - args: - - | - set -eu - kubectl patch seinetwork "${SEI_CHAIN_ID}" \ - --type=merge \ - --patch "{\"spec\":{\"image\":\"${SEI_POST_UPGRADE_IMG}\"}}" - echo "patched seinetwork/${SEI_CHAIN_ID} image to ${SEI_POST_UPGRADE_IMG}" - env: - - name: SEI_CHAIN_ID - value: "$SEI_CHAIN_ID" - - name: SEI_POST_UPGRADE_IMG - value: "$SEI_POST_UPGRADE_IMG" - - # Liveness: each validator advances past TARGET_HEIGHT+10 - # (= POST_UPGRADE_HEIGHT) after the SeiNetwork rolls all nodes onto the new - # binary. AwaitCondition over the height predicate, one per validator. - - name: await-post-upgrade-progress - templateType: Parallel - deadline: 15m - children: - - await-post-upgrade-progress-node-0 - - await-post-upgrade-progress-node-1 - - await-post-upgrade-progress-node-2 - - await-post-upgrade-progress-node-3 - - - name: await-post-upgrade-progress-node-0 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-0 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: await-post-upgrade-progress-node-1 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-1 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: await-post-upgrade-progress-node-2 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-2 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: await-post-upgrade-progress-node-3 - templateType: Task - deadline: 12m - task: - container: - name: runner - image: $SEITASK_IMAGE - args: - - runner - - --template=/templates/await-condition.yaml.tmpl - - --var=NODE=$SEI_CHAIN_ID-3 - - --var=TARGET_HEIGHT=$(POST_UPGRADE_HEIGHT) - - --timeout=10m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-major-upgrade-$SEI_WORKFLOW_RUN_ID - - - name: upload-report - templateType: Task - deadline: 5m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - upload-report - - --bucket=harbor-validation-results - - --prefix=nightly/major-upgrade/$SEI_WORKFLOW_RUN_ID - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace diff --git a/scenarios/major-upgrade/validator.yaml.tmpl b/scenarios/major-upgrade/validator.yaml.tmpl deleted file mode 100644 index 29d11430..00000000 --- a/scenarios/major-upgrade/validator.yaml.tmpl +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: "{{ .IMAGE }}" - replicas: 4 - configOverrides: - tx_index.indexer: kv - api.rest.enable: "true" - genesis: - chainId: "{{ .CHAIN_ID }}" - overrides: - gov.voting_params.voting_period: "60s" diff --git a/scenarios/release-test.yaml b/scenarios/release-test.yaml deleted file mode 100644 index 07de3bb3..00000000 --- a/scenarios/release-test.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# Chaos Mesh Workflow: release-test scenario. -# -# Provisions a 4-validator chain + 2-RPC fleet, runs the release-test image -# against the RPC endpoints, uploads the run snapshot to S3. First scenario -# composed end-to-end from seitask primitives (keygen, provision-snd, -# upload-report). Workflow-vars CM bridges per-step values; see -# scenarios/major-upgrade.yaml for the pattern. -# -# Cleanup: every per-run resource (SNDs, admin Secret, workflow-vars CM) -# carries ownerRef to this Workflow CR. The wrapper's only cleanup duty is -# `kubectl delete workflow` — kube-controller-manager cascades. -# -# Upload-report is the last Serial child: a failed earlier step bails the -# Serial before upload fires. Phase 2b lifts upload-report into an -# always-fire Parallel branch. -# -# Placeholders (the wrapper envsubst's at apply time): -# $SEI_NAMESPACE namespace of workflow + provisioned SNDs -# $SEI_CHAIN_ID chain id (e.g. "rel-$SEI_WORKFLOW_RUN_ID") -# $SEI_WORKFLOW_RUN_ID unique per-run id; suffixes Workflow + Secret names -# $SEID_IMAGE seid container image -# $SEITASK_IMAGE seitask monolith image (SeiNetwork templates baked in) -# $RELEASE_TEST_IMAGE release-test harness image ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: Workflow -metadata: - name: release-test-$SEI_WORKFLOW_RUN_ID - labels: - sei.io/scenario: release-test - sei.io/workflow-run: "$SEI_WORKFLOW_RUN_ID" -spec: - entry: release-test - templates: - - name: release-test - templateType: Serial - deadline: 60m - children: - - keygen-admin - - provision-validator-chain - - provision-rpc-fleet - - run-release-test - - upload-report - - # Every seitask container projects Workflow identity via downward API: - # NAME from the chaos-mesh.org/workflow label chaos-mesh stamps on - # each Task pod, NAMESPACE from the pod's own metadata. UID isn't - # projectable so taskruntime.LoadWorkflowIdentity fetches it via the - # apiserver using NAME + NAMESPACE. - - name: keygen-admin - templateType: Task - deadline: 2m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - keygen - - --key-name=admin - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - # Inner --ready-timeout + default first-block-timeout sit 2m below the - # Task deadline so provision-snd's typed exit reaches the parent before - # chaos-mesh kills the pod on deadline exceeded. - - name: provision-validator-chain - templateType: Task - deadline: 25m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-snd - - --role=validator - - --name=$SEI_CHAIN_ID - - --template=/scenarios/release-test/validator.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --var=ADMIN_ADDRESS=$(ADMIN_ADDRESS) - - --ready-timeout=18m - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-release-test-$SEI_WORKFLOW_RUN_ID - - - name: provision-rpc-fleet - # 32m covers worst-case sequential readiness: running-timeout 18m + - # WaitCaughtUp + WaitEVMServing at first-block-timeout 5m each - # (N=1 → 18m + 2×5m = 28m), plus 4m headroom. The Chaos-Mesh deadline - # clock starts at Task admission, so the headroom must also absorb pod - # scheduling + a cold $SEITASK_IMAGE pull (both outside the inner 28m) — - # else a slow-but-genuine readiness is killed opaquely before its typed - # exit lands, inverting the very invariant this budget protects. - templateType: Task - deadline: 32m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - provision-node # rpc follower is a standalone SeiNode, not a SeiNetwork - - --role=rpc - - --name=$SEI_CHAIN_ID-rpc # BASE name; follower is -0 - - --replicas=1 # mocha hits a single RPC (RPC_TM_RPC/EVM/REST) - - --network=$SEI_CHAIN_ID # peer auto-wire to the genesis SeiNetwork (sei.io/seinetwork=) - - --template=/scenarios/release-test/rpc.yaml.tmpl - - --var=CHAIN_ID=$SEI_CHAIN_ID - - --var=IMAGE=$SEID_IMAGE - - --running-timeout=18m # was --ready-timeout; SeiNode has no Ready phase (default --first-block-timeout=5m) - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - envFrom: - - configMapRef: - name: workflow-vars-release-test-$SEI_WORKFLOW_RUN_ID - - # With one rpc follower (replicas=1), TM RPC + REST + EVM RPC all resolve to - # that node via provision-node's RoleScoped(rpc, *) off node-0's - # .status.endpoint. A single node also gives the stateful sequences - # (sei_newFilter + sei_getFilterLogs, eth_sendRawTransaction + tx.wait) one - # consistent mempool + filter-store view. - - name: run-release-test - templateType: Task - deadline: 30m - task: - container: - name: release-test - image: $RELEASE_TEST_IMAGE - envFrom: - - configMapRef: - name: workflow-vars-release-test-$SEI_WORKFLOW_RUN_ID - env: - - {name: TEST_TARGET, value: chain-agnostic} - - {name: SEI_CHAIN_ID, value: $(CHAIN_ID)} - - {name: SEI_ADMIN_ADDRESS, value: $(ADMIN_ADDRESS)} - - {name: SEI_TENDERMINT_RPC, value: $(RPC_TM_RPC)} - - {name: SEI_EVM_JSON_RPC, value: $(RPC_EVM_RPC)} - - {name: SEI_REST_ENDPOINT, value: $(RPC_REST)} - - name: SEI_ADMIN_MNEMONIC - valueFrom: - secretKeyRef: - name: admin-release-test-$SEI_WORKFLOW_RUN_ID - key: mnemonic - resources: - requests: {cpu: 500m, memory: 1Gi} - limits: {memory: 2Gi} - - - name: upload-report - templateType: Task - deadline: 5m - task: - container: - name: seitask - image: $SEITASK_IMAGE - args: - - upload-report - - --bucket=harbor-validation-results - - --prefix=nightly/release-test/$SEI_WORKFLOW_RUN_ID - env: - - name: SEI_WORKFLOW_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['chaos-mesh.org/workflow'] - - name: SEI_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace diff --git a/scenarios/release-test/rpc.yaml.tmpl b/scenarios/release-test/rpc.yaml.tmpl deleted file mode 100644 index dc870b7c..00000000 --- a/scenarios/release-test/rpc.yaml.tmpl +++ /dev/null @@ -1,20 +0,0 @@ -# Standalone follower SeiNode. provision-node renders this once per replica, -# stamping metadata.name=- (e.g. -rpc-0), namespace, -# ownerRef->Workflow, the sei.io/role=node + sei.io/seinetwork= object -# labels, and a synthesized peers[].label.selector{sei.io/seinetwork:}. -# This template describes the node only -- never its name, peering, or topology. -apiVersion: sei.io/v1alpha1 -kind: SeiNode -metadata: - name: PLACEHOLDER # overwritten to - by provision-node -spec: - chainId: "{{ .CHAIN_ID }}" - image: "{{ .IMAGE }}" - fullNode: {} # the rpc role = EVM-serving full node - overrides: - tx_index.indexer: kv - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - mempool.ttl_duration: 60s - network.rpc.lag_threshold: "2" - evm.enabled_legacy_sei_apis: sei_getLogs,sei_getBlockByNumber,sei_getBlockByHash,sei_getSeiAddress,sei_getEVMAddress,sei_getCosmosTx,sei_getEvmTx,sei_newFilter,sei_getFilterLogs diff --git a/scenarios/release-test/validator.yaml.tmpl b/scenarios/release-test/validator.yaml.tmpl deleted file mode 100644 index 1b7a17bd..00000000 --- a/scenarios/release-test/validator.yaml.tmpl +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: PLACEHOLDER -spec: - image: "{{ .IMAGE }}" - replicas: 4 - configOverrides: - tx_index.indexer: kv - storage.state_commit.write_mode: memiavl_only - storage.state_store.write_mode: memiavl_only - mempool.ttl_duration: 60s - genesis: - chainId: "{{ .CHAIN_ID }}" - accounts: - - address: "{{ .ADMIN_ADDRESS }}" - balance: 1000000000000usei diff --git a/scenarios/testnet-deployment.yaml b/scenarios/testnet-deployment.yaml deleted file mode 100644 index 86000727..00000000 --- a/scenarios/testnet-deployment.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Reference 4-validator SeiNetwork that the major-upgrade Workflow -# can target. NOT a production manifest -- intended for ephemeral harbor -# dev cluster testnets. Adjust .spec.image and genesis.chainId to -# match the upgrade you are exercising. -# -# After apply, wait for status.replicas==status.readyReplicas==4 and all -# 4 SeiNodes in phase Running before applying scenarios/major-upgrade.yaml. ---- -apiVersion: v1 -kind: Namespace -metadata: - name: majorupgrade - labels: - pod-security.kubernetes.io/enforce: restricted - pod-security.kubernetes.io/audit: restricted - pod-security.kubernetes.io/warn: restricted - sei.io/scenario: major-upgrade ---- -apiVersion: sei.io/v1alpha1 -kind: SeiNetwork -metadata: - name: majorupgrade - namespace: majorupgrade -spec: - replicas: 4 - # PRE_UPGRADE image -- bump in lockstep with $SEI_PRE_UPGRADE_IMG in - # the Workflow apply step. - image: ghcr.io/sei-protocol/sei:v6.3.0 - genesis: - chainId: majorupgrade-1 - stakingAmount: "10000000usei" - sidecar: - image: ghcr.io/sei-protocol/seictl:v0.0.29 diff --git a/sdk/sei/.xreview/sdk-task-surface.md b/sdk/sei/.xreview/sdk-task-surface.md deleted file mode 100644 index 04e27516..00000000 --- a/sdk/sei/.xreview/sdk-task-surface.md +++ /dev/null @@ -1,46 +0,0 @@ -# xreview ledger — SDK SeiNodeTask surface (WS-G) - -Class: component (public SDK surface over the SeiNodeTask CRD) -Tier: T2 - -Target: `sdk/sei/task.go`, `sdk/sei/provider.go`, `sdk/sei/provider/k8s/{render,handle,k8s}.go`, stubs + tests -Artifact: branch `feat/sdk-task-surface` (diff /tmp/wsg-task-surface.diff) - -## Round 1 - -State: RESOLVED -OpenFindings: 0 -Convergence: independent (4 blinded reviewers) -Blinded: yes -Dissenter: sei-network-specialist (DISSENT → resolved) - -Slate: kubernetes-specialist (CRD-contract), idiomatic-reviewer (Go idiom), systems-engineer (poll/error contract), sei-network-specialist (dissenter, upgrade semantics). - -### Boundary table - -| Boundary | Provider | Consumer | Status | Evidence | Raised by | -|---|---|---|---|---|---| -| GovSoftwareUpgrade proposal-ID handoff | nodetask controller | harness (GovVote input) | **MISMATCH → FIXED** | `controller.go:360-369` populateOutputs only handles UpdateNodeImage; gov/await outputs never written (chain-as-medium by design). SDK advertised `ProposalID` as "the GovVote input" → always 0 → GovVote.proposalId Minimum=1 admission reject (`seinodetask_types.go:366`). | dissenter (lead), k8s, systems | -| UpdateNodeImage RequirePhase on halted node | nodetask controller | harness (step 4) | **MISMATCH → FIXED** | Gate is `==` exact-match defaulting Running (`controller.go:195-199`); a node halted at upgrade height still reports Running (phase sticky). SDK doc+test told callers to relax to Pending → terminal timeout. | dissenter | -| Payload field mapping (4 kinds) | CRD | renderTask | COMPATIBLE | All fields field-for-field congruent (render.go). | k8s | -| SSA / status subresource | CRD | k8s apply | COMPATIBLE | Main-resource Apply; status subresource separate — no status stomp. | k8s | -| WaitComplete poll loop | — | harness | **MISMATCH → FIXED** | Only NotFound tolerated; a transient Get error aborts a multi-minute wait. Tolerate retryable (ServerTimeout/TooManyRequests/InternalError). | systems | -| Complete + nil outputs | controller | WaitComplete | MISMATCH → MITIGATED | `(nil,nil)` nil-deref hazard; largely mooted by removing the unpopulated gov output types. | systems, k8s | -| Resubmit / idempotency | CRD + controller | RunTask | MISSING → DOCUMENTED | same-name re-apply is a no-op (no double-submit); delete+recreate resubmits the gov-tx. Doc'd on RunTask. CEL immutability + on-chain dedup are later coordinated CRD work. | systems | -| GovVote per-validator key derivation | controller | harness | COMPATIBLE | `KeyName:""` derives per-target SeiNode key (`seinodetask_params.go:120,231`); no shared-key assumption. | dissenter | - -### Idiom addendum (idiomatic-reviewer — RATIFY) -Clean. No correctness/divergence-with-consequence findings. Endorsed `WaitComplete (*TaskOutputs, error)` as the correct one-shot-terminal shape. Two pure-style notes accepted as-is. Process note: add a provider-side one-output-per-Kind test. - -### Resolutions (this PR, no controller change) -1. **ProposalID lie:** removed `GovSoftwareUpgradeOutputs`/`GovVoteOutputs`/`AwaitNodesAtHeightOutputs` from the SDK (all structurally unpopulated); `TaskOutputs` now carries only `UpdateNodeImage` (the sole kind populateOutputs writes). Package + payload docs rewritten to the chain-as-medium reality. -2. **RequirePhase backwards:** UpdateNodeImage doc fixed — a halted node still reports Running, so the default is correct; removed the relax-to-Pending guidance; repurposed the test to verify mechanical RequirePhase override (not the upgrade-failure pattern). -3. **Transient-error tolerance:** WaitComplete keeps polling on retryable Get errors. -4. **Cheap hardening:** validateTaskSpec now validates GovVote.Option enum + rejects 00) — proves the handler ran. | -| Enveloped-only /status decode | **MISMATCH → FIXED** | pollHeight re-models /status wrapped-only; the Sei fork sometimes answers unwrapped (SDK latestHeight handles both) → spins forever on such a node. | idiom | Promote SDK latestHeight → exported sei.LatestHeight (dual-shape); suite consumes it. | -| Diagnosability of timeouts | **MISMATCH → FIXED** | Poll helpers swallow last-seen height/status into bare deadline errors — below the WaitHeightAdvances bar; suite is unattended-nightly. | systems | pollREST threads a last-seen string into the deadline error; height polls use LatestHeight's value. | -| Task GC label | **MISSING → FIXED** | Task CRs carry no sei.io/harness-run label → leak on abnormal exit. | k8s | Added SDK TaskSpec.Labels (mirrors NetworkSpec.Labels); suite stamps runLabelKey. | -| Vote error fan-in | flag → FIXED | Only first-in-slice error surfaced. | systems, idiom | errors.Join across all validators. | -| NetworkSpec drift (provision vs bump) | flag → FIXED | Two hand-duplicated literals; a future field added to one strips it on the other via ForceOwnership. | systems, k8s | Single networkSpec builder; bump mutates only Image. | -| Image bump full-spec SSA re-apply | COMPATIBLE | k8s read the controller: it never writes the parent spec (finalizer + status only); genesis re-stamp idempotent (ceremony latched, nodes.go:91); no replica churn; no SSA conflict. Verified equivalent to `patch spec.image`. | k8s (refutes dissenter concern) | Kept; builder shared. | -| Concurrency / leaks / SIGTERM | COMPATIBLE | Race-free vote fan-out, body closed, ctx nesting + NotifyContext match sibling suites. | systems | — | -| Validator naming / namespace co-location | COMPATIBLE | - 0-based matches controller labels.go; task/target/pods co-located. | k8s | — | - -### Idiom addendum (RATIFY) -Reads native (env+spec idiom, helpers, comment register, build-tag). Divergence-with-consequence = the /status + poll duplication of the SDK (resolved by exporting LatestHeight; gov-REST polling stays harness-local as gov-query orchestration, not readiness — matches the scope rule). - -### Deferred (not blocking) -- Parent 60m can fire mid-child-step → misattributed error (systems): generous envelope; un-defer on first spurious occurrence. -- min_deposit / deposit-period hang (dissenter): params match the proven scenario (20000000usei clears min_deposit); the last-seen diagnostic surfaces a stuck deposit-period proposal if it ever regresses. From 7700b154b5a6d0e6752f8a38372ff9e36ce09803 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 23 Jun 2026 13:53:06 -0700 Subject: [PATCH 2/5] chore: crisp migration-narrative comments in the harness + SDK Drop the 'Go-native' framing and the 'replaces the Chaos-Mesh Workflow / seitask-runner' narrative from doc comments (incl. now-dangling refs to the deleted machinery); keep the technical rationale (chain-as-medium, the env-alias superset the release-test image reads). Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ecr.yml | 5 ++--- sdk/sei/sei.go | 6 +++--- sdk/sei/task.go | 7 +++---- test/integration/Dockerfile | 9 ++++----- test/integration/release_test.go | 18 ++++++++---------- test/integration/upgrade_test.go | 4 ++-- 6 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ecr.yml b/.github/workflows/ecr.yml index b2786241..5192a748 100644 --- a/.github/workflows/ecr.yml +++ b/.github/workflows/ecr.yml @@ -40,9 +40,8 @@ jobs: cache-from: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared cache-to: type=registry,ref=${{ steps.ecr-login.outputs.registry }}/sei/build-cache:shared,mode=max - # The Go-native integration harness (go test -c -tags integration), run by - # one CronJob per target (-test.run TestX). Replaces seitask-runner + the - # Chaos-Mesh Workflow scenarios once the nightly CronJobs cut over. + # The integration test suite compiled to an image (go test -c -tags + # integration), run by one nightly CronJob per target (-test.run TestX). - name: Build and push integration-harness image uses: docker/build-push-action@v6 with: diff --git a/sdk/sei/sei.go b/sdk/sei/sei.go index a7fa3b21..8aec62b2 100644 --- a/sdk/sei/sei.go +++ b/sdk/sei/sei.go @@ -1,4 +1,4 @@ -// Package sei is a thin, typed, stateless, multi-mode Go-native API for +// Package sei is a thin, typed, stateless, multi-mode API for // SeiNetwork/SeiNode lifecycle. It mirrors database/sql: a provider registers in // init(), the consumer blank-imports it, and Open selects the mode by name. // @@ -130,7 +130,7 @@ func (c *Client) GetNode(ctx context.Context, name, namespace string) (*Node, er return &Node{handle: h}, nil } -// Network is a Go-native handle to a SeiNetwork. Endpoint getters read the +// Network is a handle to a SeiNetwork. Endpoint getters read the // runtime's status verbatim — never reconstructed. type Network struct{ handle NetworkHandle } @@ -159,7 +159,7 @@ func (n *Network) Delete(ctx context.Context) error { return n.handle.Delete(ctx // type-asserts; local/docker stubs return nil. func (n *Network) Object() any { return n.handle.Object() } -// Node is a Go-native handle to a SeiNode. +// Node is a handle to a SeiNode. type Node struct{ handle NodeHandle } // Name is the SeiNode resource name. diff --git a/sdk/sei/task.go b/sdk/sei/task.go index c7e22079..748a46ef 100644 --- a/sdk/sei/task.go +++ b/sdk/sei/task.go @@ -8,9 +8,8 @@ import ( // Task support. A SeiNodeTask is a one-shot, typed operation against a single // SeiNode — submit a gov upgrade proposal, vote, wait for a height, swap the -// node image. The harness drives a major-upgrade or release scenario by running -// these in statement order. This replaces the Chaos-Mesh Workflow DAG + env-file -// handoffs the seitask-runner used. +// node image. A caller drives a major-upgrade or release flow by running these +// in statement order. // // Cross-task coordination is chain-as-medium, NOT task-to-task output currying: // the controller surfaces typed Outputs only for UpdateNodeImage today (the gov @@ -190,7 +189,7 @@ func (c *Client) GetTask(ctx context.Context, name, namespace string) (*Task, er return &Task{handle: h}, nil } -// Task is a Go-native handle to a SeiNodeTask. +// Task is a handle to a SeiNodeTask. type Task struct{ handle TaskHandle } // Name is the SeiNodeTask resource name. diff --git a/test/integration/Dockerfile b/test/integration/Dockerfile index e5bfc383..98b848e9 100644 --- a/test/integration/Dockerfile +++ b/test/integration/Dockerfile @@ -1,8 +1,7 @@ -# The integration harness image: the build-tagged test binary, compiled once and -# run by one in-cluster CronJob per target (args: -test.run TestX). It replaces -# the seitask-runner image + the Chaos-Mesh Workflow scenarios — the suites carry -# their fault/seiload templates via //go:embed, so the binary is self-contained -# (no scenario files to COPY). +# The integration test suite compiled to an image: the build-tagged test binary, +# run by one in-cluster CronJob per target (args: -test.run TestX). The suites +# carry their fault/seiload templates via //go:embed, so the binary is +# self-contained (no extra files to COPY). FROM golang:1.26 AS builder ARG TARGETOS ARG TARGETARCH diff --git a/test/integration/release_test.go b/test/integration/release_test.go index 68809eb8..ffb160dd 100644 --- a/test/integration/release_test.go +++ b/test/integration/release_test.go @@ -53,8 +53,7 @@ var releaseRPCConfig = map[string]string{ // the external release-test image against the RPC node as a Job. The release-test // image owns the functional assertions (TEST_TARGET=chain-agnostic); the suite's // job is to stand up the chain, hand the harness its endpoints + admin key, and -// gate on the Job's exit code. Replaces the Chaos-Mesh Workflow's keygen + -// provision + run steps with statement order + the SDK. +// gate on the Job's exit code. // // One RPC node (not the load suite's two) is deliberate: the harness runs // stateful EVM-filter and send-then-wait sequences that need one consistent @@ -189,9 +188,8 @@ func TestRelease(t *testing.T) { } // createMnemonicSecret writes the admin mnemonic to a Secret the release-test pod -// reads via secretKeyRef. Labeled for the GC sweep; deleted on cleanup. (The -// seitask-runner stamps an ownerRef instead — the harness uses the run label + -// t.Cleanup, matching how it provisions everything else.) +// reads via secretKeyRef. Labeled for the GC sweep and deleted on cleanup, +// matching how the suite manages everything else it creates. func createMnemonicSecret( ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, name string, labels map[string]string, mnemonic string, @@ -246,10 +244,10 @@ func releaseJob(p releaseParams) *batchv1.Job { Containers: []corev1.Container{{ Name: "release-test", Image: p.image, - // The scenario projects the workflow-vars CM (RPC_*/CHAIN_ID/ - // ADMIN_ADDRESS) via envFrom ON TOP of the explicit SEI_* list; - // reproduce that superset so a harness sub-case reading e.g. - // RPC_EVM_RPC_LIST isn't silently unset (a skip-but-exit-0). + // The release-test image reads both the SEI_* names and the + // RPC_*/CHAIN_ID/ADMIN_ADDRESS names; provide both so a + // sub-case reading e.g. RPC_EVM_RPC_LIST isn't silently unset + // (which would skip-but-exit-0). Env: []corev1.EnvVar{ {Name: "TEST_TARGET", Value: "chain-agnostic"}, {Name: "SEI_CHAIN_ID", Value: p.chainID}, @@ -257,7 +255,7 @@ func releaseJob(p releaseParams) *batchv1.Job { {Name: "SEI_TENDERMINT_RPC", Value: p.tmRPC}, {Name: "SEI_EVM_JSON_RPC", Value: p.evmRPC}, {Name: "SEI_REST_ENDPOINT", Value: p.rest}, - // workflow-vars CM superset (the scenario's envFrom). + // The RPC_*/CHAIN_ID/ADMIN_ADDRESS aliases the image also reads. {Name: "CHAIN_ID", Value: p.chainID}, {Name: "ADMIN_ADDRESS", Value: p.adminAddr}, {Name: "RPC_TM_RPC", Value: p.tmRPC}, diff --git a/test/integration/upgrade_test.go b/test/integration/upgrade_test.go index 9add2d79..ee49f5b5 100644 --- a/test/integration/upgrade_test.go +++ b/test/integration/upgrade_test.go @@ -75,8 +75,8 @@ var upgradeConfig = map[string]string{ const restUnreachable = "REST unreachable / non-200" // TestChainUpgrade drives a Sei major software upgrade end-to-end through the SDK -// task surface, replacing the Chaos-Mesh Workflow DAG: provision a 4-validator -// chain on the pre-upgrade image -> submit a GovSoftwareUpgrade proposal -> +// task surface: provision a 4-validator chain on the pre-upgrade image -> submit +// a GovSoftwareUpgrade proposal -> // resolve its ID from the chain's gov REST (chain-as-medium, since the controller // does not surface it as a task output) -> vote yes from every validator -> wait // for it to pass -> let the chain halt at the upgrade height -> bump the From 44e02371935a192144ce5ca2f2b83b62d7d14dc5 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 23 Jun 2026 13:57:40 -0700 Subject: [PATCH 3/5] chore: regenerate RBAC after removing the runner's chaos-mesh markers Deleting internal/runner dropped its +kubebuilder:rbac markers for chaos-mesh workflows/workflownodes, which controller-gen had aggregated into the manager role. Regenerate so verify-generated passes; the manager never used that access (it was the runner's). Co-Authored-By: Claude Opus 4.8 --- config/rbac/role.yaml | 8 -------- manifests/role.yaml | 8 -------- 2 files changed, 16 deletions(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9dccc620..76cd7fd3 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -67,14 +67,6 @@ rules: - patch - update - watch -- apiGroups: - - chaos-mesh.org - resources: - - workflownodes - - workflows - verbs: - - get - - list - apiGroups: - sei.io resources: diff --git a/manifests/role.yaml b/manifests/role.yaml index 9dccc620..76cd7fd3 100644 --- a/manifests/role.yaml +++ b/manifests/role.yaml @@ -67,14 +67,6 @@ rules: - patch - update - watch -- apiGroups: - - chaos-mesh.org - resources: - - workflownodes - - workflows - verbs: - - get - - list - apiGroups: - sei.io resources: From 2d023d252d4bedbde973b15b7d93e8887a986d07 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 23 Jun 2026 14:35:01 -0700 Subject: [PATCH 4/5] =?UTF-8?q?chore:=20resolve=20deletion=20xreview=20?= =?UTF-8?q?=E2=80=94=20drop=20dangling=20refs=20to=20removed=20machinery?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coral slate (k8s + idiom) on the deletion PR: - The SeiNodeTask CRD type doc comments named the seitask-runner / Chaos Workflow layer as where to express fan-out — and the spec.target one shipped into the served CRD description (kubectl explain would cite deleted machinery). Re-point to "the orchestrating caller (one task per node)" and regenerate config/crd + manifests. - Crisp the remaining dangling "scenario" / "scenarios/major-upgrade.yaml" references in upgrade_test.go + release_test.go (keep the technical rationale, drop the provenance trailers + the broken line-flow). Co-Authored-By: Claude Opus 4.8 --- api/v1alpha1/seinodetask_types.go | 9 ++++----- config/crd/sei.io_seinodetasks.yaml | 2 +- manifests/sei.io_seinodetasks.yaml | 2 +- test/integration/release_test.go | 17 +++++++---------- test/integration/upgrade_test.go | 25 ++++++++++++------------- 5 files changed, 25 insertions(+), 30 deletions(-) diff --git a/api/v1alpha1/seinodetask_types.go b/api/v1alpha1/seinodetask_types.go index 87f8b926..1f611ce9 100644 --- a/api/v1alpha1/seinodetask_types.go +++ b/api/v1alpha1/seinodetask_types.go @@ -85,8 +85,7 @@ const ( const ( // ConditionSeiNodeTaskReady reflects whether the task has reached a // terminal successful state. True only when status.phase == Complete. - // Load-bearing for `kubectl wait --for=condition=Ready=true` in the - // seitask-runner. + // Load-bearing for `kubectl wait --for=condition=Ready=true`. ConditionSeiNodeTaskReady = "Ready" // ConditionSeiNodeTaskFailed reflects whether the task has reached a @@ -131,7 +130,7 @@ type SeiNodeTaskSpec struct { // Target identifies the single SeiNode this task operates on. Fan-out // targeting (label selectors) is intentionally out of scope at the CRD - // layer — express fan-out at the seitask-runner / Chaos Workflow layer. + // layer — express fan-out in the orchestrating caller (one task per node). Target SeiNodeTaskTarget `json:"target"` // TimeoutSeconds bounds execution time, measured from @@ -179,8 +178,8 @@ type SeiNodeTaskSpec struct { } // SeiNodeTaskTarget identifies the single SeiNode this task operates on. -// Selector-based fan-out is intentionally out of scope for MVP — express -// multi-node operations at the seitask-runner / Chaos Workflow layer. +// Selector-based fan-out is intentionally out of scope — express multi-node +// operations in the orchestrating caller (one task per node). type SeiNodeTaskTarget struct { // NodeRef is a same-namespace reference to a SeiNode. NodeRef SeiNodeTaskNodeRef `json:"nodeRef"` diff --git a/config/crd/sei.io_seinodetasks.yaml b/config/crd/sei.io_seinodetasks.yaml index 351caca6..4f595fc6 100644 --- a/config/crd/sei.io_seinodetasks.yaml +++ b/config/crd/sei.io_seinodetasks.yaml @@ -349,7 +349,7 @@ spec: description: |- Target identifies the single SeiNode this task operates on. Fan-out targeting (label selectors) is intentionally out of scope at the CRD - layer — express fan-out at the seitask-runner / Chaos Workflow layer. + layer — express fan-out in the orchestrating caller (one task per node). properties: nodeRef: description: NodeRef is a same-namespace reference to a SeiNode. diff --git a/manifests/sei.io_seinodetasks.yaml b/manifests/sei.io_seinodetasks.yaml index 351caca6..4f595fc6 100644 --- a/manifests/sei.io_seinodetasks.yaml +++ b/manifests/sei.io_seinodetasks.yaml @@ -349,7 +349,7 @@ spec: description: |- Target identifies the single SeiNode this task operates on. Fan-out targeting (label selectors) is intentionally out of scope at the CRD - layer — express fan-out at the seitask-runner / Chaos Workflow layer. + layer — express fan-out in the orchestrating caller (one task per node). properties: nodeRef: description: NodeRef is a same-namespace reference to a SeiNode. diff --git a/test/integration/release_test.go b/test/integration/release_test.go index ffb160dd..49ae922c 100644 --- a/test/integration/release_test.go +++ b/test/integration/release_test.go @@ -22,14 +22,12 @@ import ( ) // releaseAdminBalance funds the admin account in genesis so the release-test -// harness can sign and pay for the txs it issues. Ported from the release-test -// scenario's validator template. +// harness can sign and pay for the txs it issues. const releaseAdminBalance = "1000000000000usei" // releaseBaseConfig is the seid config the release chain runs with: the memiavl // storage baseline (the nightly image rejects the cosmos_only default) plus kv tx -// indexing (the harness queries txs) and a short mempool TTL. Ported from the -// release-test scenario's validator + rpc configOverrides. +// indexing (the harness queries txs) and a short mempool TTL. var releaseBaseConfig = mergeConfig(memiavlStorageConfig, map[string]string{ "tx_index.indexer": "kv", "mempool.ttl_duration": "60s", @@ -48,7 +46,7 @@ var releaseRPCConfig = map[string]string{ "evm.enabled_legacy_sei_apis": releaseLegacyEVMAPIs, } -// TestRelease drives the release-validation scenario: provision a 4-validator +// TestRelease drives the release-validation flow: provision a 4-validator // chain + one EVM-serving RPC follower, generate a funded admin account, and run // the external release-test image against the RPC node as a Job. The release-test // image owns the functional assertions (TEST_TARGET=chain-agnostic); the suite's @@ -175,8 +173,8 @@ func TestRelease(t *testing.T) { waitJob(ctx, t, cs, net.Namespace(), job.Name) // Archive the harness output even on success: exit 0 alone doesn't show which - // sub-cases ran, so a skip-but-pass is otherwise invisible (the scenario's - // upload-report served this; an S3 report is the deferred telemetry step). + // sub-cases ran, so a skip-but-pass is otherwise invisible. (A durable S3 + // report is a deferred telemetry step.) t.Logf("release-test job completed; harness log tail:\n%s", podLogTail(ctx, cs, net.Namespace(), job.Name)) // The chain stayed live through the release suite: the follower is still @@ -220,9 +218,8 @@ type releaseParams struct { // releaseJob builds the release-test Job: the external harness image, fed the // chain endpoints + admin identity, run once (no retry) with a self-terminating -// deadline. Resources + ttl match the scenario's run-release-test step (which the -// nightly — an unenforced-PSS namespace — runs without a securityContext, so this -// stays faithful rather than imposing one the harness image may not tolerate). +// deadline. No securityContext: nightly is an unenforced-PSS namespace, so this +// avoids imposing one the harness image may not tolerate (it writes a keyring). func releaseJob(p releaseParams) *batchv1.Job { backoff := int32(0) deadline := int64(60 * 60) // the chain-agnostic harness runs >35m against one RPC node; generous cap diff --git a/test/integration/upgrade_test.go b/test/integration/upgrade_test.go index ee49f5b5..04ef83e6 100644 --- a/test/integration/upgrade_test.go +++ b/test/integration/upgrade_test.go @@ -18,9 +18,9 @@ import ( "github.com/sei-protocol/sei-k8s-controller/sdk/sei" ) -// Gov tx parameters for the upgrade flow, ported from the major-upgrade scenario -// (scenarios/major-upgrade.yaml). usei-only; the deposit must clear the chain's -// min_deposit so the proposal enters voting immediately (not the deposit period). +// Gov tx parameters for the upgrade flow. usei-only; the deposit must clear the +// chain's min_deposit so the proposal enters voting immediately (not the deposit +// period). const ( upgradeDeposit = "20000000usei" govFees = "10000usei" @@ -42,9 +42,9 @@ const ( // haltPollMargin is how many blocks BEFORE the upgrade height the suite stops // polling: it polls the (load-balanced) aggregate RPC only while the chain is // still serving, then settles. At the halt itself every validator stops - // serving RPC simultaneously, so the halt height is unpollable (the scenario - // uses a fixed wait for the same reason) — polling to a pre-halt height keeps - // the endpoint alive while still confirming the chain is about to halt. + // serving RPC simultaneously, so the halt height is unpollable — polling to a + // pre-halt height keeps the endpoint alive while still confirming the chain is + // about to halt. haltPollMargin = 10 // haltSettle bounds the wait, after the chain reaches the pre-halt height, for // the remaining blocks to commit and every validator to halt at the upgrade @@ -64,8 +64,7 @@ var votingPeriodGenesis = map[string]string{ // upgradeConfig are the seid runtime overrides the upgrade flow needs: the REST // API serves the gov proposal queries (off by default), and kv tx-indexing lets -// the proposal-submission tx be found. Ported from the major-upgrade scenario's -// SeiNetwork configOverrides. +// the proposal-submission tx be found. var upgradeConfig = map[string]string{ "api.rest.enable": "true", "tx_index.indexer": "kv", @@ -75,10 +74,10 @@ var upgradeConfig = map[string]string{ const restUnreachable = "REST unreachable / non-200" // TestChainUpgrade drives a Sei major software upgrade end-to-end through the SDK -// task surface: provision a 4-validator chain on the pre-upgrade image -> submit -// a GovSoftwareUpgrade proposal -> -// resolve its ID from the chain's gov REST (chain-as-medium, since the controller -// does not surface it as a task output) -> vote yes from every validator -> wait +// task surface: provision a 4-validator chain on the pre-upgrade image -> submit a +// GovSoftwareUpgrade proposal -> resolve its ID from the chain's gov REST +// (chain-as-medium, since the controller does not surface it as a task output) -> +// vote yes from every validator -> wait // for it to pass -> let the chain halt at the upgrade height -> bump the // SeiNetwork image to the post-upgrade build -> assert the upgrade handler ran // and every validator resumed past the upgrade height. @@ -388,7 +387,7 @@ func taskName(chainID, step string) string { // govProposal models just enough of a proposal to resolve an upgrade proposal by // its plan name and read its status. The legacy (v1beta1) shape carries the plan // at content.plan.name; the v1 shape carries it under messages[].content.plan.name -// — both are accepted, matching the scenario's resolver. +// — both are accepted. type govProposal struct { ProposalID string `json:"proposal_id"` Status string `json:"status"` From 1e3f5a8abc37bfc327eea4f643e4e8d07f520f89 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 23 Jun 2026 14:41:57 -0700 Subject: [PATCH 5/5] chore: clear residual seitask references (confirmation re-review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Idiom + k8s confirmation pass follow-ups: - Tidy the rewrapped TestChainUpgrade doc line-flow. - Re-point CLAUDE.md's Ready/Failed-exception rationale off "the seitask-runner depends on kubectl wait" to "consumers wait on" (the contract is unchanged; the named consumer was deleted). - Drop the deleted seitask from the SDK's consumer-naming comments (labels.go, readiness.go, render.go, sdk/CLAUDE.md) — seictl stays. - gitignore the stray local seitask build artifact. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 2 +- sdk/CLAUDE.md | 6 +++--- sdk/sei/labels.go | 13 ++++++------- sdk/sei/provider/k8s/render.go | 3 +-- sdk/sei/readiness.go | 4 ++-- test/integration/upgrade_test.go | 8 ++++---- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index cd3855d5..d4edb250 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -64,7 +64,7 @@ setCondition(obj, ConditionNetworkingReady, metav1.ConditionFalse, The narrow exceptions to the always-present rule: - **`*Needed`-style conditions** where `True` is the exception and `False` would be tautological with the absence of the feature. No current instances in this codebase; the exception is retained for future conditions where it genuinely fits. -- **`kubectl wait` consumer conditions** where present-vs-absent semantics are explicitly load-bearing. `SeiNodeTask.Status.Conditions[Ready|Failed]` is documented as latch-on-terminal-state because the seitask-runner depends on `kubectl wait --for=condition=Ready=true` (which matches `True` only) and `--for=condition=Failed=true` as the dual exit signal. The Ready+Failed pair is the documented exception to the "no mixed polarities for the same subject" rule below — both latch independently on terminal state. +- **`kubectl wait` consumer conditions** where present-vs-absent semantics are explicitly load-bearing. `SeiNodeTask.Status.Conditions[Ready|Failed]` is documented as latch-on-terminal-state because consumers wait on `kubectl wait --for=condition=Ready=true` (which matches `True` only) and `--for=condition=Failed=true` as the dual exit signal. The Ready+Failed pair is the documented exception to the "no mixed polarities for the same subject" rule below — both latch independently on terminal state. Any new condition that doesn't fit one of these exceptions defaults to always-present. diff --git a/sdk/CLAUDE.md b/sdk/CLAUDE.md index 4f701fa7..0d10b20c 100644 --- a/sdk/CLAUDE.md +++ b/sdk/CLAUDE.md @@ -47,8 +47,8 @@ timeout spec fields; `sei.IsTimeout(err)` reports a deadline. **`WaitCaughtUp` / `WaitEVMServing` = STRICT readiness** (`readiness.go`). The caught-up gate (TM `/status`: `height>1 && catching_up==false`) and the EVM serve gate (`eth_blockNumber` bound) — the heavier contract `WaitReady` deliberately is -not. URL-based and stdlib-only (no apimachinery), so seictl, the seitask Task -steps, and external integration harnesses share one readiness implementation +not. URL-based and stdlib-only (no apimachinery), so seictl and external +integration harnesses share one readiness implementation instead of bespoke bash. Inputs (endpoint URLs) come from whatever produced the resource — e.g. the CLI create command's stdout. @@ -89,6 +89,6 @@ authors them once. - **`provider.Provider` interface + `Register`/`Factory`.** The handle-based CRUD driver-registration contract. - **Object-label keys** `sei.io/role=node`, `sei.io/seinetwork=`. The - fleet-wide selector contract shared with seictl, seitask, chaos selectors. + fleet-wide selector contract shared with seictl + chaos selectors. - **SSA FieldOwner `sei-sdk`.** A distinct field manager. Renaming it orphans field ownership on objects the SDK already created. diff --git a/sdk/sei/labels.go b/sdk/sei/labels.go index 678748f3..32cee7ae 100644 --- a/sdk/sei/labels.go +++ b/sdk/sei/labels.go @@ -1,11 +1,10 @@ package sei // Object-label / peer-wiring producer contract — the canonical single source of -// truth. The controller's seitask and seictl copies are unexported (internal/), -// so the SDK authors these once here. Changing a value is a fleet-wide breaking -// change: chaos selectors, follower-discovery queries (node list -// -l sei.io/seinetwork=,sei.io/role=node), and seictl all match the exact -// literals. +// truth. seictl's copy is unexported (internal/), so the SDK authors these once +// here. Changing a value is a fleet-wide breaking change: chaos selectors, +// follower-discovery queries (node list -l sei.io/seinetwork=,sei.io/role=node), +// and seictl all match the exact literals. const ( // LabelRole keys the role an object plays in a network. LabelRole = "sei.io/role" @@ -20,6 +19,6 @@ const ( ) // FieldOwner is the SSA field manager the SDK applies under — a distinct writer -// from seictl ("seictl") and seitask ("seitask-provision-node"). Stable: renaming -// it orphans field ownership on objects the SDK already created. +// from seictl ("seictl"). Stable: renaming it orphans field ownership on objects +// the SDK already created. const FieldOwner = "sei-sdk" diff --git a/sdk/sei/provider/k8s/render.go b/sdk/sei/provider/k8s/render.go index 1fab481e..b1e30d3d 100644 --- a/sdk/sei/provider/k8s/render.go +++ b/sdk/sei/provider/k8s/render.go @@ -13,8 +13,7 @@ import ( "github.com/sei-protocol/sei-k8s-controller/sdk/sei" ) -// fieldOwner is the SDK's SSA field manager. A distinct writer from -// seictl/seitask. +// fieldOwner is the SDK's SSA field manager. A distinct writer from seictl. const fieldOwner client.FieldOwner = sei.FieldOwner // renderNetwork builds the SeiNetwork from a NetworkSpec. ChainID is not a spec diff --git a/sdk/sei/readiness.go b/sdk/sei/readiness.go index c594e2f7..abc241bd 100644 --- a/sdk/sei/readiness.go +++ b/sdk/sei/readiness.go @@ -14,8 +14,8 @@ import ( // Readiness probes are the generally-useful chain-provisioning lifecycle piece: // "the node has joined consensus and is actually serving," not merely "the pod is // Running." They are mode-agnostic — they take a published endpoint URL and speak -// HTTP, so the k8s/local/docker providers, the seitask Task steps, and external -// harnesses all share one implementation. Kept stdlib-only (no apimachinery) so +// HTTP, so the k8s/local/docker providers and external callers all share one +// implementation. Kept stdlib-only (no apimachinery) so // the core package stays dependency-free for lightweight external consumers. // probeInterval is the readiness poll cadence; a var so tests can shrink it diff --git a/test/integration/upgrade_test.go b/test/integration/upgrade_test.go index 04ef83e6..08e463a7 100644 --- a/test/integration/upgrade_test.go +++ b/test/integration/upgrade_test.go @@ -77,10 +77,10 @@ const restUnreachable = "REST unreachable / non-200" // task surface: provision a 4-validator chain on the pre-upgrade image -> submit a // GovSoftwareUpgrade proposal -> resolve its ID from the chain's gov REST // (chain-as-medium, since the controller does not surface it as a task output) -> -// vote yes from every validator -> wait -// for it to pass -> let the chain halt at the upgrade height -> bump the -// SeiNetwork image to the post-upgrade build -> assert the upgrade handler ran -// and every validator resumed past the upgrade height. +// vote yes from every validator -> wait for it to pass -> let the chain halt at +// the upgrade height -> bump the SeiNetwork image to the post-upgrade build -> +// assert the upgrade handler ran and every validator resumed past the upgrade +// height. // // The upgrade height is scheduled far enough ahead (defaultUpgradeHeightDelta) // that the proposal passes before the chain reaches it; the upgrade-applied check