diff --git a/AGENTS.md b/AGENTS.md index 8e5fb3f..43672eb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,320 +1,173 @@ -# cluster-api-provider-cloudscale - AI Agent Guide - -## Project Structure - -**Single-group layout (default):** -``` -cmd/main.go Manager entry (registers controllers/webhooks) -api//*_types.go CRD schemas (+kubebuilder markers) -api//zz_generated.* Auto-generated (DO NOT EDIT) -internal/controller/* Reconciliation logic -internal/webhook/* Validation/defaulting (if present) -config/crd/bases/* Generated CRDs (DO NOT EDIT) -config/rbac/role.yaml Generated RBAC (DO NOT EDIT) -config/samples/* Example CRs (edit these) -Makefile Build/test/deploy commands -PROJECT Kubebuilder metadata Auto-generated (DO NOT EDIT) -``` - -**Multi-group layout** (for projects with multiple API groups): -``` -api///*_types.go CRD schemas by group -internal/controller//* Controllers by group -internal/webhook///* Webhooks by group and version (if present) -``` - -Multi-group layout organizes APIs by group name (e.g., `batch`, `apps`). Check the `PROJECT` file for `multigroup: true`. - -**To convert to multi-group layout:** -1. Run: `kubebuilder edit --multigroup=true` -2. Move APIs: `mkdir -p api/ && mv api/ api//` -3. Move controllers: `mkdir -p internal/controller/ && mv internal/controller/*.go internal/controller//` -4. Move webhooks (if present): `mkdir -p internal/webhook/ && mv internal/webhook/ internal/webhook//` -5. Update import paths in all files -6. Fix `path` in `PROJECT` file for each resource -7. 
Update test suite CRD paths (add one more `..` to relative paths) - -## Critical Rules - -### Never Edit These (Auto-Generated) -- `config/crd/bases/*.yaml` - from `make manifests` -- `config/rbac/role.yaml` - from `make manifests` -- `config/webhook/manifests.yaml` - from `make manifests` -- `**/zz_generated.*.go` - from `make generate` -- `PROJECT` - from `kubebuilder [OPTIONS]` - -### Never Remove Scaffold Markers -Do NOT delete `// +kubebuilder:scaffold:*` comments. CLI injects code at these markers. - -### Keep Project Structure -Do not move files around. The CLI expects files in specific locations. - -### Always Use CLI Commands -Always use `kubebuilder create api` and `kubebuilder create webhook` to scaffold. Do NOT create files manually. - -### E2E Tests Require an Isolated Kind Cluster -The e2e tests are designed to validate the solution in an isolated environment (similar to GitHub Actions CI). -Ensure you run them against a dedicated [Kind](https://kind.sigs.k8s.io/) cluster (not your “real” dev/prod cluster). - -## After Making Changes - -**After editing `*_types.go` or markers:** -``` -make manifests # Regenerate CRDs/RBAC from markers -make generate # Regenerate DeepCopy methods -``` - -**After editing `*.go` files:** -``` -make lint-fix # Auto-fix code style -make test # Run unit tests -``` - -## CLI Commands Cheat Sheet - -### Create API (your own types) -```bash -kubebuilder create api --group --version --kind -``` - -### Deploy Image Plugin (scaffold to deploy/manage ANY container image) - -Generate a controller that deploys and manages a container image (nginx, redis, memcached, your app, etc.): - -```bash -# Example: deploying memcached -kubebuilder create api --group example.com --version v1alpha1 --kind Memcached \ - --image=memcached:alpine \ - --plugins=deploy-image.go.kubebuilder.io/v1-alpha -``` - -Scaffolds good-practice code: reconciliation logic, status conditions, finalizers, RBAC. Use as a reference implementation. 
- - -### Create Webhooks -```bash -# Validation + defaulting -kubebuilder create webhook --group --version --kind \ - --defaulting --programmatic-validation - -# Conversion webhook (for multi-version APIs) -kubebuilder create webhook --group --version v1 --kind \ - --conversion --spoke v2 -``` - -### Controller for Core Kubernetes Types -```bash -# Watch Pods -kubebuilder create api --group core --version v1 --kind Pod \ - --controller=true --resource=false - -# Watch Deployments -kubebuilder create api --group apps --version v1 --kind Deployment \ - --controller=true --resource=false -``` - -### Controller for External Types (e.g., from other operators) - -Watch resources from external APIs (cert-manager, Argo CD, Istio, etc.): - -```bash -# Example: watching cert-manager Certificate resources -kubebuilder create api \ - --group cert-manager --version v1 --kind Certificate \ - --controller=true --resource=false \ - --external-api-path=github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1 \ - --external-api-domain=io \ - --external-api-module=github.com/cert-manager/cert-manager -``` - -**Note:** Use `--external-api-module=@` only if you need a specific version. Otherwise, omit `@` to use what's in go.mod. - -### Webhook for External Types - -```bash -# Example: validating external resources -kubebuilder create webhook \ - --group cert-manager --version v1 --kind Issuer \ - --defaulting \ - --external-api-path=github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1 \ - --external-api-domain=io \ - --external-api-module=github.com/cert-manager/cert-manager -``` - -## Testing & Development - -```bash -make test # Run unit tests (uses envtest: real K8s API + etcd) -make run # Run locally (uses current kubeconfig context) -``` - -Tests use **Ginkgo + Gomega** (BDD style). Check `suite_test.go` for setup. - -## Deployment Workflow - -```bash -# 1. Regenerate manifests -make manifests generate - -# 2. 
Build & deploy -export IMG=/:tag -make docker-build docker-push IMG=$IMG # Or: kind load docker-image $IMG --name -make deploy IMG=$IMG - -# 3. Test -kubectl apply -k config/samples/ - -# 4. Debug -kubectl logs -n -system deployment/-controller-manager -c manager -f -``` - -### API Design - -**Key markers for** `api//*_types.go`: - -```go -// +kubebuilder:object:root=true -// +kubebuilder:subresource:status -// +kubebuilder:resource:scope=Namespaced -// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=".status.conditions[?(@.type=='Ready')].status" - -// On fields: -// +kubebuilder:validation:Required -// +kubebuilder:validation:Minimum=1 -// +kubebuilder:validation:MaxLength=100 -// +kubebuilder:validation:Pattern="^[a-z]+$" -// +kubebuilder:default="value" -``` - -- **Use** `metav1.Condition` for status (not custom string fields) -- **Use predefined types**: `metav1.Time` instead of `string` for dates -- **Follow K8s API conventions**: Standard field names (`spec`, `status`, `metadata`) - -### Controller Design - -**RBAC markers in** `internal/controller/*_controller.go`: - -```go -// +kubebuilder:rbac:groups=mygroup.example.com,resources=mykinds,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=mygroup.example.com,resources=mykinds/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=mygroup.example.com,resources=mykinds/finalizers,verbs=update -// +kubebuilder:rbac:groups=events.k8s.io,resources=events,verbs=create;patch -// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete -``` - -**Implementation rules:** -- **Idempotent reconciliation**: Safe to run multiple times -- **Re-fetch before updates**: `r.Get(ctx, req.NamespacedName, obj)` before `r.Update` to avoid conflicts -- **Structured logging**: `log := log.FromContext(ctx); log.Info("msg", "key", val)` -- **Owner references**: Enable automatic garbage collection (`SetControllerReference`) -- **Watch secondary 
resources**: Use `.Owns()` or `.Watches()`, not just `RequeueAfter` -- **Finalizers**: Clean up external resources (buckets, VMs, DNS entries) - -### Logging - -**Follow Kubernetes logging message style guidelines:** - -- Start from a capital letter -- Do not end the message with a period -- Active voice: subject present (`"Deployment could not create Pod"`) or omitted (`"Could not create Pod"`) -- Past tense: `"Could not delete Pod"` not `"Cannot delete Pod"` -- Specify object type: `"Deleted Pod"` not `"Deleted"` -- Balanced key-value pairs - -```go -log.Info("Starting reconciliation") -log.Info("Created Deployment", "name", deploy.Name) -log.Error(err, "Failed to create Pod", "name", name) -``` - -**Reference:** https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md#message-style-guidelines - -### Webhooks -- **Create all types together**: `--defaulting --programmatic-validation --conversion` -- **When`--force`is used**: Backup custom logic first, then restore after scaffolding -- **For multi-version APIs**: Use hub-and-spoke pattern (`--conversion --spoke v2`) - - Hub version: Usually oldest stable version (v1) - - Spoke versions: Newer versions that convert to/from hub (v2, v3) - - Example: `--group crew --version v1 --kind Captain --conversion --spoke v2` (v1 is hub, v2 is spoke) - -### Learning from Examples - -The **deploy-image plugin** scaffolds a complete controller following good practices. Use it as a reference implementation: - -```bash -kubebuilder create api --group example --version v1alpha1 --kind MyApp \ - --image= --plugins=deploy-image.go.kubebuilder.io/v1-alpha -``` - -Generated code includes: status conditions (`metav1.Condition`), finalizers, owner references, events, idempotent reconciliation. 
- -## Distribution Options - -### Option 1: YAML Bundle (Kustomize) - -```bash -# Generate dist/install.yaml from Kustomize manifests -make build-installer IMG=/:tag -``` - -**Key points:** -- The `dist/install.yaml` is generated from Kustomize manifests (CRDs, RBAC, Deployment) -- Commit this file to your repository for easy distribution -- Users only need `kubectl` to install (no additional tools required) - -**Example:** Users install with a single command: -```bash -kubectl apply -f https://raw.githubusercontent.com////dist/install.yaml -``` - -### Option 2: Helm Chart - -```bash -kubebuilder edit --plugins=helm/v2-alpha # Generates dist/chart/ (default) -kubebuilder edit --plugins=helm/v2-alpha --output-dir=charts # Generates charts/chart/ -``` - -**For development:** -```bash -make helm-deploy IMG=/: # Deploy manager via Helm -make helm-deploy IMG=$IMG HELM_EXTRA_ARGS="--set ..." # Deploy with custom values -make helm-status # Show release status -make helm-uninstall # Remove release -make helm-history # View release history -make helm-rollback # Rollback to previous version -``` - -**For end users/production:** -```bash -helm install my-release .//chart/ --namespace --create-namespace -``` - -**Important:** If you add webhooks or modify manifests after initial chart generation: -1. Backup any customizations in `/chart/values.yaml` and `/chart/manager/manager.yaml` -2. Re-run: `kubebuilder edit --plugins=helm/v2-alpha --force` (use same `--output-dir` if customized) -3. Manually restore your custom values from the backup - -### Publish Container Image - -```bash -export IMG=/: -make docker-build docker-push IMG=$IMG -``` +# AGENTS.md — CAPCS-specific guidance for AI agents + +This file is a supplement for AI agents working on CAPCS +(cluster-api-provider-cloudscale). For architecture, setup, and the +day-to-day developer flow, read [`docs/development.md`](docs/development.md) +first. 
This file covers the rules and conventions that are most +load-bearing for code changes. + +Related docs: + +- [`docs/development.md`](docs/development.md) — architecture, Tilt, test layers +- [`CONTRIBUTING.md`](CONTRIBUTING.md) — PR flow, required local checks +- [`docs/getting-started.md`](docs/getting-started.md) — end-user workflow +- [`docs/troubleshooting.md`](docs/troubleshooting.md) — common failure modes + +## Do not edit (auto-generated) + +- `config/crd/bases/*.yaml`, `config/rbac/role.yaml`, + `config/webhook/manifests.yaml` — regenerated by `make manifests`. +- `**/zz_generated.*.go` — regenerated by `make generate`. +- `PROJECT` — owned by kubebuilder. +- Leave `// +kubebuilder:scaffold:*` markers in place; the CLI injects code + at them. + +## What to run after a change + +| You touched | Run | +|-------------------------------------------------|--------------------------------------------------------------------------------------| +| `api/v1beta2/*_types.go` or kubebuilder markers | `make manifests generate` | +| Any `*.go` | `make lint-fix && make test` | +| A reconciler or a template under `templates/` | `make test-e2e-lifecycle` before opening a PR ([`CONTRIBUTING.md`](CONTRIBUTING.md)) | + +**Webhooks own all defaulting and validation.** Never duplicate webhook +logic in a controller — behaviour must be identical between `kubectl apply` +and the reconcile loop. + +## Use TDD + +Write code red → green: a failing test first, then the minimal change that +turns it green. This applies to new behaviour, bug fixes, and refactors +that change observable behaviour. Tests live next to the code they exercise +(`*_test.go`); the cloudscale API is mocked through the service interfaces +in `internal/cloudscale/`. 
+ +## Where things live + +| CRD | Types | Controller | Webhook | +|-----------------------------|--------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| +| `CloudscaleCluster` | `api/v1beta2/cloudscalecluster_types.go` | `internal/controller/cloudscalecluster_controller.go` + `cloudscalecluster_{network,loadbalancer,floatingip,servergroup}.go` | `internal/webhook/v1beta2/cloudscalecluster_webhook.go` | +| `CloudscaleClusterTemplate` | `api/v1beta2/cloudscaleclustertemplate_types.go` | *(none — handled by the CAPI topology controller)* | `internal/webhook/v1beta2/cloudscaleclustertemplate_webhook.go` | +| `CloudscaleMachine` | `api/v1beta2/cloudscalemachine_types.go` | `internal/controller/cloudscalemachine_controller.go` + `cloudscalemachine_server.go` | `internal/webhook/v1beta2/cloudscalemachine_webhook.go` + `cloudscalemachine_validation.go` | +| `CloudscaleMachineTemplate` | `api/v1beta2/cloudscalemachinetemplate_types.go` | `internal/controller/cloudscalemachinetemplate_controller.go` | `internal/webhook/v1beta2/cloudscalemachinetemplate_webhook.go` | + +Shared infrastructure: + +- `internal/cloudscale/{client,services}.go` — SDK wrapper, service interfaces, shared HTTP transport +- `internal/scope/{cluster,machine}.go` — reconciliation scope objects +- `internal/credentials/` — resolves the per-cluster API token from `credentialsRef` +- `api/v1beta2/condition_types.go` — condition type and reason constants +- `api/v1beta2/tags.go`, `internal/controller/cloudscale_tags.go` — ownership tagging + +For the prose architecture sketch see [`docs/development.md`](docs/development.md#architecture-sketch). + +## Reconciler conventions + +- **Scope pattern.** Build a `scope.ClusterScope` / `scope.MachineScope` at + the top of `Reconcile`. 
It bundles the client, logger, CAPI `Cluster`, the + CAPCS object, a `patch.Helper`, and the cloudscale client. Defer + `scope.Close()` **with a fresh context** — the reconcile context may + already be timed out by the time status persistence runs. See + `internal/controller/cloudscalecluster_controller.go`. +- **Reconcile shape:** get CAPI owner → check pause + (`util/annotations.IsPaused`) → load credentials → build scope → defer + close → branch on deletion timestamp → add finalizer → call + `reconcileNormal` / `reconcileDelete`. +- **Finalizer constants** live next to the types + (`api/v1beta2/cloudscalecluster_types.go`, look for `ClusterFinalizer`). +- **Conditions.** Set sub-conditions (`NetworkReadyCondition`, + `LoadBalancerReadyCondition`, `FloatingIPReadyCondition`, + `ServerReadyCondition`, `ServerGroupReadyCondition`) that roll up into + `ReadyCondition`. Full list with reason constants: + `api/v1beta2/condition_types.go`. +- **Requeue intervals.** Return `ctrl.Result{}` on steady state. For + transient waits, reuse existing intervals — 5 s + (`ServerStatusPollInterval`, `cloudscalemachine_controller.go:51`) for + server status polling, 10 s for "still draining" waits + (`cloudscalecluster_controller.go:207-214`). Don't invent new ones. +- **Ownership tag.** Every cloudscale resource we create is tagged with + `capcs-cluster-: owned` via + `internal/controller/cloudscale_tags.go`. Reuse `clusterOwnershipTags`; + do not invent a parallel ownership scheme. + +## Webhook conventions + +- Use `CustomDefaulter` + `CustomValidator` and wire both with + `ctrl.NewWebhookManagedBy(...).WithValidator(...).WithDefaulter(...)` — + see `internal/webhook/v1beta2/cloudscalecluster_webhook.go:40`. +- Defaulters and validators get a `*cloudscale.RegionInfo` injected; use it + for region/zone/flavor lookups instead of hitting the API live. +- Validators enforce immutability. 
Existing examples: region, zone, network + config, LB `Enabled` flag, FIP managed/pre-existing switch. When you add + a new spec field, decide whether it is immutable up front and add the + check here, not in the controller. +- `CloudscaleClusterTemplate` reuses `clusterSpecDefault` and + `clusterSpecValidateCreate` from `cloudscalecluster_webhook.go` (see + `cloudscaleclustertemplate_webhook.go:59, 79`). A new default or validator + on `CloudscaleClusterSpec` automatically applies to both CRDs — do not fork + the logic. +- The e2e overlay `test/infrastructure/cloudscale/clusterclass-quick-start/cluster-class.yaml` + intentionally duplicates `templates/cluster-class.yaml`. Keep them in sync, + but do not attempt to merge or deduplicate them — both copies are required. + +## Cloudscale SDK usage + +- Do not `import "github.com/cloudscale-ch/cloudscale-go-sdk/v9"` outside + `internal/cloudscale/`. Controllers and webhooks talk to the SDK through + the service interfaces on `cloudscale.Client` + (`internal/cloudscale/client.go:32`). +- The shared `*http.Transport` (`internal/cloudscale/client.go:62`) is + created once per manager. Do not build new transports per reconciliation. +- Use the error helpers — `IsNotFound`, `IsFloatingIPNoPublicInterface`, + `IsTimeoutError` (`internal/cloudscale/client.go:121-144`) — instead of + string-matching on error messages. +- Wrap SDK calls with `context.WithTimeout` using the constants in + `client.go`: `ReadTimeout` (1 s), `WriteTimeout` (2 m), `DeleteTimeout` + (2 s). Existing controllers already do this; match the pattern. + +## API design rules + +- Standard CRD markers on infrastructure types: + ```go + // +kubebuilder:object:root=true + // +kubebuilder:subresource:status + // +kubebuilder:resource:path=,scope=Namespaced,categories=cluster-api + ``` + The `categories=cluster-api` bit is what makes `kubectl get cluster-api` + include the resource — keep it on any new infrastructure CRD. +- Print columns. 
A new infrastructure CRD should expose at minimum + `Cluster` (from the `cluster.x-k8s.io/cluster-name` label) and + `Provisioned` (from `.status.initialization.provisioned`), plus one or + two type-specific columns. Pattern: + `api/v1beta2/cloudscalecluster_types.go:317-323`. +- Implement `conditions.Setter` from + `sigs.k8s.io/cluster-api/util/conditions` so CAPI tooling can read status + uniformly. See + `api/v1beta2/cloudscalecluster_types.go:344-355`. +- Prefer kubebuilder validation markers (`Required`, `Enum`, `Pattern`, + `MinLength`, `MaxLength`, `Minimum`, `Maximum`, `default`) over webhook + checks where a marker suffices — they generate OpenAPI schema and are + enforced by the API server. + +## Multi-version / multi-group + +CAPCS is single-group, single-version (`infrastructure.cluster.x-k8s.io/v1beta2`). +If we ever introduce `v1beta3`, use +`kubebuilder create webhook --conversion --spoke v1beta3` and follow the +[Kubebuilder Book](https://book.kubebuilder.io) — do not roll a custom +conversion scheme. + +## Logging style + +Follow +the [Kubernetes message style guide](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md#message-style-guidelines): +capitalised start, no trailing period, past tense, name the object type +(`"Created FloatingIP"`, not `"created"`), balanced key/value pairs. Use +`log.FromContext(ctx)` (or pass the logger through the scope). ## References -### Essential Reading -- **Kubebuilder Book**: https://book.kubebuilder.io (comprehensive guide) -- **controller-runtime FAQ**: https://github.com/kubernetes-sigs/controller-runtime/blob/main/FAQ.md (common patterns and questions) -- **Good Practices**: https://book.kubebuilder.io/reference/good-practices.html (why reconciliation is idempotent, status conditions, etc.) 
-- **Logging Conventions**: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md#message-style-guidelines (message style, verbosity levels) - -### API Design & Implementation -- **API Conventions**: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md -- **Operator Pattern**: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/ -- **Markers Reference**: https://book.kubebuilder.io/reference/markers.html - -### Tools & Libraries -- **controller-runtime**: https://github.com/kubernetes-sigs/controller-runtime -- **controller-tools**: https://github.com/kubernetes-sigs/controller-tools -- **Kubebuilder Repo**: https://github.com/kubernetes-sigs/kubebuilder +- [Kubebuilder Book](https://book.kubebuilder.io) +- [Cluster API Book](https://cluster-api.sigs.k8s.io) +- [controller-runtime FAQ](https://github.com/kubernetes-sigs/controller-runtime/blob/main/FAQ.md) +- [cloudscale-go-sdk](https://github.com/cloudscale-ch/cloudscale-go-sdk) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..1ac9d2c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing + +Thanks for your interest in improving CAPCS. + +## Issues + +File bugs and feature requests in the +[GitHub issue tracker](https://github.com/cloudscale-ch/cluster-api-provider-cloudscale/issues). +Please include the CAPCS version, the Kubernetes version of your management +cluster, and the relevant CRD YAML when reporting a bug. If you are unsure +whether a problem is a bug, open an issue anyway — it is easier to redirect +than to discover later. + +## Pull requests + +1. Fork the repository and create a feature branch off `main`. +2. Make your change. Tests live next to the code; new behavior needs a test. +3. Run `make test` and `make lint` locally. +4. 
For changes that touch reconcilers or templates, run at least + `make test-e2e-lifecycle` against a cloudscale project. Reviewers may + trigger a broader suite via the manual workflow — see + [Running E2E on a PR](docs/development.md#running-e2e-on-a-pr). +5. Open a PR against `main`. Keep the title short and the description + focused on the *why*. + +Commit messages loosely follow +[Conventional Commits](https://www.conventionalcommits.org/) (`feat:`, `fix:`, +`chore:`, `docs:`). Match the style of recent commits. + +## Development setup + +See [docs/development.md](docs/development.md) for architecture, Tilt setup, +test layers, and make targets. + +## Questions + +Open an issue — there is no separate chat channel. diff --git a/Dockerfile b/Dockerfile index 5b59f51..029dfdc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM golang:1.26 AS builder ARG TARGETOS ARG TARGETARCH +ARG VERSION=dev WORKDIR /workspace # Copy the Go Modules manifests @@ -19,7 +20,7 @@ COPY . . # was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -ldflags "-X main.version=${VERSION}" -a -o manager cmd/main.go # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details diff --git a/Makefile b/Makefile index 46826d5..32d71e9 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ TAG ?= dev IMG ?= quay.io/cloudscalech/capcs-staging:$(TAG) # YEAR defines the year value used for substituting the YEAR placeholder in the boilerplate header. 
YEAR ?= $(shell date +%Y) +LDFLAGS ?= -X main.version=$(TAG) # E2E image configuration E2E_TAG ?= e2e-$(shell git rev-parse --short HEAD) @@ -287,18 +288,18 @@ test-e2e-conformance-fast: $(GINKGO) generate-e2e-templates generate-e2e-config .PHONY: build build: manifests generate fmt vet ## Build manager binary. - go build -o bin/manager cmd/main.go + go build -ldflags '$(LDFLAGS)' -o bin/manager cmd/main.go .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/main.go + go run -ldflags '$(LDFLAGS)' ./cmd/main.go # If you wish to build the manager image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build docker-build: ## Build docker image with the manager. - $(CONTAINER_TOOL) build --platform linux/amd64 -t ${IMG} . + $(CONTAINER_TOOL) build --platform linux/amd64 --build-arg VERSION=$(TAG) -t ${IMG} . .PHONY: docker-push docker-push: ## Push docker image with the manager. @@ -321,7 +322,7 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross - $(CONTAINER_TOOL) buildx create --name cluster-api-provider-cloudscale-builder $(CONTAINER_TOOL) buildx use cluster-api-provider-cloudscale-builder - - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . + - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --build-arg VERSION=$(TAG) --tag ${IMG} -f Dockerfile.cross . 
- $(CONTAINER_TOOL) buildx rm cluster-api-provider-cloudscale-builder rm Dockerfile.cross diff --git a/README.md b/README.md index abbd79d..dbab340 100644 --- a/README.md +++ b/README.md @@ -2,197 +2,83 @@ [![Tests](https://github.com/cloudscale-ch/cluster-api-provider-cloudscale/actions/workflows/test.yml/badge.svg)](https://github.com/cloudscale-ch/cluster-api-provider-cloudscale/actions/workflows/test.yml) [![Release](https://img.shields.io/github/v/release/cloudscale-ch/cluster-api-provider-cloudscale)](https://github.com/cloudscale-ch/cluster-api-provider-cloudscale/releases/latest) +[![Go Reference](https://pkg.go.dev/badge/github.com/cloudscale-ch/cluster-api-provider-cloudscale.svg)](https://pkg.go.dev/github.com/cloudscale-ch/cluster-api-provider-cloudscale) +[![Goreportcard](https://goreportcard.com/badge/github.com/cloudscale-ch/cluster-api-provider-cloudscale)](https://goreportcard.com/report/github.com/cloudscale-ch/cluster-api-provider-cloudscale) +[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/cloudscale-ch/cluster-api-provider-cloudscale) Kubernetes [Cluster API](https://cluster-api.sigs.k8s.io/) infrastructure provider -for [cloudscale.ch](https://www.cloudscale.ch). +for [cloudscale.ch](https://www.cloudscale.ch). CAPCS provisions the cloudscale-specific +infrastructure — servers, networks, load balancers, floating IPs, server groups — +that Cluster API uses to build and manage workload Kubernetes clusters. + +New to Cluster API? Read the upstream +[concepts](https://cluster-api.sigs.k8s.io/user/concepts.html) and +[quick start](https://cluster-api.sigs.k8s.io/user/quick-start.html) first; this +project only documents what is cloudscale-specific. 
## Features -- **CloudscaleCluster**: Multi-network management (managed or pre-existing), Load Balancer (public or private VIP), - Floating IP - support -- **CloudscaleMachine**: Server provisioning with cloud-init and configurable network interfaces -- **CloudscaleMachineTemplate**: Immutable machine templates for KubeadmControlPlane/MachineDeployment +- Managed or pre-existing networks; public or private load balancer VIPs; + floating IPs (IPv4/IPv6); anti-affinity server groups +- HA control plane; `MachineDeployment` autoscaling including + [scale-from-zero](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/autoscaling) + via capacity reported on `CloudscaleMachineTemplate` +- [ClusterClass](https://cluster-api.sigs.k8s.io/tasks/experimental-features/cluster-class/) support + +## Compatibility + +### Cluster-API Versions + +Currently, CAPCS requires CAPI version >= v1.13.0 and is compatible only with the v1beta2 CRD versions of CAPI. + +### Kubernetes Versions + +The cloudscale provider is able to install and manage +the [versions of Kubernetes supported by the Cluster API (CAPI) project](https://cluster-api.sigs.k8s.io/reference/versions.html#supported-versions-matrix-by-provider-or-component). ## Prerequisites -- A Kubernetes cluster to use as a management cluster ([kind](https://kind.sigs.k8s.io/) works) -- [clusterctl](https://cluster-api.sigs.k8s.io/user/quick-start#install-clusterctl) -- A [cloudscale.ch](https://www.cloudscale.ch) account and API token -- A custom image imported into cloudscale. Images can e.g. be generated - using [image-builder Openstack](https://image-builder.sigs.k8s.io/) +- cloudscale.ch account and API token +- A custom OS image imported into your cloudscale.ch project, e.g. 
built with + [image-builder for OpenStack](https://image-builder.sigs.k8s.io/) +- A management Kubernetes cluster ([kind](https://kind.sigs.k8s.io/) works) and + [clusterctl](https://cluster-api.sigs.k8s.io/user/quick-start#install-clusterctl) ## Quickstart -### Initialize the management cluster +This quickstart assumes you already know how Cluster-API works and have the prerequisites ready to use. +For a more detailed introduction, please read [our getting started guide](docs/getting-started.md). ```bash export CLOUDSCALE_API_TOKEN= +# initialize the Cluster-API management controllers clusterctl init --infrastructure cloudscale-ch-cloudscale -``` - -### Generate and apply a workload cluster - -Set the [required environment variables](#environment-variables), then generate and apply the cluster manifest: -```bash +# Generate and apply the cluster definition clusterctl generate cluster my-cluster \ - --infrastructure cloudscale-ch-cloudscale \ - --kubernetes-version v1.36.0 \ - --control-plane-machine-count 1 \ - --worker-machine-count 2 \ + --infrastructure cloudscale-ch-cloudscale --kubernetes-version v1.36.0 \ + --control-plane-machine-count 1 --worker-machine-count 2 \ | kubectl apply -f - -``` - -This uses the default template (public nodes, managed network). See [Cluster Templates](#cluster-templates) for other -network topologies. 
- -Watch the cluster come up: -```bash +# Describe the status of the cluster clusterctl describe cluster my-cluster ``` -## Environment Variables - -| Variable | Description | Example | -|-------------------------------------------|-----------------------------------------|-----------------------------------| -| `CLOUDSCALE_API_TOKEN` | cloudscale.ch API token | `abc123...` | -| `CLOUDSCALE_SSH_PUBLIC_KEY` | SSH public key added to nodes | `ssh-ed25519 AAAA...` | -| `CLOUDSCALE_REGION` | cloudscale.ch region | `lpg` or `rma` | -| `CLOUDSCALE_MACHINE_IMAGE` | Server image for nodes | `custom:ubuntu-2404-kube-v1.xx.x` | -| `CLOUDSCALE_CONTROL_PLANE_MACHINE_FLAVOR` | Flavor for control plane nodes | `flex-4-2` | -| `CLOUDSCALE_WORKER_MACHINE_FLAVOR` | Flavor for worker nodes | `flex-4-2` | -| `CLOUDSCALE_ROOT_VOLUME_SIZE` | Root volume size in GB | `50` | -| `CLOUDSCALE_NETWORK_UUID` | Pre-Existing cloudscale.ch network UUID | `2db69ba3-...` | - -> **Note:** `CLOUDSCALE_NETWORK_UUID` is required by the `fip`, `public-lb-private-nodes`, and `pre-existing-network` -> template flavors. It is not needed for the default template. - -## Cluster Templates +The default template uses a managed network and a public load balancer. +[Getting Started](docs/getting-started.md) lists the required environment +variables and the other template flavors. -CAPCS ships several cluster templates for different network topologies. 
Use `clusterctl generate cluster` with the -`--flavor` flag to select one: +## Documentation -```bash -clusterctl generate cluster my-cluster \ - --infrastructure cloudscale-ch-cloudscale \ - --kubernetes-version v1.36.0 \ - --control-plane-machine-count 1 \ - --worker-machine-count 2 \ - --flavor \ - | kubectl apply -f - -``` - -| Flavor | Network | CP Endpoint | Node Connectivity | Extra Env Vars | Notes | -|---------------------------|---------------------------|-----------------------|-------------------|---------------------------|----------------------| -| *(default)* | Managed (`172.18.0.0/24`) | Public LB (DualStack) | Public + cluster | — | | -| `fip` | Pre-Existing | Floating IP (IPv4) | Public + cluster | `CLOUDSCALE_NETWORK_UUID` | | -| `public-lb-private-nodes` | Pre-Existing + NAT | Public LB | Private only | `CLOUDSCALE_NETWORK_UUID` | Requires NAT gateway | -| `pre-existing-network` | Pre-Existing | Public LB (DualStack) | Public + cluster | `CLOUDSCALE_NETWORK_UUID` | | - -The default `networks[].cidr` is `172.18.0.0/24` so it does not overlap with the default Cilium -cluster-pool IPAM range `10.0.0.0/8`. If you override `networks[].cidr` to a range inside -`10.0.0.0/8`, make sure to configure your CNI's IP range correctly. Overlapping -ranges may break for example control-plane LB's health checks. - -## Development - -This is a kubebuilder-scaffolded project. For new APIs, Webhooks, etc. [kubebuilder](https://book.kubebuilder.io/) -commands should be used. - -```bash -# Run tests -make test - -# Generate manifests -make manifests - -# Generate code -make generate - -# Run E2E tests (requires CLOUDSCALE_API_TOKEN) -make test-e2e -``` - -### E2E Tests - -E2E tests are built on the [CAPI e2e test framework](https://pkg.go.dev/sigs.k8s.io/cluster-api/test/e2e) -(Ginkgo-based) and provision real clusters on cloudscale.ch. 
Tests use Ginkgo labels for -filtering and are split into suites of increasing cost, scheduled accordingly: - -| Suite | Label | Description | ~Duration | Schedule | Make target | -|-------------------------|---------------------------|------------------------------------------------------------------------------------------|-----------|----------|------------------------------------| -| Lifecycle | `lifecycle` | 1 CP + 1 worker: create, validate cloudscale resources, delete | ~5 min | Nightly | `test-e2e-lifecycle` | -| HA lifecycle | `ha` | 3 CP + 2 workers with anti-affinity server groups | ~8 min | Weekly | `test-e2e-ha` | -| Cluster upgrade | `upgrade` | Rolling K8s version upgrade (v1.35 → v1.36) | ~25 min | Weekly | `test-e2e-upgrade` | -| Self-hosted | `self-hosted` | clusterctl move (pivot) to workload cluster. Requires container image in public registry | ~13 min | Weekly | `test-e2e-self-hosted` | -| MD remediation | `md-remediation` | MachineHealthCheck auto-replacement of unhealthy workers | ~6 min | Weekly | `test-e2e-md-remediation` | -| Pre-Existing networking | `pre-existing-networking` | Pre-Existing network: public-LB + private-nodes and floating-IP variants | ~30 min | Weekly | `test-e2e-pre-existing-networking` | -| Conformance (fast) | `conformance` | K8s conformance, skip Serial tests | ~55 min | Weekly | `test-e2e-conformance-fast` | -| Conformance (full) | `conformance` | Full K8s conformance including Serial tests | ~120 min | Biweekly | `test-e2e-conformance` | - -Durations are approximate from a real CI run; conformance varies with cluster size. - -**Why this split?** The single-CP lifecycle test is the cheapest smoke test and runs -nightly to catch regressions early. HA, upgrade, self-hosted, and remediation tests are more -resource-intensive and run weekly. Private networking tests require `CLOUDSCALE_NETWORK_UUID` to be set and are -skipped otherwise. Full K8s conformance is the most expensive and runs biweekly -(1st + 15th of month). 
All suites can be triggered manually via the `test-e2e.yml` workflow -dispatch. E2E tests share a concurrency group so only one suite runs at a time. - -Any run involving the self-hosted spec requires the container image to be published to our registry. The self-hosted -spec moves the management cluster to the first workload cluster. That workload cluster doesn't have access to the -locally -built images and therefore needs a published container image. - -For PRs, no e2e test is automatically run. It is advised to run them locally before submitting, as well as for a -reviewer -to run them locally and/or manually triggering the workflow **after** reviewing the code is safe. - -### Tilt - -The easiest way to work on this provider is by using the -[Tilt setup](https://cluster-api.sigs.k8s.io/developer/core/tilt.html) of Cluster-API. - -Refer to the linked documentation on how to set up your local tilt. This requires cloning -[Cluster-API core](https://github.com/kubernetes-sigs/cluster-api) to your host. The necessary commands need to be -executed in the -Cluster-API core repository (**not** in this repository). 
- -An example `tilt-settings.yaml`, which should also be placed in the Cluster-API core repository, is provided here: - -```yaml -default_registry: "" # change if you use a remote image registry -provider_repos: - # This refers to your provider directory and loads settings - # from `tilt-provider.yaml` - - path/to/local/clone/cluster-api-provider-cloudscale -enable_providers: - - cloudscale - - kubeadm-bootstrap - - kubeadm-control-plane -deploy_cert_manager: true -kustomize_substitutions: - CLOUDSCALE_API_TOKEN: "INSERT_TOKEN_HERE" - CLOUDSCALE_SSH_PUBLIC_KEY: "INSERT_SSH_PUBLIC_KEY_HERE" - CLOUDSCALE_REGION: "lpg" - CLOUDSCALE_CONTROL_PLANE_MACHINE_FLAVOR: "flex-4-2" - CLOUDSCALE_WORKER_MACHINE_FLAVOR: "flex-4-2" - CLOUDSCALE_MACHINE_IMAGE: "IMAGE_NAME" - CLOUDSCALE_ROOT_VOLUME_SIZE: "50" - # Required for pre-existing network flavors (fip, public-lb-private-nodes, pre-existing-network): - # CLOUDSCALE_NETWORK_UUID: "UUID_HERE" -extra_args: - cloudscale: - - "--zap-log-level=5" -template_dirs: - docker: - - ./test/infrastructure/docker/templates - cloudscale: - - path/to/local/clone/cluster-api-provider-cloudscale/templates -``` +| If you are… | Start here | +|-------------------------------------|----------------------------------------------------------------------------------------------------------------| +| New to Cluster API, or new to CAPCS | [Getting Started](docs/getting-started.md) | +| Looking up a CRD field | `kubectl explain cloudscalecluster.spec` (or the generated CRDs under [`config/crd/bases/`](config/crd/bases)) | +| Setting up monitoring or tracing | [Observability](docs/observability.md) | +| Hitting an error | [Troubleshooting](docs/troubleshooting.md) | +| Contributing to CAPCS | [Development](docs/development.md), [CONTRIBUTING.md](CONTRIBUTING.md) | +| Cutting a release | [Releasing](docs/releasing.md), [Testing releases](docs/testing-releases.md) | ## License diff --git a/api/v1beta2/cloudscalecluster_types.go 
b/api/v1beta2/cloudscalecluster_types.go index 2cd313e..1f713e6 100644 --- a/api/v1beta2/cloudscalecluster_types.go +++ b/api/v1beta2/cloudscalecluster_types.go @@ -41,13 +41,17 @@ const ( // CloudscaleClusterSpec defines the desired state of CloudscaleCluster type CloudscaleClusterSpec struct { - // Region is the cloudscale.ch region (e.g., "rma", "lpg"). + // Region is the cloudscale.ch region the cluster is provisioned in. + // Determines the default zone and the set of available flavors. + // Immutable after cluster creation. // +kubebuilder:validation:Required // +kubebuilder:validation:Enum=rma;lpg Region string `json:"region"` - // Zone is the cloudscale.ch zone (e.g., "rma1", "lpg1"). - // Defaults to region + "1" if not specified. + // Zone is the cloudscale.ch zone within Region. + // Defaults to Region + "1" (e.g., "rma1", "lpg1"). Set explicitly only when + // the region offers multiple zones and you need to pin the cluster to one. + // Immutable after cluster creation. // +optional Zone string `json:"zone,omitempty"` @@ -87,13 +91,16 @@ type CloudscaleClusterSpec struct { FloatingIP *FloatingIPSpec `json:"floatingIP,omitempty"` } -// CloudscaleCredentialsReference references a Secret containing the API token. +// CloudscaleCredentialsReference references a Secret holding the cloudscale.ch +// API token used to provision this cluster's infrastructure. The Secret must +// contain a key named "token" with the raw token string as its value. type CloudscaleCredentialsReference struct { // Name is the name of the Secret. // +kubebuilder:validation:Required Name string `json:"name"` - // Namespace is the namespace of the Secret. Defaults to the cluster namespace. + // Namespace is the namespace of the Secret. Defaults to the + // CloudscaleCluster's own namespace if unset. 
// +optional Namespace string `json:"namespace,omitempty"` } @@ -139,18 +146,24 @@ type LoadBalancerSpec struct { // +optional Enabled *bool `json:"enabled,omitempty"` - // Algorithm is the load balancing algorithm. + // Algorithm is the cloudscale.ch load-balancing algorithm. + // - "round_robin" (default): rotate requests across healthy backends. + // - "least_connections": send each request to the backend with the fewest active connections. + // - "source_ip": hash the client IP so the same client lands on the same backend. // +kubebuilder:validation:Enum=round_robin;least_connections;source_ip // +kubebuilder:default="round_robin" // +optional Algorithm string `json:"algorithm,omitempty"` - // Flavor is the load balancer flavor (size). + // Flavor is the cloudscale.ch load balancer flavor slug. Defaults to + // "lb-standard". // +kubebuilder:default="lb-standard" // +optional Flavor string `json:"flavor,omitempty"` - // APIServerPort is the port for the Kubernetes API server. + // APIServerPort is the LB listener port exposed for the Kubernetes API + // server. Defaults to 6443. The pool always targets the API server on the + // control plane nodes' 6443. // +kubebuilder:default=6443 // +kubebuilder:validation:Minimum=1 // +kubebuilder:validation:Maximum=65535 @@ -310,7 +323,9 @@ func (s *CloudscaleClusterStatus) GetNetworkStatus(name string) *NetworkStatus { // +kubebuilder:printcolumn:name="Region",type="string",JSONPath=".spec.region",description="cloudscale.ch region" // +kubebuilder:printcolumn:name="Endpoint",type="string",JSONPath=".spec.controlPlaneEndpoint.host",description="Control plane endpoint" -// CloudscaleCluster is the Schema for the cloudscaleclusters API +// CloudscaleCluster is the cloudscale.ch infrastructure for a CAPI Cluster. +// It owns the networks, control-plane load balancer, optional floating IP, and +// server groups that back the cluster's machines. 
type CloudscaleCluster struct { metav1.TypeMeta `json:",inline"` diff --git a/api/v1beta2/cloudscaleclustertemplate_types.go b/api/v1beta2/cloudscaleclustertemplate_types.go index adcb066..0d78ad0 100644 --- a/api/v1beta2/cloudscaleclustertemplate_types.go +++ b/api/v1beta2/cloudscaleclustertemplate_types.go @@ -21,22 +21,41 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" ) -// CloudscaleClusterTemplateSpec defines the desired state of CloudscaleClusterTemplate +// CloudscaleClusterTemplateSpec defines the desired state of CloudscaleClusterTemplate. type CloudscaleClusterTemplateSpec struct { + // Template is the embedded resource the CAPI topology controller stamps out + // into a CloudscaleCluster for each Cluster whose ClusterClass references + // this CloudscaleClusterTemplate. Template CloudscaleClusterTemplateResource `json:"template"` } -// CloudscaleClusterTemplateResource contains spec for CloudscaleClusterSpec. +// CloudscaleClusterTemplateResource describes the CloudscaleCluster that the +// topology controller materializes from this template. type CloudscaleClusterTemplateResource struct { + // ObjectMeta supplies labels and annotations that propagate to the + // generated CloudscaleCluster. The name/namespace fields are ignored: + // the topology controller derives those from the owning Cluster. // +optional - ObjectMeta clusterv1.ObjectMeta `json:"metadata,omitempty"` - Spec CloudscaleClusterSpec `json:"spec"` + ObjectMeta clusterv1.ObjectMeta `json:"metadata,omitempty"` + + // Spec embeds CloudscaleClusterSpec verbatim and shares the same defaulting + // and validation logic via the cluster webhook helpers + // (clusterSpecDefault / clusterSpecValidateCreate). + // Immutable after creation; override per-cluster fields via + // spec.topology.variables on the Cluster instead of mutating this spec. 
+ Spec CloudscaleClusterSpec `json:"spec"` } // +kubebuilder:object:root=true // +kubebuilder:resource:path=cloudscaleclustertemplates,scope=Namespaced,categories=cluster-api -// CloudscaleClusterTemplate is the Schema for the cloudscaleclustertemplates API +// CloudscaleClusterTemplate is a template embedded in a ClusterClass that the +// CAPI topology controller uses to materialize a CloudscaleCluster for every +// Cluster whose spec.topology.classRef resolves to a ClusterClass referencing +// this object. Unlike CloudscaleCluster, this CRD has no controller and no +// status — it is consumed only at Cluster creation time by CAPI core. +// Its spec is immutable after creation (enforced by the validating webhook); +// per-cluster overrides go through ClusterClass variables, not template edits. type CloudscaleClusterTemplate struct { metav1.TypeMeta `json:",inline"` diff --git a/api/v1beta2/cloudscalemachine_types.go b/api/v1beta2/cloudscalemachine_types.go index d4eb98c..056e36a 100644 --- a/api/v1beta2/cloudscalemachine_types.go +++ b/api/v1beta2/cloudscalemachine_types.go @@ -34,22 +34,34 @@ type CloudscaleMachineSpec struct { // +optional ProviderID *string `json:"providerID,omitempty"` - // Flavor is the cloudscale.ch server flavor (e.g., "flex-8-4"). + // Flavor is the cloudscale.ch server flavor slug, e.g. "flex-4-2" or + // "plus-8-4". List available flavors via the cloudscale API + // (`GET /v1/flavors`) or the control panel. + // Immutable after machine creation. // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 Flavor string `json:"flavor"` - // Image is the OS image slug (e.g., "ubuntu-24.04"), custom image slug (e.g., "custom:ubuntu-foo"), or custom image UUID. + // Image identifies the OS image used to boot the server. One of: + // - a public image slug (e.g. "ubuntu-24.04"), + // - a custom image slug (e.g. "custom:ubuntu-2404-kube-v1.36.0"), or + // - a custom image UUID. 
+ // For Kubernetes nodes you typically want a custom image built with + // image-builder (https://image-builder.sigs.k8s.io/) that already contains + // kubelet, containerd, and the chosen Kubernetes version. // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 Image string `json:"image"` - // RootVolumeSize is the root volume size in GB. + // RootVolumeSize is the root volume size in GB. Minimum 10. If unset, the + // cloudscale.ch default for the chosen flavor is used. // +kubebuilder:validation:Minimum=10 // +optional RootVolumeSize int `json:"rootVolumeSize,omitempty"` - // Tags are key-value pairs to apply to the server. + // Tags are user-defined key/value pairs applied to the server as cloudscale + // tags. CAPCS additionally sets its own ownership tag with the key + // "capcs-cluster-"; do not set keys with the "capcs-" prefix. // +optional Tags map[string]string `json:"tags,omitempty"` @@ -95,10 +107,14 @@ type InterfaceSpec struct { } // ServerGroupSpec configures server group placement for anti-affinity. +// cloudscale.ch limits a single server group to 4 servers; to scale a pool +// beyond that, split it across multiple MachineDeployments each pointing at a +// CloudscaleMachineTemplate with a distinct ServerGroupSpec.Name. type ServerGroupSpec struct { // Name is the server group name. Machines with the same server group name - // in the same zone will be placed on different physical hosts. - // The server group is created automatically if it doesn't exist. + // in the same zone are placed on different physical hosts. The group is + // created automatically the first time CAPCS sees the name. + // Immutable after machine creation. 
// +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 Name string `json:"name"` @@ -153,7 +169,9 @@ type CloudscaleMachineStatus struct { // +kubebuilder:printcolumn:name="ProviderID",type="string",JSONPath=".spec.providerID",description="cloudscale.ch server ID" // +kubebuilder:printcolumn:name="Machine",type="string",JSONPath=".metadata.ownerReferences[?(@.kind==\"Machine\")].name",description="Machine object" -// CloudscaleMachine is the Schema for the cloudscalemachines API +// CloudscaleMachine represents a single cloudscale.ch server backing a CAPI +// Machine. Most spec fields are immutable after creation — to change them, +// roll the owning MachineDeployment or KubeadmControlPlane. type CloudscaleMachine struct { metav1.TypeMeta `json:",inline"` diff --git a/api/v1beta2/cloudscalemachinetemplate_types.go b/api/v1beta2/cloudscalemachinetemplate_types.go index 64fd857..2b91553 100644 --- a/api/v1beta2/cloudscalemachinetemplate_types.go +++ b/api/v1beta2/cloudscalemachinetemplate_types.go @@ -70,7 +70,10 @@ type NodeInfo struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status -// CloudscaleMachineTemplate is the Schema for the cloudscalemachinetemplates API +// CloudscaleMachineTemplate is the immutable template a MachineDeployment or +// KubeadmControlPlane uses to stamp out CloudscaleMachines. Its Status.Capacity +// reports the CPU/memory of the chosen flavor (plus the root volume size) so +// the cluster autoscaler can scale a MachineDeployment up from zero replicas. type CloudscaleMachineTemplate struct { metav1.TypeMeta `json:",inline"` diff --git a/cmd/main.go b/cmd/main.go index f03145a..55938f1 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,12 +25,13 @@ import ( "os" "time" - "golang.org/x/sync/errgroup" - // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. 
_ "k8s.io/client-go/plugin/pkg/client/auth" + "github.com/cloudscale-ch/cloudscale-go-sdk/v9/instrumentation" + "go.opentelemetry.io/otel" + "golang.org/x/sync/errgroup" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -38,6 +39,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -45,6 +47,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/controller" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" webhookv1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/webhook/v1beta2" // +kubebuilder:scaffold:imports ) @@ -52,6 +55,7 @@ import ( var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") + version = "dev" ) func init() { @@ -62,8 +66,14 @@ func init() { // +kubebuilder:scaffold:scheme } -// nolint:gocyclo func main() { + if err := run(); err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + os.Exit(1) + } +} + +func run() error { var metricsAddr string var metricsCertPath, metricsCertName, metricsCertKey string var webhookCertPath, webhookCertName, webhookCertKey string @@ -75,6 +85,9 @@ func main() { var machineConcurrency int var watchFilter string var tlsOpts []func(*tls.Config) + var enableTracing bool + var tracingSampleRate float64 + var profilerAddress string flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -99,6 +112,11 @@ func main() { flag.StringVar(&watchFilter, "watch-filter", "", fmt.Sprintf("Label value that the controller watches to reconcile cluster-api objects. Label key is always %s. "+ "If unspecified, the controller watches for all cluster-api objects.", clusterv1.WatchLabel)) + flag.BoolVar(&enableTracing, "enable-tracing", false, "Enable OpenTelemetry tracing") + flag.Float64Var(&tracingSampleRate, "tracing-sample-rate", 0.1, + "Trace sampling rate, between 0.0 and 1.0 (1.0 = always sample)") + flag.StringVar(&profilerAddress, "profiler-address", "", + "Bind address to expose the pprof profiler (e.g. localhost:6060)") opts := zap.Options{ Development: true, } @@ -108,14 +126,10 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) if clusterConcurrency < 1 || clusterConcurrency > 4 { - setupLog.Error( - fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag") - os.Exit(1) + return fmt.Errorf("invalid flag: --cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency) } if machineConcurrency < 1 || machineConcurrency > 10 { - setupLog.Error( - fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag") - os.Exit(1) + return fmt.Errorf("invalid flag: --machine-concurrency must be between 1 and 10, got %d", machineConcurrency) } // if the enable-http2 flag is false (the default), http/2 should be disabled @@ -192,24 +206,37 @@ func main() { HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "cloudscale.infrastructure.cluster.x-k8s.io", + PprofBindAddress: profilerAddress, // LeaderElectionReleaseOnCancel: true, }) if err != nil { - setupLog.Error(err, "Failed to start manager") - os.Exit(1) + 
return fmt.Errorf("failed to start manager: %w", err) } ctx := ctrl.SetupSignalHandler() - // Create a shared HTTP transport for all cloudscale API clients. - // This enables connection pooling and HTTP/2 multiplexing across reconciles. - transport := cloudscale.NewTransport() + if enableTracing { + shutdown, err := observability.InitTracing(ctx, setupLog, "capcs", version, tracingSampleRate) + if err != nil { + return fmt.Errorf("failed to initialize tracing: %w", err) + } + defer shutdown() + } + + // Wrap the transport with SDK instrumentation so all cloudscale API calls + // emit Prometheus metrics and OpenTelemetry spans. + // + // The wrapped transport is shared for all cloudscale API clients to enable connection pooling and HTTP/2 multiplexing + // across reconciles. + instrumentedTransport := instrumentation.InstrumentedTransport(cloudscale.NewTransport(), instrumentation.Options{ + PrometheusRegistry: ctrlmetrics.Registry, + Tracer: otel.Tracer("cloudscale-go-sdk"), + }) // Fetch region information for controllers and webhooks - regionInfo, flavorInfo, err := fetchAPIInfo(transport) + regionInfo, flavorInfo, err := fetchAPIInfo(instrumentedTransport, version) if err != nil { - setupLog.Error(err, "unable to fetch API information") - os.Exit(1) + return fmt.Errorf("failed to fetch API info: %w", err) } setupLog.Info("fetched region information", "regions", regionInfo.GetAllRegions()) setupLog.Info("fetched flavor information", "flavors", len(flavorInfo.GetAllFlavors())) @@ -218,72 +245,65 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), WatchFilter: watchFilter, - Transport: transport, + Transport: instrumentedTransport, + Version: version, MaxConcurrentReconciles: clusterConcurrency, }).SetupWithManager(ctx, mgr); err != nil { - setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleCluster") - os.Exit(1) + return fmt.Errorf("failed to create controller CloudscaleCluster: %w", err) } if err := 
(&controller.CloudscaleMachineReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), WatchFilter: watchFilter, - Transport: transport, + Transport: instrumentedTransport, + Version: version, MaxConcurrentReconciles: machineConcurrency, }).SetupWithManager(ctx, mgr); err != nil { - setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachine") - os.Exit(1) + return fmt.Errorf("failed to create controller CloudscaleMachine: %w", err) } if err := (&controller.CloudscaleMachineTemplateReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), FlavorInfo: flavorInfo, }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachineTemplate") - os.Exit(1) + return fmt.Errorf("failed to create controller CloudscaleMachineTemplate: %w", err) } webhooksEnabled := os.Getenv("ENABLE_WEBHOOKS") != "false" if webhooksEnabled { if err := webhookv1beta2.SetupCloudscaleClusterWebhookWithManager(mgr, regionInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleCluster") - os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleCluster: %w", err) } if err := webhookv1beta2.SetupCloudscaleMachineWebhookWithManager(mgr, flavorInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleMachine") - os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleMachine: %w", err) } if err := webhookv1beta2.SetupCloudscaleMachineTemplateWebhookWithManager(mgr, flavorInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleMachineTemplate") - os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleMachineTemplate: %w", err) } if err := webhookv1beta2.SetupCloudscaleClusterTemplateWebhookWithManager(mgr, regionInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleClusterTemplate") - 
os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleClusterTemplate: %w", err) } } // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { - setupLog.Error(err, "Failed to set up health check") - os.Exit(1) + return fmt.Errorf("failed to set up health check: %w", err) } if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { - setupLog.Error(err, "Failed to set up ready check") - os.Exit(1) + return fmt.Errorf("failed to set up ready check: %w", err) } - setupLog.Info("Starting manager") + setupLog.Info("Starting manager", "version", version) if err := mgr.Start(ctx); err != nil { - setupLog.Error(err, "Failed to run manager") - os.Exit(1) + return fmt.Errorf("failed to run manager: %w", err) } + return nil } // fetchAPIInfo fetches region and flavor information from cloudscale.ch API. // Requires CLOUDSCALE_API_TOKEN environment variable. -func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { +func fetchAPIInfo(transport http.RoundTripper, version string) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { token := os.Getenv("CLOUDSCALE_API_TOKEN") if token == "" { return nil, nil, fmt.Errorf("CLOUDSCALE_API_TOKEN environment variable is required") @@ -292,7 +312,7 @@ func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscal ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - client := cloudscale.NewClient(token, transport) + client := cloudscale.NewClient(token, version, transport) var regionInfo *cloudscale.RegionInfo var flavorInfo *cloudscale.FlavorInfo diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml index 4de5538..323e83e 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml +++ 
b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclusters.yaml @@ -36,7 +36,10 @@ spec: name: v1beta2 schema: openAPIV3Schema: - description: CloudscaleCluster is the Schema for the cloudscaleclusters API + description: |- + CloudscaleCluster is the cloudscale.ch infrastructure for a CAPI Cluster. + It owns the networks, control-plane load balancer, optional floating IP, and + server groups that back the cluster's machines. properties: apiVersion: description: |- @@ -82,7 +85,11 @@ spec: properties: algorithm: default: round_robin - description: Algorithm is the load balancing algorithm. + description: |- + Algorithm is the cloudscale.ch load-balancing algorithm. + - "round_robin" (default): rotate requests across healthy backends. + - "least_connections": send each request to the backend with the fewest active connections. + - "source_ip": hash the client IP so the same client lands on the same backend. enum: - round_robin - least_connections @@ -90,8 +97,10 @@ spec: type: string apiServerPort: default: 6443 - description: APIServerPort is the port for the Kubernetes API - server. + description: |- + APIServerPort is the LB listener port exposed for the Kubernetes API + server. Defaults to 6443. The pool always targets the API server on the + control plane nodes' 6443. format: int32 maximum: 65535 minimum: 1 @@ -105,7 +114,9 @@ spec: type: boolean flavor: default: lb-standard - description: Flavor is the load balancer flavor (size). + description: |- + Flavor is the cloudscale.ch load balancer flavor slug. Defaults to + "lb-standard". type: string healthMonitor: description: HealthMonitor configures the load balancer health @@ -155,8 +166,9 @@ spec: description: Name is the name of the Secret. type: string namespace: - description: Namespace is the namespace of the Secret. Defaults - to the cluster namespace. + description: |- + Namespace is the namespace of the Secret. Defaults to the + CloudscaleCluster's own namespace if unset. 
type: string required: - name @@ -238,15 +250,20 @@ spec: - name x-kubernetes-list-type: map region: - description: Region is the cloudscale.ch region (e.g., "rma", "lpg"). + description: |- + Region is the cloudscale.ch region the cluster is provisioned in. + Determines the default zone and the set of available flavors. + Immutable after cluster creation. enum: - rma - lpg type: string zone: description: |- - Zone is the cloudscale.ch zone (e.g., "rma1", "lpg1"). - Defaults to region + "1" if not specified. + Zone is the cloudscale.ch zone within Region. + Defaults to Region + "1" (e.g., "rma1", "lpg1"). Set explicitly only when + the region offers multiple zones and you need to pin the cluster to one. + Immutable after cluster creation. type: string required: - credentialsRef diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclustertemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclustertemplates.yaml index 01a3c84..3e003da 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclustertemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscaleclustertemplates.yaml @@ -19,8 +19,14 @@ spec: - name: v1beta2 schema: openAPIV3Schema: - description: CloudscaleClusterTemplate is the Schema for the cloudscaleclustertemplates - API + description: |- + CloudscaleClusterTemplate is a template embedded in a ClusterClass that the + CAPI topology controller uses to materialize a CloudscaleCluster for every + Cluster whose spec.topology.classRef resolves to a ClusterClass referencing + this object. Unlike CloudscaleCluster, this CRD has no controller and no + status — it is consumed only at Cluster creation time by CAPI core. + Its spec is immutable after creation (enforced by the validating webhook); + per-cluster overrides go through ClusterClass variables, not template edits. 
properties: apiVersion: description: |- @@ -43,31 +49,16 @@ spec: description: spec defines the desired state of CloudscaleClusterTemplate properties: template: - description: CloudscaleClusterTemplateResource contains spec for CloudscaleClusterSpec. + description: |- + Template is the embedded resource the CAPI topology controller stamps out + into a CloudscaleCluster for each Cluster whose ClusterClass references + this CloudscaleClusterTemplate. properties: metadata: description: |- - ObjectMeta is metadata that all persisted resources must have, which includes all objects - users must create. This is a copy of customizable fields from metav1.ObjectMeta. - - ObjectMeta is embedded in `Machine.Spec`, `MachineDeployment.Template` and `MachineSet.Template`, - which are not top-level Kubernetes objects. Given that metav1.ObjectMeta has lots of special cases - and read-only fields which end up in the generated CRD validation, having it as a subset simplifies - the API and some issues that can impact user experience. - - During the [upgrade to controller-tools@v2](https://github.com/kubernetes-sigs/cluster-api/pull/1054) - for v1alpha2, we noticed a failure would occur running Cluster API test suite against the new CRDs, - specifically `spec.metadata.creationTimestamp in body must be of type string: "null"`. - The investigation showed that `controller-tools@v2` behaves differently than its previous version - when handling types from [metav1](k8s.io/apimachinery/pkg/apis/meta/v1) package. - - In more details, we found that embedded (non-top level) types that embedded `metav1.ObjectMeta` - had validation properties, including for `creationTimestamp` (metav1.Time). - The `metav1.Time` type specifies a custom json marshaller that, when IsZero() is true, returns `null` - which breaks validation because the field isn't marked as nullable. - - In future versions, controller-tools@v2 might allow overriding the type and validation for embedded - types. 
When that happens, this hack should be revisited. + ObjectMeta supplies labels and annotations that propagate to the + generated CloudscaleCluster. The name/namespace fields are ignored: + the topology controller derives those from the owning Cluster. minProperties: 1 properties: annotations: @@ -90,8 +81,12 @@ spec: type: object type: object spec: - description: CloudscaleClusterSpec defines the desired state of - CloudscaleCluster + description: |- + Spec embeds CloudscaleClusterSpec verbatim and shares the same defaulting + and validation logic via the cluster webhook helpers + (clusterSpecDefault / clusterSpecValidateCreate). + Immutable after creation; override per-cluster fields via + spec.topology.variables on the Cluster instead of mutating this spec. properties: controlPlaneEndpoint: description: |- @@ -119,7 +114,11 @@ spec: properties: algorithm: default: round_robin - description: Algorithm is the load balancing algorithm. + description: |- + Algorithm is the cloudscale.ch load-balancing algorithm. + - "round_robin" (default): rotate requests across healthy backends. + - "least_connections": send each request to the backend with the fewest active connections. + - "source_ip": hash the client IP so the same client lands on the same backend. enum: - round_robin - least_connections @@ -127,8 +126,10 @@ spec: type: string apiServerPort: default: 6443 - description: APIServerPort is the port for the Kubernetes - API server. + description: |- + APIServerPort is the LB listener port exposed for the Kubernetes API + server. Defaults to 6443. The pool always targets the API server on the + control plane nodes' 6443. format: int32 maximum: 65535 minimum: 1 @@ -142,7 +143,9 @@ spec: type: boolean flavor: default: lb-standard - description: Flavor is the load balancer flavor (size). + description: |- + Flavor is the cloudscale.ch load balancer flavor slug. Defaults to + "lb-standard". 
type: string healthMonitor: description: HealthMonitor configures the load balancer @@ -193,8 +196,9 @@ spec: description: Name is the name of the Secret. type: string namespace: - description: Namespace is the namespace of the Secret. - Defaults to the cluster namespace. + description: |- + Namespace is the namespace of the Secret. Defaults to the + CloudscaleCluster's own namespace if unset. type: string required: - name @@ -276,16 +280,20 @@ spec: - name x-kubernetes-list-type: map region: - description: Region is the cloudscale.ch region (e.g., "rma", - "lpg"). + description: |- + Region is the cloudscale.ch region the cluster is provisioned in. + Determines the default zone and the set of available flavors. + Immutable after cluster creation. enum: - rma - lpg type: string zone: description: |- - Zone is the cloudscale.ch zone (e.g., "rma1", "lpg1"). - Defaults to region + "1" if not specified. + Zone is the cloudscale.ch zone within Region. + Defaults to Region + "1" (e.g., "rma1", "lpg1"). Set explicitly only when + the region offers multiple zones and you need to pin the cluster to one. + Immutable after cluster creation. type: string required: - credentialsRef diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachines.yaml index 00353d5..0ba75e4 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachines.yaml @@ -36,7 +36,10 @@ spec: name: v1beta2 schema: openAPIV3Schema: - description: CloudscaleMachine is the Schema for the cloudscalemachines API + description: |- + CloudscaleMachine represents a single cloudscale.ch server backing a CAPI + Machine. Most spec fields are immutable after creation — to change them, + roll the owning MachineDeployment or KubeadmControlPlane. 
properties: apiVersion: description: |- @@ -59,12 +62,22 @@ spec: description: spec defines the desired state of CloudscaleMachine properties: flavor: - description: Flavor is the cloudscale.ch server flavor (e.g., "flex-8-4"). + description: |- + Flavor is the cloudscale.ch server flavor slug, e.g. "flex-4-2" or + "plus-8-4". List available flavors via the cloudscale API + (`GET /v1/flavors`) or the control panel. + Immutable after machine creation. minLength: 1 type: string image: - description: Image is the OS image slug (e.g., "ubuntu-24.04"), custom - image slug (e.g., "custom:ubuntu-foo"), or custom image UUID. + description: |- + Image identifies the OS image used to boot the server. One of: + - a public image slug (e.g. "ubuntu-24.04"), + - a custom image slug (e.g. "custom:ubuntu-2404-kube-v1.36.0"), or + - a custom image UUID. + For Kubernetes nodes you typically want a custom image built with + image-builder (https://image-builder.sigs.k8s.io/) that already contains + kubelet, containerd, and the chosen Kubernetes version. minLength: 1 type: string interfaces: @@ -112,7 +125,9 @@ spec: Format: cloudscale:// type: string rootVolumeSize: - description: RootVolumeSize is the root volume size in GB. + description: |- + RootVolumeSize is the root volume size in GB. Minimum 10. If unset, the + cloudscale.ch default for the chosen flavor is used. minimum: 10 type: integer serverGroup: @@ -124,8 +139,9 @@ spec: name: description: |- Name is the server group name. Machines with the same server group name - in the same zone will be placed on different physical hosts. - The server group is created automatically if it doesn't exist. + in the same zone are placed on different physical hosts. The group is + created automatically the first time CAPCS sees the name. + Immutable after machine creation. minLength: 1 type: string required: @@ -134,7 +150,10 @@ spec: tags: additionalProperties: type: string - description: Tags are key-value pairs to apply to the server. 
+ description: |- + Tags are user-defined key/value pairs applied to the server as cloudscale + tags. CAPCS additionally sets its own ownership tag with the key + "capcs-cluster-"; do not set keys with the "capcs-" prefix. type: object required: - flavor diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachinetemplates.yaml index e70da90..e52a208 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_cloudscalemachinetemplates.yaml @@ -17,8 +17,11 @@ spec: - name: v1beta2 schema: openAPIV3Schema: - description: CloudscaleMachineTemplate is the Schema for the cloudscalemachinetemplates - API + description: |- + CloudscaleMachineTemplate is the immutable template a MachineDeployment or + KubeadmControlPlane uses to stamp out CloudscaleMachines. Its Status.Capacity + reports the CPU/memory of the chosen flavor (plus the root volume size) so + the cluster autoscaler can scale a MachineDeployment up from zero replicas. properties: apiVersion: description: |- @@ -48,14 +51,22 @@ spec: of the machine. properties: flavor: - description: Flavor is the cloudscale.ch server flavor (e.g., - "flex-8-4"). + description: |- + Flavor is the cloudscale.ch server flavor slug, e.g. "flex-4-2" or + "plus-8-4". List available flavors via the cloudscale API + (`GET /v1/flavors`) or the control panel. + Immutable after machine creation. minLength: 1 type: string image: - description: Image is the OS image slug (e.g., "ubuntu-24.04"), - custom image slug (e.g., "custom:ubuntu-foo"), or custom - image UUID. + description: |- + Image identifies the OS image used to boot the server. One of: + - a public image slug (e.g. "ubuntu-24.04"), + - a custom image slug (e.g. "custom:ubuntu-2404-kube-v1.36.0"), or + - a custom image UUID. 
+ For Kubernetes nodes you typically want a custom image built with + image-builder (https://image-builder.sigs.k8s.io/) that already contains + kubelet, containerd, and the chosen Kubernetes version. minLength: 1 type: string interfaces: @@ -103,7 +114,9 @@ spec: Format: cloudscale:// type: string rootVolumeSize: - description: RootVolumeSize is the root volume size in GB. + description: |- + RootVolumeSize is the root volume size in GB. Minimum 10. If unset, the + cloudscale.ch default for the chosen flavor is used. minimum: 10 type: integer serverGroup: @@ -115,8 +128,9 @@ spec: name: description: |- Name is the server group name. Machines with the same server group name - in the same zone will be placed on different physical hosts. - The server group is created automatically if it doesn't exist. + in the same zone are placed on different physical hosts. The group is + created automatically the first time CAPCS sees the name. + Immutable after machine creation. minLength: 1 type: string required: @@ -125,7 +139,10 @@ spec: tags: additionalProperties: type: string - description: Tags are key-value pairs to apply to the server. + description: |- + Tags are user-defined key/value pairs applied to the server as cloudscale + tags. CAPCS additionally sets its own ownership tag with the key + "capcs-cluster-"; do not set keys with the "capcs-" prefix. type: object required: - flavor diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000..08c793b --- /dev/null +++ b/docs/development.md @@ -0,0 +1,208 @@ +# Development + +For contributors working on CAPCS itself. End-user docs are in +[Getting Started](getting-started.md) and [Troubleshooting](troubleshooting.md). + +## Architecture sketch + +CAPCS is a kubebuilder-scaffolded infrastructure provider. 
+ +``` +api/v1beta2/ CRD types +internal/controller/ Reconcilers, one file per cloudscale resource (network, LB, FIP, server group, server) +internal/webhook/v1beta2/ Defaulting + validating webhooks (one per CRD) +internal/cloudscale/ SDK wrapper: shared HTTP transport, flavor/region helpers, per-cluster services +internal/credentials/ Resolves the per-cluster API token from `credentialsRef` +internal/scope/ Per-cluster / per-machine reconciliation scope objects +cmd/main.go Manager setup, controller wiring, leader election, webhook registration +``` + +`CloudscaleClusterTemplate` has a webhook but no reconciler — CAPI core's +topology controller consumes it to materialize a `CloudscaleCluster` for each +`Cluster` whose ClusterClass references the template. + +A few conventions to know before touching code: + +- **Webhooks own all defaulting and validation.** Controllers must never repeat + validation logic — if a field needs a default or a check, it goes in the + webhook so behavior stays consistent between `kubectl apply` and the + reconcile loop. +- **Ownership tags.** Cloudscale resources are tagged with the key + `capcs-cluster-` so the reconciler can identify what it owns + and clean it up. See `api/v1beta2/tags.go` and `internal/controller/cloudscale_tags.go`. +- **Shared HTTP transport.** Per-cluster cloudscale clients share an + `http.Transport` (see `internal/cloudscale/services.go`) so connection + pooling works across reconciliations. 
+ +## Setup + +You need: + +- Go (version pinned in `go.mod`) +- [kind](https://kind.sigs.k8s.io/), [clusterctl](https://cluster-api.sigs.k8s.io/user/quick-start#install-clusterctl), + `kubectl`, `kustomize` +- [Tilt](https://tilt.dev/) for the inner-loop workflow +- A cloudscale.ch API token (export `CLOUDSCALE_API_TOKEN`) +- A cloudscale.ch custom image (see [Getting Started](getting-started.md#1-build-and-import-a-custom-os-image)) + +## Make targets + +```bash +make test # unit tests + envtest (runs fmt, vet, generate, manifests) +make manifests # regenerate CRDs / webhook config from kubebuilder markers +make generate # regenerate deepcopy code +make lint # golangci-lint +make build # build the manager binary + +make test-e2e-lifecycle # smallest E2E suite — single CP + 1 worker +make test-e2e-topology # topology-flavor E2E (ClusterClass quick-start) +make test-e2e # full conformance-fast E2E suite (slow) +``` + +E2E suites and their cadence are documented in +[Testing Releases](testing-releases.md). + +## Iterating on cluster templates locally + +When you change a file under `templates/`, you can test it before it ships in a +release by pointing `clusterctl generate` at the local file: + +```bash +clusterctl generate cluster my-cluster \ + --infrastructure cloudscale-ch-cloudscale \ + --kubernetes-version v1.36.0 \ + --from templates/cluster-template-fip.yaml \ + | kubectl apply -f - +``` + +This is a contributor flow only — end users consume published flavors via +`--flavor` (see [Getting Started](getting-started.md#6-pick-a-cluster-template-flavor)). + +For the `topology` flavor, `clusterctl generate --from` only consumes a single +cluster-template file, so the `quick-start` ClusterClass in +`templates/cluster-class.yaml` must also be applied separately (or bundled with +a kustomize overlay). + +## Tilt + +The fastest inner loop is Cluster API's +[Tilt setup](https://cluster-api.sigs.k8s.io/developer/core/tilt.html).
It runs +out of a local clone of [cluster-api](https://github.com/kubernetes-sigs/cluster-api), +**not** out of this repository. + +Drop a `tilt-settings.yaml` next to the cluster-api checkout: + +```yaml +default_registry: "" +provider_repos: + - path/to/local/clone/cluster-api-provider-cloudscale +enable_providers: + - cloudscale + - kubeadm-bootstrap + - kubeadm-control-plane +deploy_cert_manager: true +kustomize_substitutions: + CLOUDSCALE_API_TOKEN: "INSERT_TOKEN_HERE" + CLOUDSCALE_SSH_PUBLIC_KEY: "INSERT_SSH_PUBLIC_KEY_HERE" + CLOUDSCALE_REGION: "lpg" + CLOUDSCALE_CONTROL_PLANE_MACHINE_FLAVOR: "flex-4-2" + CLOUDSCALE_WORKER_MACHINE_FLAVOR: "flex-4-2" + CLOUDSCALE_MACHINE_IMAGE: "IMAGE_NAME" + CLOUDSCALE_ROOT_VOLUME_SIZE: "50" + # Required for the fip / public-lb-private-nodes / pre-existing-network flavors: + # CLOUDSCALE_NETWORK_UUID: "UUID_HERE" +extra_args: + cloudscale: + - "--zap-log-level=5" +template_dirs: + docker: + - ./test/infrastructure/docker/templates + cloudscale: + - path/to/local/clone/cluster-api-provider-cloudscale/templates +# optional, if wanting to deploy the observability stack +#deploy_observability: +# - grafana +# - kube-state-metrics +# - loki +# - metrics-server +# - prometheus +# - alloy +# - parca +# - tempo +``` + +Then `tilt up` from the cluster-api checkout. + +The `deploy_observability` block is processed by the cluster-api Tiltfile and +brings up Prometheus, Grafana, Tempo, and friends in the management cluster; +see [Cluster API's Tilt documentation](https://cluster-api.sigs.k8s.io/developer/core/tilt) +for what each component does and how to reach the resulting UIs. CAPCS's +`ServiceMonitor` is auto-discovered once the prometheus kustomization is +enabled. For production metric/tracing setup, see +[Observability](observability.md). 
+ +## Tests + +| Layer | Location | What it covers | +|---------|-------------------------------------------|---------------------------------------------------------------------------------------| +| Unit | `*_test.go` next to each file | Pure logic; cloudscale API mocked | +| envtest | `internal/controller/suite_test.go` setup | Reconcilers against a real apiserver + etcd, cloudscale API mocked | +| E2E | `test/e2e/` | Real workload clusters on cloudscale.ch (see [Testing Releases](testing-releases.md)) | + +PRs do not run E2E automatically. Run the relevant suite locally before +submitting (`make test-e2e-lifecycle` at minimum); reviewers can run additional +suites or trigger the `test-e2e.yml` workflow manually after confirming the +diff is safe — see [Running E2E on a PR](#running-e2e-on-a-pr). + +## Running E2E on a PR + +**When to use it.** A PR touches reconcilers, webhooks, or files under +`templates/` and the reviewer wants to confirm that the e2e suite still passes with these changes. + +**Prerequisites.** Maintainer role on +`cloudscale-ch/cluster-api-provider-cloudscale`. The `e2e-tests` +concurrency group means at most one e2e run is in flight at a time. + +**Triggering from the GitHub UI.** Actions → "E2E Tests (Manual)" → "Run +workflow". In the "Use workflow from" dropdown pick the PR branch (`gh pr view +<number>` shows the head branch), choose the make target, and click "Run workflow". +PRs from forks are not directly selectable — push the head to a branch on the +upstream repo first: + +```bash +gh pr checkout <number> +git push upstream HEAD:pr/<number> +``` + +then dispatch against `pr/<number>`. + +**Triggering with `gh`.** Use `gh workflow run`: + +```bash +gh workflow run test-e2e.yml \ + --ref <branch> \ + -f test_target=test-e2e-lifecycle +``` + +`--ref` accepts branch or tag names but not `pull/<number>/head`; for fork PRs use +the push-to-upstream workaround above.
Watch the run: + +```bash +gh run list --workflow=test-e2e.yml --limit 5 +gh run watch +``` + +**Cleaning up if the run is killed.** cloudscale-side cleanup runs inside the +e2e suite itself; if the workflow is cancelled mid-run, dangling cloudscale +resources may need manual cleanup via the cloudscale control panel. + +## Releases + +See [Releasing](releasing.md) for the tag-and-publish flow and +[Testing Releases](testing-releases.md) for post-release verification. + +## Notes for AI agent contributors + +If you are an AI agent contributing changes, read [`AGENTS.md`](../AGENTS.md) at +the repo root — it covers kubebuilder rules, auto-generated files to leave +alone, and project-specific conventions in more detail. diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..7828e14 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,276 @@ +# Getting Started + +A CAPCS cluster has two parts: a **management cluster** running CAPCS and core +Cluster API controllers, and a **workload cluster** whose servers, network, +and load balancer CAPCS provisions on cloudscale.ch. Once the workload cluster +comes up you install a CCM, a CNI, and (optionally) a CSI driver into it. + +For Cluster API fundamentals (concepts, `clusterctl`, upgrades) see the +[upstream documentation](https://cluster-api.sigs.k8s.io/) — this guide only +covers what is cloudscale-specific. + +## Contents + +- [Prerequisites](#prerequisites) +- [1. Build and import a custom OS image](#1-build-and-import-a-custom-os-image) +- [2. Create a management cluster](#2-create-a-management-cluster) +- [3. Configure cloudscale credentials](#3-configure-cloudscale-credentials) +- [4. Install CAPCS on the management cluster](#4-install-capcs-on-the-management-cluster) +- [5. Configure template variables](#5-configure-template-variables) +- [6. Pick a cluster template flavor](#6-pick-a-cluster-template-flavor) +- [7. 
Generate and apply the workload cluster](#7-generate-and-apply-the-workload-cluster) +- [8. Install the cloudscale CCM](#8-install-the-cloudscale-ccm) +- [9. Install a CNI (Cilium)](#9-install-a-cni-cilium) +- [10. Verify](#10-verify) +- [Optional: persistent storage (cloudscale CSI)](#optional-persistent-storage-cloudscale-csi) +- [Clean up](#clean-up) +- [Next steps](#next-steps) + +## Prerequisites + +- A cloudscale.ch account and an API token with read/write scope from the + [control panel](https://control.cloudscale.ch/). Keep it out of version + control. +- [`clusterctl`](https://cluster-api.sigs.k8s.io/user/quick-start#install-clusterctl) in version >= 1.13.0. +- `kubectl` +- The [`cilium` CLI](https://docs.cilium.io/en/stable/gettingstarted/k8s-install-default/) + (used in step 9) or use another CNI you prefer. +- `helm` for the optional CSI driver. + +## 1. Build and import a custom OS image + +CAPCS does not publish a pre-built image. Build one with +[image-builder for OpenStack](https://image-builder.sigs.k8s.io/capi/providers/openstack) +and upload it to your cloudscale.ch project via the control panel or API. + +The image bakes in `kubeadm`, `kubelet`, and the container runtime — its +Kubernetes minor version must match the `KUBERNETES_VERSION` you later pass to +`clusterctl generate cluster`. + +Once imported, note the exact image name. You pass it as +`CLOUDSCALE_MACHINE_IMAGE` with a `custom:` prefix, e.g. +`custom:ubuntu-2404-kube-v1.36.0`. + +## 2. Create a management cluster + +Any conformant Kubernetes cluster works. Local [kind](https://kind.sigs.k8s.io/) +is the simplest: + +```bash +kind create cluster --name capcs-mgmt +``` + +## 3. Configure cloudscale credentials + +`clusterctl` resolves template variables from either the shell environment or +its config file at `~/.config/cluster-api/clusterctl.yaml`. 
Pick one: + +**Shell:** + +```bash +export CLOUDSCALE_API_TOKEN= +``` + +**Config file** (`~/.config/cluster-api/clusterctl.yaml`): + +```yaml +CLOUDSCALE_API_TOKEN: +# any other CLOUDSCALE_* variable from step 5 can live here too +``` + +See [`clusterctl` configuration](https://cluster-api.sigs.k8s.io/clusterctl/configuration) +for the full format. + +## 4. Install CAPCS on the management cluster + +If you plan to use the `topology` flavor, export `CLUSTER_TOPOLOGY=true` first +so cluster-api core enables the +[ClusterClass feature gate](https://cluster-api.sigs.k8s.io/tasks/experimental-features/cluster-class/). + +```bash +clusterctl init --infrastructure cloudscale-ch-cloudscale +``` + +This installs the Cluster API core, kubeadm bootstrap, kubeadm control plane, +and CAPCS components. + +## 5. Configure template variables + +`clusterctl generate cluster` substitutes these into the chosen template: + +| Variable | Description | Example | +|-------------------------------------------|-------------------------------------------------------|-----------------------------------| +| `CLOUDSCALE_API_TOKEN` | API token used by the workload cluster's CAPCS Secret | `abc123...` | +| `CLOUDSCALE_REGION` | cloudscale.ch region | `lpg` or `rma` | +| `CLOUDSCALE_MACHINE_IMAGE` | Name of your imported custom image | `custom:ubuntu-2404-kube-v1.36.0` | +| `CLOUDSCALE_CONTROL_PLANE_MACHINE_FLAVOR` | Flavor for control plane nodes | `flex-4-2` | +| `CLOUDSCALE_WORKER_MACHINE_FLAVOR` | Flavor for worker nodes | `flex-4-2` | +| `CLOUDSCALE_ROOT_VOLUME_SIZE` | Root volume size in GB | `50` | +| `CLOUDSCALE_SSH_PUBLIC_KEY` | SSH public key added to every node | `ssh-ed25519 AAAA...` | +| `CLOUDSCALE_NETWORK_UUID` | Pre-existing network UUID (depends on flavor) | `2db69ba3-...` | + +Set them in your shell, or keep them in `clusterctl.yaml` alongside the token. + +## 6. 
Pick a cluster template flavor + +| Flavor | Network | Control plane endpoint | Node connectivity | Requires | +|---------------------------|--------------------------|------------------------|-------------------|------------------------------------------------------| +| *(default)* | Managed, `172.18.0.0/24` | Public LB, DualStack | Public + cluster | — | +| `fip` | Pre-existing | Floating IP, IPv4 | Public + cluster | `CLOUDSCALE_NETWORK_UUID` | +| `pre-existing-network` | Pre-existing | Public LB, DualStack | Public + cluster | `CLOUDSCALE_NETWORK_UUID` | +| `public-lb-private-nodes` | Pre-existing + NAT | Public LB | Private only | `CLOUDSCALE_NETWORK_UUID`, with a NAT gateway set up | +| `topology` | Managed, `172.18.0.0/24` | Public LB, DualStack | Public + cluster | `CLUSTER_TOPOLOGY=true` feature gate | + +These flavors just show possible configurations. You're encouraged to copy and adjust them to your needs. + +If you do need a NAT gateway set up for your cluster, set up a private network and contact cloudscale support with the +following details: + +- Desired name of the gateway +- UUID of the network (in control panel details) +- Optional: desired IP of the gateway in the network + +### Using the topology flavor + +The `topology` flavor generates a `Cluster` that references the `quick-start` +[ClusterClass](https://cluster-api.sigs.k8s.io/tasks/experimental-features/cluster-class/) +shipped in `templates/cluster-class.yaml`. Apply it once per namespace before +generating any topology cluster: + +```bash +clusterctl generate yaml \ + --from https://raw.githubusercontent.com/cloudscale-ch/cluster-api-provider-cloudscale/main/templates/cluster-class.yaml \ + | kubectl apply -f - +``` + +Per-cluster overrides go under `spec.topology.variables`. See `templates/cluster-class.yaml` for the full +variable list and defaults. 
+ +Caveat: the `quick-start` ClusterClass exposes narrower configurability than the traditional templates — no floating +IP, no pre-existing network, no private load balancer, and pod/service CIDRs are pinned to +`192.168.0.0/16` / `10.96.0.0/12`. Adjust the template for your use-case. + +Use `--flavor topology` in the next step if you use the ClusterClass. + +## 7. Generate and apply the workload cluster + +Make sure to adjust the `--flavor` flag to the right flavor or omit it for the default template. + +```bash +clusterctl generate cluster my-cluster \ + --infrastructure cloudscale-ch-cloudscale \ + --kubernetes-version v1.36.0 \ + --control-plane-machine-count 1 \ + --worker-machine-count 2 \ + --flavor <flavor> \ + > my-cluster.yaml + +kubectl apply -f my-cluster.yaml +``` + +Inspect `my-cluster.yaml` before +applying — it includes a Secret holding `CLOUDSCALE_API_TOKEN`, which CAPCS +references via `CloudscaleCluster.spec.credentialsRef`. + +Watch progress: + +```bash +clusterctl describe cluster my-cluster +``` + +## 8. Install the cloudscale CCM + +CAPCS only provisions infrastructure. For a working cluster +the [cloudscale CCM](https://github.com/cloudscale-ch/cloudscale-cloud-controller-manager) must be installed on the +**workload** cluster. + +CAPCS provides a [`ClusterResourceSet` you apply on the **management** cluster](https://cluster-api.sigs.k8s.io/tasks/cluster-resource-set). +CAPI's CRS controller then deploys it into any workload cluster labelled `ccm: cloudscale` — every CAPCS template sets this label. + +**N.B.:** the CCM also needs `CLOUDSCALE_API_TOKEN` to be set in the environment. Using a different token for the CCM than for CAPCS is recommended.
+ +```bash +# the namespace must match the namespace of the workload cluster definition +curl -L https://raw.githubusercontent.com/cloudscale-ch/cluster-api-provider-cloudscale/main/templates/addons/ccm.yaml \ + | envsubst | kubectl apply -n ${NAMESPACE} -f - +``` + +This creates a ConfigMap with the CCM manifests, a Secret with the API token, +and the `ClusterResourceSet` that wires them together. Workload-cluster nodes +will sit tainted with `node.cloudprovider.kubernetes.io/uninitialized` until +the CCM starts and removes the taint — that is expected. + +## 9. Install a CNI (Cilium) + +Fetch the workload cluster's kubeconfig and point `KUBECONFIG` at it: + +```bash +clusterctl get kubeconfig my-cluster > ~/.kube/my-cluster.yaml +export KUBECONFIG=~/.kube/my-cluster.yaml +``` + +Install Cilium with defaults: + +```bash +cilium install +``` + +Nodes become `Ready` once Cilium is up. Any standard CNI works — Cilium is just +the example. + +## 10. Verify + +```bash +# management cluster +clusterctl describe cluster + +# workload cluster +kubectl get nodes +kubectl -n kube-system get pods +``` + +All nodes should report `Ready`. In `kube-system` you should see +`cloudscale-cloud-controller-manager-*` on each control plane node and +`cilium-*` on every node. + +If something is stuck, see [troubleshooting](troubleshooting.md). + +## Optional: persistent storage (cloudscale CSI) + +For `PersistentVolumeClaim` support, install the +[cloudscale CSI driver](https://github.com/cloudscale-ch/csi-cloudscale) on the +**workload** cluster. 
With `KUBECONFIG` still pointing at the workload cluster: + +```bash +kubectl -n kube-system create secret generic cloudscale \ + --from-literal=access-token="$CLOUDSCALE_API_TOKEN" + +helm repo add csi-cloudscale https://cloudscale-ch.github.io/csi-cloudscale +helm repo update +helm install csi-cloudscale csi-cloudscale/csi-cloudscale -n kube-system +``` + +The chart ships the `cloudscale-volume-ssd` (default) and +`cloudscale-volume-bulk` storage classes, plus LUKS-encrypted variants. See +the upstream repo for version compatibility and storage class options. + +## Clean up + +```bash +kubectl delete cluster my-cluster +``` + +Deleting the `Cluster` cascades through CAPCS, which removes the servers, load +balancer, floating IPs, server groups, and any managed networks it created. +Pre-existing networks supplied via `CLOUDSCALE_NETWORK_UUID` are left intact. + +**N.B.:** Always delete the cluster. Do not delete other custom resources before the cluster is deleted. + +## Next steps + +- Look up CRD fields with `kubectl explain cloudscalecluster.spec` (or browse + the CRDs in [`config/crd/bases/`](../config/crd/bases)) +- Read the [troubleshooting guide](troubleshooting.md) when something gets + stuck +- Upstream Cluster API tasks (upgrades, scaling, MachineHealthChecks, etc.) + are documented at <https://cluster-api.sigs.k8s.io/> diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..cdee9d9 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,93 @@ +# Observability + +CAPCS exposes Prometheus metrics, ships Grafana dashboards, and can emit +OpenTelemetry traces. Tracing and the pprof profiler are opt-in; metrics are +always served but require opt-in wiring to be scraped. + +For the developer Tilt loop, see [Development](development.md#tilt) — the +cluster-api core Tiltfile can deploy the full Prometheus / Grafana / Tempo +stack alongside CAPCS.
+ +## Metrics + +The manager exposes controller-runtime metrics on **HTTPS port 8443** at +`/metrics`, served via the `controller-manager-metrics-service` Service +(port name `https`). Authentication is via a Kubernetes ServiceAccount bearer +token. + +Relevant flags (defaults shown): + +``` +--metrics-bind-address=:8443 +--metrics-secure=true +``` + +### Enabling scraping + +The shipped `config/default/kustomization.yaml` leaves the `ServiceMonitor` +and `NetworkPolicy` commented out. To enable scraping with the +[Prometheus Operator](https://prometheus-operator.dev/): + +1. Uncomment these two resources in `config/default/kustomization.yaml`: + + ```yaml + - ../prometheus + - ../network-policy + ``` + +2. Label the namespace that runs Prometheus so the `NetworkPolicy` allows + ingress: + + ```bash + kubectl label namespace <prometheus-namespace> metrics=enabled + ``` + +The shipped `ServiceMonitor` (`config/prometheus/monitor.yaml`) uses +`insecureSkipVerify: true` against the manager's self-signed TLS. For +production, enable the cert-manager-backed patch +`config/prometheus/monitor_tls_patch.yaml` (see comments in +`config/default/kustomization.yaml`). + +## Dashboards + +Three Grafana dashboards live under [`grafana/`](../grafana): + +| File | What it shows | +|-----------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------| +| `controller-runtime-metrics.json` | Standard controller-runtime metrics: reconcile rate, queue depth, latency | +| `controller-resources-metrics.json` | Pod CPU/memory/goroutine metrics for the manager | +| `custom-metrics/custom-metrics-dashboard.json` (and accompanying `config.yaml`) | cloudscale.ch API call rate and error rate, by endpoint | + +The custom dashboard reads `cloudscale_requests_total`, so it works for any +workload that uses cloudscale-go-sdk v9 with the instrumented transport, not +just CAPCS.
+ +## Tracing (opt-in) + +Tracing is **off by default**. To enable it, set the following on the manager: + +``` +--enable-tracing=true +--tracing-sample-rate=0.1 # 0.0–1.0; default 0.1 +``` + +Spans are exported via OTLP/gRPC (insecure). The endpoint is read from +`OTEL_EXPORTER_OTLP_ENDPOINT` (defaults to `localhost:4317`). Point it at your +collector — Tempo, Alloy, or an OpenTelemetry Collector — for example: + +```yaml +env: + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: http://tempo.observability.svc:4317 +``` + +## Profiler (opt-in) + +pprof is **off by default**. Set `--profiler-address` to bind it: + +``` +--profiler-address=localhost:6060 +``` + +Bind to loopback in production and reach it via `kubectl port-forward`. Do +not expose pprof on a routable interface. diff --git a/docs/testing-releases.md b/docs/testing-releases.md index 581093e..33ad251 100644 --- a/docs/testing-releases.md +++ b/docs/testing-releases.md @@ -100,7 +100,8 @@ E2E tests run on schedule against the dev build: | Biweekly (1st & 15th, 3 AM UTC) | Full K8s conformance | `e2e-biweekly.yml` | For release candidates, trigger a manual e2e run via the `test-e2e.yml` workflow dispatch in GitHub Actions. Select the -test suite(s) to run and the branch/tag to test against. +test suite(s) to run and the branch/tag to test against. The same workflow is also how maintainers run a broader suite +against a PR branch — see [Running E2E on a PR](development.md#running-e2e-on-a-pr) for the mechanics. See `test/e2e/` for test infrastructure details and `Makefile` for individual e2e targets (`test-e2e-lifecycle`, `test-e2e-ha`, etc.). diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 0000000..5edc4e2 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,153 @@ +# Troubleshooting + +Cloudscale-specific failure modes for CAPCS. For generic Cluster API issues +(bootstrap, certificates, MachineHealthCheck, etc.) 
see the +[upstream troubleshooting guide](https://cluster-api.sigs.k8s.io/user/troubleshooting.html). + +## Where to look first + +```bash +# Cluster-level status, conditions, and child resources +clusterctl describe cluster <cluster-name> + +# Cloudscale infrastructure conditions +kubectl describe cloudscalecluster <cluster-name> +kubectl describe cloudscalemachine <machine-name> + +# Controller logs +kubectl -n capcs-system logs deploy/capcs-controller-manager -f +``` + +Most problems surface as a `Ready: False` condition with a `Reason` and `Message` +on the `CloudscaleCluster` or `CloudscaleMachine` — read those before diving +into logs. + +## Authentication: `401 Unauthorized` from the cloudscale API + +**Symptom:** controller logs show `401` from `api.cloudscale.ch`; `CloudscaleCluster` +stays `Ready: False` with an auth-related message. + +**Common causes:** + +- The credentials Secret is missing the `token` key, or the value is empty. +- `credentialsRef.namespace` points to a namespace that doesn't contain the + Secret (it defaults to the `CloudscaleCluster`'s own namespace if unset). +- The token was revoked or scoped read-only in the cloudscale.ch control panel. + +**Fix:** verify the Secret: + +```bash +kubectl get secret <secret-name> -o jsonpath='{.data.token}' | base64 -d +``` + +Re-create it with read/write scope if needed and let the controller requeue. + +## Image: server creation fails with "image not found" + +**Symptom:** `CloudscaleMachine` stuck `Ready: False`; cloudscale API returns +404 when creating the server. + +**Cause:** the value of `spec.image` (set via `CLOUDSCALE_MACHINE_IMAGE`) +doesn't match a custom image imported into your cloudscale.ch project. CAPCS +does not ship a public image. + +**Fix:** build and import an image with +[image-builder for OpenStack](https://image-builder.sigs.k8s.io/capi/providers/openstack) +and reference its exact name (typically `custom:<image-slug>`).
+ +## Network: cluster stuck Provisioning, CIDR overlap + +**Symptom:** workers are `Ready` but the control-plane load balancer never goes +healthy; pod-to-LB traffic from inside the cluster fails. + +**Cause:** the network CIDR set on `CloudscaleCluster.spec.networks[].cidr` +overlaps with the CNI's pod or service range. The default Cilium cluster-pool +range is `10.0.0.0/8`, so any network CIDR inside that range collides. + +**Verify:** Check the route table of the servers using `ip route`. + +**Fix:** keep the network CIDR outside the CNI's IPAM range. The default +template uses `172.18.0.0/24` for this reason. If you must use a different +range, reconfigure your CNI to match. + +## Network: wrong pre-existing network UUID + +**Symptom:** `CloudscaleCluster` rejected by the webhook, or accepted but +reconciliation fails with `network not found`. + +**Cause:** `CLOUDSCALE_NETWORK_UUID` doesn't exist in the cloudscale.ch project +the API token belongs to, or it exists in a different region. + +**Fix:** look up the network in the cloudscale.ch control panel, confirm region +matches `CloudscaleCluster.spec.region`, and update the UUID. + +## Load balancer stuck in `degraded` or `error` + +**Symptom:** `clusterctl describe` shows the LB condition as `degraded` or +`error`; the control plane endpoint is unreachable. + +**Cause:** the cloudscale LB has reported a non-running status. CAPCS does not +block reconciliation on `degraded`/`error` (it does block on `changing`), so +stale pool members will still be removed — but a persistent non-running status +points at an issue on the LB itself or its backends. + +**Fix:** check the LB in the cloudscale.ch control panel; verify pool members +correspond to live control plane machines on the expected port. If a control +plane Machine was deleted and replaced, give the reconciler a minute to drop +the old member, then re-check. 
+ +## Server group: cluster cannot scale beyond 4 nodes per pool + +**Symptom:** `MachineDeployment` scale-up stops at 4; new `CloudscaleMachine` +creation rejected by the cloudscale API. + +**Cause:** cloudscale.ch limits a server group to 4 servers. CAPCS places all +machines from one `CloudscaleMachineTemplate` into the server group named in +`spec.serverGroup.name` (if defined). + +**Fix:** split the workload across multiple `MachineDeployment`s, each +referencing a `CloudscaleMachineTemplate` with a distinct +`spec.serverGroup.name`. + +## Webhook rejection: `unknown flavor` + +**Symptom:** `kubectl apply` fails with a webhook validation error on +`spec.flavor` (or `spec.template.spec.flavor` for `CloudscaleMachineTemplate`). + +**Cause:** the webhook validates `flavor` against the live list of flavors +fetched from the cloudscale API. The value doesn't match any known flavor slug. + +**Fix:** list available flavors via the cloudscale API or control panel and pick +a slug that exists there. 
+ +## Webhook validation: common rejections + +Other validations that commonly trip people up: + +| Rejection | What it means | +|-------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------| +| `exactly one of uuid or cidr must be specified` on `spec.networks[*]` | Each network entry references either a pre-existing network (uuid) or a managed one (cidr) | +| `gateway must be within CIDR <cidr>` | `networks[*].gatewayAddress` is outside the network's own CIDR | +| `floating IPs cannot be attached to a load balancer with a private VIP` | Combine a public LB with a floating IP, or drop one of them | +| `exactly one of ipFamily or ip must be specified` on `floatingIP` | Set `ipFamily` to let CAPCS allocate, or `ip` to reuse a pre-existing floating IP | +| `field is immutable after cluster creation` | Most cloudscale-side topology fields (region, zone, networks, floating IP, etc.) cannot be changed once the cluster exists | +| `field is immutable` on `CloudscaleMachine.spec` | Most machine spec fields (flavor, image, server group, …) cannot be changed once the machine exists — recreate via `MachineDeployment` rollout instead | +| `CloudscaleClusterTemplate.Spec is immutable` | Override `quick-start` ClusterClass variables on the `Cluster` (`spec.topology.variables`) instead of mutating the `CloudscaleClusterTemplate` directly. | + +When in doubt, run `kubectl explain cloudscalecluster.spec.<field>` — the +generated CRDs carry the rules the webhook enforces. + +## `topology` flavor: cluster is admitted but never reconciles + +**Symptom:** `kubectl apply` of a `topology`-flavor manifest succeeds, but +`clusterctl describe cluster <cluster-name>` shows no progress and no `CloudscaleCluster` +object exists in the namespace.
+ +**Cause:** the `CLUSTER_TOPOLOGY` feature gate on cluster-api core is disabled, +so the topology controller never materializes infrastructure from the +`quick-start` ClusterClass. + +**Fix:** re-run `clusterctl init` with `CLUSTER_TOPOLOGY=true` exported. See the +upstream +[ClusterClass docs](https://cluster-api.sigs.k8s.io/tasks/experimental-features/cluster-class/) +for what the gate enables. diff --git a/go.mod b/go.mod index 277e8f4..6de8df8 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,14 @@ module github.com/cloudscale-ch/cluster-api-provider-cloudscale go 1.26.0 require ( - github.com/cloudscale-ch/cloudscale-go-sdk/v8 v8.0.0 + github.com/cloudscale-ch/cloudscale-go-sdk/v9 v9.0.0 github.com/go-logr/logr v1.4.3 github.com/onsi/ginkgo/v2 v2.29.0 github.com/onsi/gomega v1.41.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 golang.org/x/crypto v0.52.0 golang.org/x/oauth2 v0.36.0 golang.org/x/sync v0.20.0 @@ -122,12 +126,8 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 // indirect - go.opentelemetry.io/otel v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/sdk v1.43.0 // indirect - go.opentelemetry.io/otel/trace v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.28.0 // indirect @@ -160,3 +160,5 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.4.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +replace github.com/cloudscale-ch/cloudscale-go-sdk/v9 => 
../cloudscale-go-sdk diff --git a/go.sum b/go.sum index 69e4b7f..adcddd1 100644 --- a/go.sum +++ b/go.sum @@ -28,8 +28,6 @@ github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cloudscale-ch/cloudscale-go-sdk/v8 v8.0.0 h1:XP3thdgotNVpPF27568RYHt9kqosVm8eJznJ+X4PJIk= -github.com/cloudscale-ch/cloudscale-go-sdk/v8 v8.0.0/go.mod h1:H4qxiHJof+IdwvaV26ZcmNR39EyggnKIcDfLYcYnBCI= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= diff --git a/grafana/custom-metrics/config.yaml b/grafana/custom-metrics/config.yaml index 3ee1beb..3f4d9dd 100644 --- a/grafana/custom-metrics/config.yaml +++ b/grafana/custom-metrics/config.yaml @@ -1,15 +1,72 @@ --- -customMetrics: -# - metric: # Raw custom metric (required) -# type: # Metric type: counter/gauge/histogram (required) -# expr: # Prom_ql for the metric (optional) -# unit: # Unit of measurement, examples: s,none,bytes,percent,etc. (optional) +# Custom metrics consumed by the grafana.kubebuilder.io/v1-alpha plugin. 
+# Regenerate dashboards with: +# kubebuilder edit --plugins=grafana.kubebuilder.io/v1-alpha # +# Schema per entry: +# metric: raw Prometheus metric name (required) +# type: counter | gauge | histogram (required) +# expr: PromQL expression (optional; overrides plugin default) +# unit: unit of measurement (optional) # -# Example: -# --- -# customMetrics: -# - metric: foo_bar -# unit: none -# type: histogram -# expr: histogram_quantile(0.90, sum by(instance, le) (rate(foo_bar{job=\"$job\", namespace=\"$namespace\"}[5m]))) +# NOTE: label matchers use single quotes (e.g. {job='$job'}) because the +# kubebuilder grafana plugin substitutes `expr` into JSON without escaping +# inner double quotes, which would break the generated dashboard JSON. +customMetrics: + # ---- cloudscale-go-sdk: cloudscale.ch API ---- + + - metric: cloudscale_requests_total + type: counter + unit: reqps + expr: "sum by (endpoint, method, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace'}[5m]))" + + - metric: cloudscale_requests_total + type: counter + unit: reqps + expr: "sum by (endpoint, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace', status!~'2..|3..'}[5m]))" + + - metric: cloudscale_request_duration_seconds + type: histogram + unit: s + expr: "histogram_quantile(0.95, sum by (endpoint, le) (rate(cloudscale_request_duration_seconds_bucket{job='$job', namespace='$namespace'}[5m])))" + + - metric: cloudscale_in_flight_requests + type: gauge + unit: none + expr: "cloudscale_in_flight_requests{job='$job', namespace='$namespace'}" + + # ---- controller-runtime: reconciler health ---- + + - metric: controller_runtime_reconcile_time_seconds + type: histogram + unit: s + expr: "histogram_quantile(0.99, sum by (controller, le) (rate(controller_runtime_reconcile_time_seconds_bucket{job='$job', namespace='$namespace'}[5m])))" + + - metric: controller_runtime_terminal_reconcile_errors_total + type: counter + unit: none + expr: "sum by (controller) 
(rate(controller_runtime_terminal_reconcile_errors_total{job='$job', namespace='$namespace'}[5m]))" + + - metric: controller_runtime_reconcile_panics_total + type: counter + unit: none + expr: "sum by (controller) (rate(controller_runtime_reconcile_panics_total{job='$job', namespace='$namespace'}[5m]))" + + # ---- controller-runtime: admission webhooks ---- + + - metric: controller_runtime_webhook_requests_total + type: counter + unit: reqps + expr: "sum by (webhook, code) (rate(controller_runtime_webhook_requests_total{job='$job', namespace='$namespace'}[5m]))" + + - metric: controller_runtime_webhook_requests_in_flight + type: gauge + unit: none + expr: "controller_runtime_webhook_requests_in_flight{job='$job', namespace='$namespace'}" + + # ---- client-go: management-cluster apiserver traffic ---- + + - metric: rest_client_requests_total + type: counter + unit: reqps + expr: "sum by (code, method) (rate(rest_client_requests_total{job='$job', namespace='$namespace'}[5m]))" diff --git a/grafana/custom-metrics/custom-metrics-dashboard.json b/grafana/custom-metrics/custom-metrics-dashboard.json new file mode 100644 index 0000000..e5fa398 --- /dev/null +++ b/grafana/custom-metrics/custom-metrics-dashboard.json @@ -0,0 +1,995 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { +
"datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (endpoint, method, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_requests_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking":
{ + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (endpoint, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace', status!~'2..|3..'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_requests_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": 
"none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "histogram_quantile(0.95, sum by (endpoint, le) (rate(cloudscale_request_duration_seconds_bucket{job='$job', namespace='$namespace'}[5m])))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_request_duration_seconds (histogram)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "cloudscale_in_flight_requests{job='$job', namespace='$namespace'}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_in_flight_requests (gauge)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (controller, le) (rate(controller_runtime_reconcile_time_seconds_bucket{job='$job', namespace='$namespace'}[5m])))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_reconcile_time_seconds (histogram)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (controller) (rate(controller_runtime_terminal_reconcile_errors_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_terminal_reconcile_errors_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + 
"pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (controller) (rate(controller_runtime_reconcile_panics_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_reconcile_panics_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (webhook, code) (rate(controller_runtime_webhook_requests_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_webhook_requests_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "controller_runtime_webhook_requests_in_flight{job='$job', namespace='$namespace'}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_webhook_requests_in_flight (gauge)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + 
}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (code, method) (rate(rest_client_requests_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "rest_client_requests_total (counter)", + "type": "timeseries" + } + ], + "refresh": "", + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\"}, job)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\"}, job)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "observability", + "value": "observability" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_total, namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" 
+ }, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\"}, pod)", + "hide": 2, + "includeAll": true, + "label": "pod", + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\"}, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Custom-Metrics", + "weekStart": "" +} diff --git a/internal/cloudscale/client.go b/internal/cloudscale/client.go index 2438603..86e0900 100644 --- a/internal/cloudscale/client.go +++ b/internal/cloudscale/client.go @@ -25,7 +25,7 @@ import ( "strings" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "golang.org/x/oauth2" ) @@ -87,10 +87,14 @@ func NewTransport() *http.Transport { // and reused across clients. Each client gets its own oauth2 token injection // but shares the underlying connection pool. // +// version is appended to the SDK's User-Agent header (e.g. +// "cloudscale/v9.0.0 capcs/<version>") so the API server can identify +// the controller making the call. +// // No global HTTP timeout is set on the client. Instead, callers must use // context.WithTimeout with ReadTimeout, WriteTimeout, or DeleteTimeout // for each API call.
-func NewClient(token string, transport *http.Transport) *Client { +func NewClient(token, version string, transport http.RoundTripper) *Client { tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token}) httpClient := &http.Client{ @@ -100,6 +104,7 @@ func NewClient(token string, transport *http.Transport) *Client { }, } sdkClient := cloudscalesdk.NewClient(httpClient) + sdkClient.UserAgent = sdkClient.UserAgent + " capcs/" + version return &Client{ Networks: sdkClient.Networks, diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go index 865c0a2..5dc9e16 100644 --- a/internal/cloudscale/client_test.go +++ b/internal/cloudscale/client_test.go @@ -22,7 +22,7 @@ import ( "os" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" ) diff --git a/internal/cloudscale/flavors.go b/internal/cloudscale/flavors.go index 1120c3d..9776baf 100644 --- a/internal/cloudscale/flavors.go +++ b/internal/cloudscale/flavors.go @@ -3,7 +3,7 @@ package cloudscale import ( "fmt" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) diff --git a/internal/cloudscale/flavors_test.go b/internal/cloudscale/flavors_test.go index b954017..5397635 100644 --- a/internal/cloudscale/flavors_test.go +++ b/internal/cloudscale/flavors_test.go @@ -3,7 +3,7 @@ package cloudscale import ( "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" diff --git a/internal/cloudscale/regions.go b/internal/cloudscale/regions.go index e70ad0d..e0e891b 100644 --- a/internal/cloudscale/regions.go +++ b/internal/cloudscale/regions.go @@ -17,7 +17,7 @@ limitations under the License. package cloudscale import ( - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" ) // RegionInfo holds cloudscale.ch region and zone information for validation. diff --git a/internal/cloudscale/regions_test.go b/internal/cloudscale/regions_test.go index d6f634f..bbeeeed 100644 --- a/internal/cloudscale/regions_test.go +++ b/internal/cloudscale/regions_test.go @@ -19,7 +19,7 @@ package cloudscale import ( "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" ) diff --git a/internal/cloudscale/services.go b/internal/cloudscale/services.go index e6290a0..464d0f5 100644 --- a/internal/cloudscale/services.go +++ b/internal/cloudscale/services.go @@ -19,7 +19,7 @@ package cloudscale import ( "context" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" ) type NetworkService interface { diff --git a/internal/controller/cloudscale_services.go b/internal/controller/cloudscale_services.go index 664b571..e7b6062 100644 --- a/internal/controller/cloudscale_services.go +++ b/internal/controller/cloudscale_services.go @@ -4,7 +4,7 @@ import ( "context" "fmt" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" diff --git a/internal/controller/cloudscale_services_test.go 
b/internal/controller/cloudscale_services_test.go index df601bd..b105f4c 100644 --- a/internal/controller/cloudscale_services_test.go +++ b/internal/controller/cloudscale_services_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "github.com/go-logr/logr" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" diff --git a/internal/controller/cloudscale_tags.go b/internal/controller/cloudscale_tags.go index 2323711..3edffa3 100644 --- a/internal/controller/cloudscale_tags.go +++ b/internal/controller/cloudscale_tags.go @@ -1,7 +1,7 @@ package controller import ( - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" ) diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index 92acf1a..5d7da24 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -23,6 +23,7 @@ import ( "net/http" "time" + "go.opentelemetry.io/otel/attribute" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -38,11 +39,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" - logf "sigs.k8s.io/controller-runtime/pkg/log" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/credentials" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" 
"github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -52,7 +53,8 @@ type CloudscaleClusterReconciler struct { Scheme *runtime.Scheme recorder events.EventRecorder WatchFilter string - Transport *http.Transport + Transport http.RoundTripper + Version string MaxConcurrentReconciles int } @@ -69,7 +71,12 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) defer cancel() - logger := logf.FromContext(ctx) + ctx, logger, done := observability.StartSpanWithLogger(ctx, + "controllers.CloudscaleClusterReconciler.Reconcile", + attribute.String("namespace", req.Namespace), + attribute.String("name", req.Name), + ) + defer done() cloudscaleCluster := &infrastructurev1beta2.CloudscaleCluster{} if err := r.Get(ctx, req.NamespacedName, cloudscaleCluster); err != nil { @@ -108,7 +115,7 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token, r.Transport) + cloudscaleClient := cloudscale.NewClient(token, r.Version, r.Transport) clusterScope, err := scope.NewClusterScope(scope.ClusterScopeParams{ Client: r.Client, @@ -144,7 +151,10 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re // reconcileNormal handles normal reconciliation of cloudscale infrastructure. func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { - clusterScope.Info("Reconciling CloudscaleCluster") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileNormal") + defer done() + + logger.Info("Reconciling CloudscaleCluster") // update ready conditions upon returning from this function based on updated clusterScope. 
defer r.setReadyCondition(clusterScope) @@ -187,7 +197,10 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust // //nolint:unparam // Returns ctrl.Result for consistency with reconcile pattern func (r *CloudscaleClusterReconciler) reconcileDelete(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { - clusterScope.Info("Reconciling CloudscaleCluster deletion") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileDelete") + defer done() + + logger.Info("Reconciling CloudscaleCluster deletion") // Set Deleting condition r.setCondition(clusterScope, infrastructurev1beta2.DeletingCondition, metav1.ConditionTrue, infrastructurev1beta2.DeletingReason, "Deleting infrastructure resources") diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go index 0acb95f..12a561e 100644 --- a/internal/controller/cloudscalecluster_floatingip.go +++ b/internal/controller/cloudscalecluster_floatingip.go @@ -21,7 +21,7 @@ import ( "fmt" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -31,6 +31,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -39,6 +40,11 @@ const createFloatingIPTimeoutRequeueAfter = 5 * time.Second // reconcileFloatingIP ensures the floating IP exists and is assigned to the correct target. // When no floating IP is configured, this sets the condition to true and returns. 
func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileFloatingIP") + defer done() + + logger.Info("Reconciling floating IP") + fipSpec := clusterScope.CloudscaleCluster.Spec.FloatingIP if fipSpec == nil { r.setCondition(clusterScope, infrastructurev1beta2.FloatingIPReadyCondition, metav1.ConditionTrue, infrastructurev1beta2.FloatingIPDisabledReason, "") diff --git a/internal/controller/cloudscalecluster_floatingip_test.go b/internal/controller/cloudscalecluster_floatingip_test.go index 38f1cb6..e497f67 100644 --- a/internal/controller/cloudscalecluster_floatingip_test.go +++ b/internal/controller/cloudscalecluster_floatingip_test.go @@ -23,7 +23,7 @@ import ( "os" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index b7f4ab9..c9cfc80 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -23,7 +23,7 @@ import ( "slices" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -33,6 +33,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -48,6 +49,11 @@ const ( // It also sets the control plane endpoint from the load balancer's VIP address. // When the load balancer is disabled (external control plane), this function returns immediately. 
func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, clusterScope *scope.ClusterScope) (result ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileLoadBalancer") + defer done() + + logger.Info("Reconciling load balancer") + // LB disabled: set condition and return before defer is registered if !ptr.Deref(clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.Enabled, true) { clusterScope.Info("Load balancer is disabled, skipping reconciliation (external control plane)") diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index fdd4100..182c0d7 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -21,7 +21,7 @@ import ( "testing" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 3f9fb08..d4e4a3f 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -23,13 +23,14 @@ import ( "strings" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -38,6 +39,11 @@ const createNetworkTimeoutRequeueAfter = 5 * time.Second // reconcileNetwork orchestrates network and subnet provisioning for all networks // defined in spec.networks. A single NetworkReadyCondition covers all networks. 
func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileNetwork") + defer done() + + logger.Info("Reconciling network") + defer func() { if reterr != nil { r.setCondition(clusterScope, infrastructurev1beta2.NetworkReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.NetworkErrorReason, reterr.Error()) diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go index b99e1b2..513c475 100644 --- a/internal/controller/cloudscalecluster_network_test.go +++ b/internal/controller/cloudscalecluster_network_test.go @@ -23,7 +23,7 @@ import ( "os" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index ff8dc16..b89a0aa 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -27,7 +27,7 @@ import ( "context" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/controller/cloudscalecluster_servergroup.go b/internal/controller/cloudscalecluster_servergroup.go index 3f480e9..300c60d 100644 --- a/internal/controller/cloudscalecluster_servergroup.go +++ b/internal/controller/cloudscalecluster_servergroup.go @@ -21,7 +21,7 @@ import ( "errors" "fmt" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "k8s.io/apimachinery/pkg/util/sets" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/controller-runtime/pkg/client" diff --git a/internal/controller/cloudscalecluster_servergroup_test.go b/internal/controller/cloudscalecluster_servergroup_test.go index 0e5e16e..a73e643 100644 --- a/internal/controller/cloudscalecluster_servergroup_test.go +++ b/internal/controller/cloudscalecluster_servergroup_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" diff --git a/internal/controller/cloudscalemachine_controller.go b/internal/controller/cloudscalemachine_controller.go index ac141e9..f758585 100644 --- a/internal/controller/cloudscalemachine_controller.go +++ b/internal/controller/cloudscalemachine_controller.go @@ -22,6 +22,7 @@ import ( "net/http" "time" + "go.opentelemetry.io/otel/attribute" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -37,11 +38,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" - logf "sigs.k8s.io/controller-runtime/pkg/log" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/credentials" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -56,7 +57,8 @@ type CloudscaleMachineReconciler struct { Scheme *runtime.Scheme recorder events.EventRecorder WatchFilter string - Transport *http.Transport + Transport http.RoundTripper + Version string MaxConcurrentReconciles int } @@ -71,7 +73,12 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) defer cancel() - logger := logf.FromContext(ctx) + ctx, logger, done := observability.StartSpanWithLogger(ctx, + "controllers.CloudscaleMachineReconciler.Reconcile", + attribute.String("namespace", req.Namespace), + attribute.String("name", req.Name), + ) + defer done() cloudscaleMachine := &infrastructurev1beta2.CloudscaleMachine{} if err := r.Get(ctx, req.NamespacedName, 
cloudscaleMachine); err != nil { @@ -134,7 +141,7 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token, r.Transport) + cloudscaleClient := cloudscale.NewClient(token, r.Version, r.Transport) machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ Client: r.Client, @@ -172,7 +179,10 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re // reconcileNormal handles normal reconciliation of CloudscaleMachine. func (r *CloudscaleMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope) (ctrl.Result, error) { - machineScope.Info("Reconciling CloudscaleMachine") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileNormal") + defer done() + + logger.Info("Reconciling CloudscaleMachine") defer r.setReadyCondition(machineScope.CloudscaleMachine) @@ -242,7 +252,10 @@ func (r *CloudscaleMachineReconciler) setReadyCondition(machine *infrastructurev // //nolint:unparam // Returns ctrl.Result for consistency with reconcile pattern func (r *CloudscaleMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope) (ctrl.Result, error) { - machineScope.Info("Reconciling CloudscaleMachine deletion") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileDelete") + defer done() + + logger.Info("Reconciling CloudscaleMachine deletion") // Set Deleting condition r.setCondition(machineScope.CloudscaleMachine, infrastructurev1beta2.DeletingCondition, metav1.ConditionTrue, infrastructurev1beta2.DeletingReason, "Deleting server") diff --git a/internal/controller/cloudscalemachine_reconcile_test.go b/internal/controller/cloudscalemachine_reconcile_test.go index 9865775..cc9bb17 100644 --- 
a/internal/controller/cloudscalemachine_reconcile_test.go +++ b/internal/controller/cloudscalemachine_reconcile_test.go @@ -27,7 +27,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" diff --git a/internal/controller/cloudscalemachine_server.go b/internal/controller/cloudscalemachine_server.go index ce307b6..7c1353c 100644 --- a/internal/controller/cloudscalemachine_server.go +++ b/internal/controller/cloudscalemachine_server.go @@ -22,7 +22,7 @@ import ( "maps" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -31,6 +31,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -69,6 +70,11 @@ const ( ) func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileServer") + defer done() + + logger.Info("Reconciling server") + var server *cloudscalesdk.Server defer func() { if reterr != nil { diff --git a/internal/controller/cloudscalemachine_server_test.go b/internal/controller/cloudscalemachine_server_test.go index 7923112..955c242 100644 --- a/internal/controller/cloudscalemachine_server_test.go +++ b/internal/controller/cloudscalemachine_server_test.go @@ -20,7 +20,7 @@ import ( "context" "testing" - 
cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" diff --git a/internal/controller/cloudscalemachine_servergroup.go b/internal/controller/cloudscalemachine_servergroup.go index b2dcfce..e32a6fa 100644 --- a/internal/controller/cloudscalemachine_servergroup.go +++ b/internal/controller/cloudscalemachine_servergroup.go @@ -22,12 +22,13 @@ import ( "sync" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -40,6 +41,11 @@ var serverGroupMu sync.Mutex // reconcileServerGroup ensures the server group exists if specified. // Server groups are zone-scoped and created once per unique name+zone combination. 
func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileServerGroup") + defer done() + + logger.Info("Reconciling server group") + defer func() { if reterr != nil { r.setCondition(machineScope.CloudscaleMachine, infrastructurev1beta2.ServerGroupReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.ServerGroupErrorReason, reterr.Error()) diff --git a/internal/controller/cloudscalemachine_servergroup_test.go b/internal/controller/cloudscalemachine_servergroup_test.go index 0a3aa62..9e5c0c1 100644 --- a/internal/controller/cloudscalemachine_servergroup_test.go +++ b/internal/controller/cloudscalemachine_servergroup_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" diff --git a/internal/observability/composite_logger.go b/internal/observability/composite_logger.go new file mode 100644 index 0000000..74301ee --- /dev/null +++ b/internal/observability/composite_logger.go @@ -0,0 +1,81 @@ +/* +Copyright 2026 cloudscale.ch. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package observability + +import "github.com/go-logr/logr" + +// compositeLogSink is a logr.LogSink that multiplexes calls to multiple +// underlying sinks. +type compositeLogSink struct { + sinks []logr.LogSink +} + +// Init implements logr.LogSink. +func (c *compositeLogSink) Init(info logr.RuntimeInfo) { + for _, s := range c.sinks { + s.Init(info) + } +} + +// Enabled implements logr.LogSink. It returns true if any underlying sink is +// enabled. +func (c *compositeLogSink) Enabled(level int) bool { + for _, s := range c.sinks { + if s.Enabled(level) { + return true + } + } + return false +} + +// Info implements logr.LogSink. +func (c *compositeLogSink) Info(level int, msg string, keysAndValues ...any) { + for _, s := range c.sinks { + s.Info(level, msg, keysAndValues...) + } +} + +// Error implements logr.LogSink. +func (c *compositeLogSink) Error(err error, msg string, keysAndValues ...any) { + for _, s := range c.sinks { + s.Error(err, msg, keysAndValues...) + } +} + +// WithValues implements logr.LogSink. +func (c *compositeLogSink) WithValues(keysAndValues ...any) logr.LogSink { + newSinks := make([]logr.LogSink, len(c.sinks)) + for i, s := range c.sinks { + newSinks[i] = s.WithValues(keysAndValues...) + } + return &compositeLogSink{sinks: newSinks} +} + +// WithName implements logr.LogSink. +func (c *compositeLogSink) WithName(name string) logr.LogSink { + newSinks := make([]logr.LogSink, len(c.sinks)) + for i, s := range c.sinks { + newSinks[i] = s.WithName(name) + } + return &compositeLogSink{sinks: newSinks} +} + +// NewCompositeLogger returns a LogSink that forwards calls to all provided +// sinks. +func NewCompositeLogger(sinks ...logr.LogSink) logr.LogSink { + return &compositeLogSink{sinks: sinks} +} diff --git a/internal/observability/span.go b/internal/observability/span.go new file mode 100644 index 0000000..e9e543b --- /dev/null +++ b/internal/observability/span.go @@ -0,0 +1,46 @@ +/* +Copyright 2026 cloudscale.ch. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package observability + +import ( + "context" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// StartSpanWithLogger starts a new OTel span and returns a context, logger, and +// done function. The returned logger is a composite that writes to both the +// standard logger and the span as events. +func StartSpanWithLogger( + ctx context.Context, + spanName string, + attrs ...attribute.KeyValue, +) (context.Context, logr.Logger, func()) { + tracer := otel.Tracer("capcs") + ctx, span := tracer.Start(ctx, spanName, trace.WithAttributes(attrs...)) + + baseLogger := logf.FromContext(ctx) + sink := NewCompositeLogger(baseLogger.GetSink(), NewSpanLogSink(span)) + logger := logr.New(sink).WithName(spanName) + ctx = logr.NewContext(ctx, logger) + + return ctx, logger, func() { span.End() } +} diff --git a/internal/observability/span_logger.go b/internal/observability/span_logger.go new file mode 100644 index 0000000..cb1ebb6 --- /dev/null +++ b/internal/observability/span_logger.go @@ -0,0 +1,93 @@ +/* +Copyright 2026 cloudscale.ch. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package observability + +import ( + "fmt" + "time" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" +) + +// spanLogSink is a logr.LogSink implementation that writes log data to an +// OpenTelemetry span as events. +type spanLogSink struct { + span trace.Span + name string + vals []any +} + +// Init implements logr.LogSink. +func (s *spanLogSink) Init(_ logr.RuntimeInfo) {} + +// Enabled implements logr.LogSink. +func (s *spanLogSink) Enabled(_ int) bool { return true } + +// Info implements logr.LogSink, writing an event to the span. +func (s *spanLogSink) Info(_ int, msg string, keysAndValues ...any) { + attrs := kvsToAttrs(append(s.vals, keysAndValues...)...) + s.span.AddEvent( + fmt.Sprintf("[INFO | %s] %s", s.name, msg), + trace.WithTimestamp(time.Now()), + trace.WithAttributes(attrs...), + ) +} + +// Error implements logr.LogSink, recording the error and writing an event to +// the span. +func (s *spanLogSink) Error(err error, msg string, keysAndValues ...any) { + attrs := kvsToAttrs(append(s.vals, keysAndValues...)...) + s.span.RecordError(err) + s.span.AddEvent( + fmt.Sprintf("[ERROR | %s] %s (%s)", s.name, msg, err), + trace.WithTimestamp(time.Now()), + trace.WithAttributes(attrs...), + ) +} + +// WithValues implements logr.LogSink. +func (s spanLogSink) WithValues(keysAndValues ...any) logr.LogSink { + vals := make([]any, len(s.vals)+len(keysAndValues)) + copy(vals, s.vals) + copy(vals[len(s.vals):], keysAndValues) + s.vals = vals + return &s +} + +// WithName implements logr.LogSink. 
+func (s spanLogSink) WithName(name string) logr.LogSink { + s.name = name + return &s +} + +// NewSpanLogSink returns a LogSink that writes log events to the given span. +func NewSpanLogSink(span trace.Span) logr.LogSink { + return &spanLogSink{span: span} +} + +// kvsToAttrs converts key-value pairs (from a logr call) to OTel attributes. +func kvsToAttrs(kvs ...any) []attribute.KeyValue { + var attrs []attribute.KeyValue + for i := 0; i+1 < len(kvs); i += 2 { + k := fmt.Sprint(kvs[i]) + v := fmt.Sprint(kvs[i+1]) + attrs = append(attrs, attribute.String(k, v)) + } + return attrs +} diff --git a/internal/observability/tracing.go b/internal/observability/tracing.go new file mode 100644 index 0000000..b3e7d50 --- /dev/null +++ b/internal/observability/tracing.go @@ -0,0 +1,78 @@ +/* +Copyright 2026 cloudscale.ch. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package observability + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.4.0" +) + +// InitTracing initializes an OpenTelemetry tracer provider with an OTLP gRPC +// exporter. 
The OTLP endpoint is read from the OTEL_EXPORTER_OTLP_ENDPOINT +// environment variable (defaults to localhost:4317 if unset). +func InitTracing(ctx context.Context, log logr.Logger, serviceName, version string, sampleRate float64) (func(), error) { + if sampleRate < 0.0 || sampleRate > 1.0 { + return nil, fmt.Errorf("tracing-sample-rate must be between 0.0 and 1.0, got %f", sampleRate) + } + + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String(serviceName), + attribute.String("version", version), + ), + ) + if err != nil { + return nil, fmt.Errorf("failed to create opentelemetry resource: %w", err) + } + + exporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithInsecure()) + if err != nil { + return nil, fmt.Errorf("failed to create OTLP trace exporter: %w", err) + } + + sampler := sdktrace.AlwaysSample() + if sampleRate < 1.0 { + sampler = sdktrace.ParentBased(sdktrace.TraceIDRatioBased(sampleRate)) + } + + tp := sdktrace.NewTracerProvider( + sdktrace.WithSampler(sampler), + sdktrace.WithResource(res), + sdktrace.WithBatcher(exporter), + ) + otel.SetTracerProvider(tp) + otel.SetTextMapPropagator(propagation.TraceContext{}) + + shutdown := func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := tp.Shutdown(ctx); err != nil { + log.Error(err, "Failed to shut down tracer provider") + } + } + return shutdown, nil +} diff --git a/internal/testutils/fixtures.go b/internal/testutils/fixtures.go index 0f589ca..4803137 100644 --- a/internal/testutils/fixtures.go +++ b/internal/testutils/fixtures.go @@ -1,7 +1,7 @@ package testutils import ( - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/testutils/mocks.go b/internal/testutils/mocks.go index 0eb80b5..ae2d1c2 100644 --- 
a/internal/testutils/mocks.go +++ b/internal/testutils/mocks.go @@ -4,7 +4,7 @@ import ( "context" "errors" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" ) // --- Network Service Mock --- diff --git a/test/e2e/cloudscale_helpers.go b/test/e2e/cloudscale_helpers.go index f46a973..ac86377 100644 --- a/test/e2e/cloudscale_helpers.go +++ b/test/e2e/cloudscale_helpers.go @@ -28,7 +28,7 @@ import ( // newCloudscaleClient creates a new cloudscale API client from the given token. func newCloudscaleClient(token string) *cloudscale.Client { - return cloudscale.NewClient(token, cloudscale.NewTransport()) + return cloudscale.NewClient(token, "e2e", cloudscale.NewTransport()) } // resourceSnapshot holds a snapshot of cloudscale API resources for leak detection.