diff --git a/Cargo.lock b/Cargo.lock index 2bda6f78364e9..733b560b8bbe5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -212,6 +212,15 @@ dependencies = [ "derive_arbitrary", ] +[[package]] +name = "arc-swap" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +dependencies = [ + "rustversion", +] + [[package]] name = "archery" version = "1.2.2" @@ -1277,6 +1286,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-server" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1df331683d982a0b9492b38127151e6453639cd34926eb9c07d4cd8c6d22bfc" +dependencies = [ + "arc-swap", + "bytes", + "either", + "fs-err", + "http 1.4.0", + "http-body 1.0.1", + "hyper 1.9.0", + "hyper-util", + "openssl", + "openssl-sys", + "pin-project-lite", + "tokio", + "tokio-openssl", + "tower-service", +] + [[package]] name = "azure_core" version = "0.21.0" @@ -3530,6 +3561,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "fs-err" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73fde052dbfc920003cfd2c8e2c6e6d4cc7c1091538c3a24226cec0665ab08c0" +dependencies = [ + "autocfg", + "tokio", +] + [[package]] name = "fs_extra" version = "1.3.0" @@ -5974,6 +6015,7 @@ dependencies = [ "semver", "serde", "serde_json", + "sha2", "tracing", "uuid", ] @@ -7020,6 +7062,7 @@ dependencies = [ "anyhow", "async-trait", "axum", + "axum-server", "clap", "futures", "http 1.4.0", diff --git a/Cargo.toml b/Cargo.toml index d4305b703d716..1ea8b765b2230 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -290,6 +290,7 @@ aws-smithy-types = { version = "1.1.8", features = ["byte-stream-poll-next"] } aws-types = "1.3.9" axum = { version = "0.8.8", features = ["ws"] } axum-extra = { version = "0.12.5", features = ["typed-header"] } +axum-server = { version = "0.8.0", features = ["tls-openssl"] } azure_core = "0.21.0" 
azure_identity = "0.21.0" azure_storage = "0.21.0" diff --git a/bin/bump-version b/bin/bump-version index 1b6379f4c428e..96050379d276f 100755 --- a/bin/bump-version +++ b/bin/bump-version @@ -70,9 +70,11 @@ rm -f src/{clusterd,environmentd,materialized,persist-client,testdrive,catalog-d cargo update --workspace -crd_descriptions_json=doc/user/data/self_managed/materialize_crd_descriptions.json -cargo run -p mz-cloud-resources --bin crd-writer > "${crd_descriptions_json}" -git add "${crd_descriptions_json}" +for crd_version in v1alpha1 v1; do + crd_descriptions_json="doc/user/data/self_managed/materialize_crd_descriptions_${crd_version}.json" + cargo run -p mz-cloud-resources --bin crd-writer -- "${crd_version}" > "${crd_descriptions_json}" + git add "${crd_descriptions_json}" +done bin/helm-chart-version-bump --bump-orchestratord-version "v$version" diff --git a/ci/nightly/pipeline.template.yml b/ci/nightly/pipeline.template.yml index 4a9578693b991..05d63542f9e47 100644 --- a/ci/nightly/pipeline.template.yml +++ b/ci/nightly/pipeline.template.yml @@ -2439,6 +2439,32 @@ steps: agents: queue: hetzner-aarch64-16cpu-32gb + - id: orchestratord-v1-opt-in + label: "Orchestratord v1 opt-in tests" + artifact_paths: ["mz_debug_*.zip"] + depends_on: devel-docker-tags + timeout_in_minutes: 120 + plugins: + - ./ci/plugins/mzcompose: + composition: orchestratord + run: v1-opt-in + ci-builder: stable + agents: + queue: hetzner-aarch64-16cpu-32gb + + - id: orchestratord-manually-promote + label: "Orchestratord ManuallyPromote tests" + artifact_paths: ["mz_debug_*.zip"] + depends_on: devel-docker-tags + timeout_in_minutes: 120 + plugins: + - ./ci/plugins/mzcompose: + composition: orchestratord + run: manually-promote + ci-builder: stable + agents: + queue: hetzner-aarch64-16cpu-32gb + - id: emulator label: Materialize Emulator depends_on: build-aarch64 diff --git a/doc/developer/design/20260209_simplified_rollout_triggers_and_crd.md 
b/doc/developer/design/20260209_simplified_rollout_triggers_and_crd.md index 35fc80bbfe7fa..5ae76fa79862c 100644 --- a/doc/developer/design/20260209_simplified_rollout_triggers_and_crd.md +++ b/doc/developer/design/20260209_simplified_rollout_triggers_and_crd.md @@ -19,7 +19,7 @@ Additionally, the current system is difficult to automate when faced with evicti 1. **Automatic rollout detection**: The system should automatically detect when a rollout is needed based on spec changes, without requiring users to manually set a UUID. -2. **Seamless version migration**: Existing v1alpha1 resources should continue to work, with automatic conversion to v1alpha2 as needed. +2. **Seamless version migration**: Existing v1alpha1 resources should continue to work, with automatic conversion to v1 as needed. 3. **Terraform compatibility**: Configuration must not fight with infrastructure as code tools such as Terraform. @@ -34,9 +34,9 @@ Additionally, the current system is difficult to automate when faced with evicti ## Solution Proposal -### 1. New CRD Version: v1alpha2 +### 1. New CRD Version: v1 -Introduce a new `v1alpha2` version of the Materialize CRD with the following changes: +Introduce a new `v1` version of the Materialize CRD with the following changes: **Spec changes:** - Remove `requestRollout` (`Uuid`) - Rollouts are now triggered automatically when the spec hash changes. @@ -122,14 +122,14 @@ A new HTTPS webhook server handles CRD version conversion: **Endpoint:** `POST /convert` **Supported conversions:** -- v1alpha1 -> v1alpha2 -- v1alpha2 -> v1alpha1\* +- v1alpha1 -> v1 +- v1 -> v1alpha1\* \*The API server seemed to want this, I don't know why. We can't reconcile these, so going back never makes sense. **Key conversion logic:** -###### v1alpha1 to v1alpha2: +###### v1alpha1 to v1: - Spec fields: - `forcePromote: Uuid` becomes `forcePromote: Option` (nil UUID becomes None) - `requestRollout` is removed. 
@@ -144,52 +144,52 @@ A new HTTPS webhook server handles CRD version conversion: - If we are already in "promoting" status, we should unconditionally complete the promotion for the current rollout rather than destroying and replacing it. This may trigger an additional rollout this one time, but I don't know any way around that. I think this is acceptable given the user is doing something very weird by updating orchestratord mid-rollout. -###### v1alpha2 to v1alpha1: +###### v1 to v1alpha1: -We need to include the `lastCompletedRolloutHash` from v1alpha2 in v1alpha1 as well. This is required for round tripping from v1alpha2 -> v1alpha1 -> v1alpha2, -which may happen if a user applies a v1alpha1 change over a v1alpha2 object. +We need to include the `lastCompletedRolloutHash` from v1 in v1alpha1 as well. This is required for round tripping from v1 -> v1alpha1 -> v1, +which may happen if a user applies a v1alpha1 change over a v1 object. -In the case there is an existing `lastCompletedRolloutHash`, it should be kept as-is through the round trip. As we never reconcile with v1alpha1, it should only change at v1alpha2, so this should be safe. +In the case there is an existing `lastCompletedRolloutHash`, it should be kept as-is through the round trip. As we never reconcile with v1alpha1, it should only change at v1, so this should be safe. -No attempt is made to support v1alpha1 beyond giving a valid v1alpha1 structure and supporting round tripping to v1alpha2. Fields that do not exist in v1alpha2 may have their nil value. +No attempt is made to support v1alpha1 beyond giving a valid v1alpha1 structure and supporting round tripping to v1. Fields that do not exist in v1 may have their nil value. ##### Example round trips -In these examples, we assume that orchestratord's attempt to update the stored version succeeds and that reconciliation is triggered after this update. This is only to simplify this document, and is not necessary for correctness. 
If orchestratord's attempt to update the stored version fails, or the reconciliation is triggered first, the conversion webhook is simply called at that time and we will reconcile the same v1alpha2 object. +In these examples, we assume that orchestratord's attempt to update the stored version succeeds and that reconciliation is triggered after this update. This is only to simplify this document, and is not necessary for correctness. If orchestratord's attempt to update the stored version fails, or the reconciliation is triggered first, the conversion webhook is simply called at that time and we will reconcile the same v1 object. ###### Simplest case 1. There is a stored v1alpha1 Materialize resource, not actively rolling out, with both `status.lastCompletedRolloutRequest` and `spec.requestRollout` matching. -1. Orchestratord gets updated to a version with v1alpha2 support. -1. Orchestratord lists existing v1alpha1 resources on startup, in order to upgrade them to v1alpha2. - 1. The API server calls the conversion webhook, which returns a v1alpha2 resource. In this case, it would have `status.lastCompletedRolloutHash` and `status.requestedRolloutHash` set to the same calculated hash after conversion. -1. Orchestratord calls `replace` to store the resource as v1alpha2. -1. Orchestratord gets notified of the new v1alpha2 resource, but determines there is nothing to do. +1. Orchestratord gets updated to a version with v1 support. +1. Orchestratord lists existing v1alpha1 resources on startup, in order to upgrade them to v1. + 1. The API server calls the conversion webhook, which returns a v1 resource. In this case, it would have `status.lastCompletedRolloutHash` and `status.requestedRolloutHash` set to the same calculated hash after conversion. +1. Orchestratord calls `replace` to store the resource as v1. +1. Orchestratord gets notified of the new v1 resource, but determines there is nothing to do. 
-At this point, the stored version is v1alpha2, and no rollout is triggered. +At this point, the stored version is v1, and no rollout is triggered. 1. The user then applies a v1alpha1 resource. It contains some change that affects the hash (ie: `spec.environmentd_image_ref`). It may or may not include `spec.requestRollout`, that doesn't matter. -1. Before storing this change, the API server calls the conversion webhook, which returns a v1alpha2 resource. In this case, it should not contain a status, as the user applied v1alpha1 resource did not contain a status (TODO verify this). -1. Orchestratord gets notified of the new v1alpha2 resource, which contains the old status not yet updated after the applied v1alpha1 resource. This means the `status.lastCompletedRolloutHash` and `status.requestedRolloutHash` still match each other, but do not match the calculated hash. +1. Before storing this change, the API server calls the conversion webhook, which returns a v1 resource. In this case, it should not contain a status, as the user applied v1alpha1 resource did not contain a status (TODO verify this). +1. Orchestratord gets notified of the new v1 resource, which contains the old status not yet updated after the applied v1alpha1 resource. This means the `status.lastCompletedRolloutHash` and `status.requestedRolloutHash` still match each other, but do not match the calculated hash. 1. Orchestratord reconciles like normal, calculating a new `status.requestedRolloutHash` and triggering a rollout since it is different. -If the user had instead applied a v1alpha2 resource instead, no conversion would be needed and orchestratord would reconcile it directly. +If the user had instead applied a v1 resource, no conversion would be needed and orchestratord would reconcile it directly. ###### Existing v1alpha1 resource is mid-upgrade, but not promoting 1. 
There is a stored v1alpha1 Materialize resource, actively rolling out, with `status.lastCompletedRolloutRequest` and `spec.requestRollout` not matching. It is not in "promoting" status. -1. Orchestratord gets updated to a version with v1alpha2 support. -1. Orchestratord lists existing v1alpha1 resources on startup, in order to upgrade them to v1alpha2. - 1. The API server calls the conversion webhook, which returns a v1alpha2 resource. In this case, it would have `status.lastCompletedRolloutHash` set to `None` and `status.requestedRolloutHash` set to the calculated hash after conversion. -1. Orchestratord calls `replace` to store the resource as v1alpha2. -1. Orchestratord gets notified of the new v1alpha2 resource. +1. Orchestratord gets updated to a version with v1 support. +1. Orchestratord lists existing v1alpha1 resources on startup, in order to upgrade them to v1. + 1. The API server calls the conversion webhook, which returns a v1 resource. In this case, it would have `status.lastCompletedRolloutHash` set to `None` and `status.requestedRolloutHash` set to the calculated hash after conversion. +1. Orchestratord calls `replace` to store the resource as v1. +1. Orchestratord gets notified of the new v1 resource. 1. Orchestratord reconciles like normal, continuing the existing rollout and overwriting any objects that are different. This is the same behavior it would have with current orchestratord and v1alpha1. ###### Existing v1alpha1 resource is mid-upgrade and already promoting 1. There is a stored v1alpha1 Materialize resource, actively rolling out, with `status.lastCompletedRolloutRequest` and `spec.requestRollout` not matching. It is in "promoting" status. -1. Orchestratord gets updated to a version with v1alpha2 support. -1. Orchestratord lists existing v1alpha1 resources on startup, in order to upgrade them to v1alpha2. - 1. The API server calls the conversion webhook, which returns a v1alpha2 resource. 
In this case, it would have `status.lastCompletedRolloutHash` set to `None` and `status.requestedRolloutHash` set to the calculated hash after conversion. -1. Orchestratord calls `replace` to store the resource as v1alpha2. -1. Orchestratord gets notified of the new v1alpha2 resource. +1. Orchestratord gets updated to a version with v1 support. +1. Orchestratord lists existing v1alpha1 resources on startup, in order to upgrade them to v1. + 1. The API server calls the conversion webhook, which returns a v1 resource. In this case, it would have `status.lastCompletedRolloutHash` set to `None` and `status.requestedRolloutHash` set to the calculated hash after conversion. +1. Orchestratord calls `replace` to store the resource as v1. +1. Orchestratord gets notified of the new v1 resource. 1. Orchestratord reconciles like normal. Critically, it unconditionally continues with promotion rather than overwriting any objects. 1. After promotion is successful, the updated status triggers a new rollout. (TODO verify that this works if we have a `status.requestedRolloutHash` set in the initial conversion) @@ -216,8 +216,8 @@ Orchestratord will also get readiness probes so nothing tries to call this webho ### 5. CRD Registration The CRD is registered with: -- Both v1alpha1 and v1alpha2 versions -- v1alpha2 as the stored version +- Both v1alpha1 and v1 versions +- v1 as the stored version - Webhook conversion configuration pointing to the operator service ```rust @@ -241,7 +241,7 @@ mz_crd.spec.conversion = Some(CustomResourceConversion { ### 6. Replace all Materialize resources to update their stored versions -We have set v1alpha2 as the stored version, but that doesn't update existing resources. Those are only updated when they are reapplied. +We have set v1 as the stored version, but that doesn't update existing resources. Those are only updated when they are reapplied. 
During orchestratord startup, after waiting for the CRD to be established, we need to loop through all Materialize resources and `replace` them. @@ -250,22 +250,22 @@ If it is possible to determine the stored version of these resources, we should I think it is OK for this to be best-effort, and only warn in case of failure. For backward compatibility reasons, we're going to have to support the old version for some time. Orchestratord is likely to get restarted/upgraded multiple times in that period, so it can try again. -If the user ever writes an updated CR, it will also be stored in v1alpha2, so it isn't critical that this work immediately. +If the user ever writes an updated CR, it will also be stored in v1, so it isn't critical that this work immediately. ## Known testing required Our existing nightly orchestratord tests cover a lot, but we'll need to extend them to work with multiple CRD versions. -- Upgrades from existing v1alpha1 environments by applying v1alpha1 CR. (this is basically what we have now, but we need to not break it with the orchestratord changes to reconcile v1alpha2 after conversion) -- Upgrades from existing v1alpha1 environments by applying v1alpha2 CR. -- Upgrades from existing v1alpha2 environments by applying v1alpha1 CR. -- Upgrades from existing v1alpha2 environments by applying v1alpha2 CR. +- Upgrades from existing v1alpha1 environments by applying v1alpha1 CR. (this is basically what we have now, but we need to not break it with the orchestratord changes to reconcile v1 after conversion) +- Upgrades from existing v1alpha1 environments by applying v1 CR. +- Upgrades from existing v1 environments by applying v1alpha1 CR. +- Upgrades from existing v1 environments by applying v1 CR. - Upgrade from existing v1alpha1 environment that is mid-rollout not in "promoting" status. - Upgrade from existing v1alpha1 environment that is mid-rollout in "promoting" status. - Upgrades with a previous rollout already in progress. 
- Upgrades triggered by annotation. -- Deploy of latest Materialize image versions using v1alpha2 CR. -- Deploy of older Materialize image versions using v1alpha2 CR. +- Deploy of latest Materialize image versions using v1 CR. +- Deploy of older Materialize image versions using v1 CR. ## Minimal Viable Prototype diff --git a/doc/user/content/releases/_index.md b/doc/user/content/releases/_index.md index 17de75bfb4c26..30e14a4b64d22 100644 --- a/doc/user/content/releases/_index.md +++ b/doc/user/content/releases/_index.md @@ -1042,7 +1042,7 @@ v26.4.0 introduces several performance improvements and bugfixes. - **Up to 3x faster hydration times for large PostgreSQL tables**: We've reduced the overhead incurred by communication between multiple *workers* on a large cluster. We've observed up to 3x throughput improvement when ingesting 1 TB PostgreSQL tables on large clusters. - **More efficient source ingestion batching**: Sources now batch writes more effectively. This can result in improved freshness and lower resource utilization, especially when a source is doing a large number of writes. - **CloudSQL HA failover support** (*Materialize Self-Managed only*): Materialize Self-Managed now offers better support for handling failovers in CloudSQL HA sources, without downtime. [Contact our support team](/support/) to enable this in your environment. -- **Manual Promotion** (*Materialize Self-Managed only*): [Rollout strategies](/self-managed-deployments/upgrading/#rollout-strategies) allow you control how Materialize transitions from the current generation to a new generation during an upgrade. We've added a new rollout strategy called `ManuallyPromote` which allows you to choose when to promote the new generation. This means that you can minimize the impact of potential downtime. 
+- **Manual Promotion** (*Materialize Self-Managed only*): [Rollout strategies](/self-managed-deployments/upgrading/materialize-instances/v1/#rollout-strategies) allow you to control how Materialize transitions from the current generation to a new generation during an upgrade. We've added a new rollout strategy called `ManuallyPromote` which allows you to choose when to promote the new generation. This means that you can minimize the impact of potential downtime. ### Bug Fixes {#v26.4-bug-fixes} - Fixed timestamp determination logic to handle empty read holds correctly. @@ -1211,7 +1211,7 @@ use the new setting `rolloutStrategy` to specify either: - `WaitUntilReady` (*Default*) - `ImmediatelyPromoteCausingDowntime` -For more information, see [`rolloutStrategy`](/self-managed-deployments/upgrading/#rollout-strategies). +For more information, see [`rolloutStrategy`](/self-managed-deployments/upgrading/materialize-instances/v1/#rollout-strategies). ### Terraform helpers diff --git a/doc/user/content/security/self-managed/authentication.md b/doc/user/content/security/self-managed/authentication.md index 6e48e7af5c7b0..d47fe34fbbb47 100644 --- a/doc/user/content/security/self-managed/authentication.md +++ b/doc/user/content/security/self-managed/authentication.md @@ -50,6 +50,40 @@ following fields: The following example Kubernetes manifest includes configuration for SASL/SCRAM-SHA-256 authentication: +{{< tabs >}} +{{< tab "v1 (v26.29+)" >}} + +```hc {hl_lines="15 25"} +apiVersion: v1 +kind: Namespace +metadata: + name: materialize-environment +--- +apiVersion: v1 +kind: Secret +metadata: + name: materialize-backend + namespace: materialize-environment +stringData: + metadata_backend_url: "..." + persist_backend_url: "..." + license_key: "..." 
+ external_login_password_mz_system: "enter_mz_system_password" +--- +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:v26.12.1 + backendSecretName: materialize-backend + authenticatorKind: Sasl +``` + +{{< /tab >}} +{{< tab "v1alpha1 (before v26.29)" >}} + ```hc {hl_lines="15 25"} apiVersion: v1 kind: Namespace @@ -78,6 +112,9 @@ spec: authenticatorKind: Sasl ``` +{{< /tab >}} +{{< /tabs >}} + {{% include-headless "/headless/self-managed-deployments/enabled-auth-setting-warning" %}} @@ -97,6 +134,40 @@ To configure Self-Managed Materialize for password authentication, update the fo The following example Kubernetes manifest includes configuration for password authentication: +{{< tabs >}} +{{< tab "v1 (v26.29+)" >}} + +```hc {hl_lines="15 25"} +apiVersion: v1 +kind: Namespace +metadata: + name: materialize-environment +--- +apiVersion: v1 +kind: Secret +metadata: + name: materialize-backend + namespace: materialize-environment +stringData: + metadata_backend_url: "..." + persist_backend_url: "..." + license_key: "..." 
+ external_login_password_mz_system: "enter_mz_system_password" +--- +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:v26.12.1 + backendSecretName: materialize-backend + authenticatorKind: Password +``` + +{{< /tab >}} +{{< tab "v1alpha1 (before v26.29)" >}} + ```hc {hl_lines="15 25"} apiVersion: v1 kind: Namespace @@ -125,6 +196,9 @@ spec: authenticatorKind: Password ``` +{{< /tab >}} +{{< /tabs >}} + {{% include-headless "/headless/self-managed-deployments/enabled-auth-setting-warning" %}} diff --git a/doc/user/content/self-managed-deployments/_index.md b/doc/user/content/self-managed-deployments/_index.md index 8965aea956ab4..1c5a276a43fa0 100644 --- a/doc/user/content/self-managed-deployments/_index.md +++ b/doc/user/content/self-managed-deployments/_index.md @@ -157,6 +157,23 @@ custom resource definitions(CRDs). For a full list of fields available for the Materialize CR, see [Materialize CRD Field Descriptions](/self-managed-deployments/materialize-crd-field-descriptions/). +{{< tabs >}} +{{< tab "v1 (v26.29+)" >}} + +```yaml +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:{{< self-managed/versions/get-latest-version >}} +# ... additional fields omitted for brevity +``` + +{{< /tab >}} +{{< tab "v1alpha1 (before v26.29)" >}} + ```yaml apiVersion: materialize.cloud/v1alpha1 kind: Materialize @@ -168,11 +185,23 @@ spec: # ... additional fields omitted for brevity ``` +{{< /tab >}} +{{< /tabs >}} + When you first apply the Materialize custom resource, the operator automatically creates all required Kubernetes resources. #### Modifying the custom resource +{{< tabs >}} +{{< tab "v1 (v26.29+)" >}} + +To modify a custom resource, update the CRD with your changes. 
+When you apply the CRD, the operator will roll out the changes. + +{{< /tab >}} +{{< tab "v1alpha1 (before v26.29)" >}} + To modify a custom resource, update the CRD with your changes, including the `requestRollout` field with a new UUID value. When you apply the CRD, the operator will roll out the changes. @@ -182,6 +211,9 @@ If you do not specify a new `requestRollout` UUID, the operator watches for updates but does not roll out the changes. {{< /note >}} +{{< /tab >}} +{{< /tabs >}} + For a full list of fields available for the Materialize CR, see [Materialize CRD Field Descriptions](/self-managed-deployments/materialize-crd-field-descriptions/). diff --git a/doc/user/content/self-managed-deployments/configuration-system-parameters.md b/doc/user/content/self-managed-deployments/configuration-system-parameters.md index 5a1604f228f4d..994bd9e92257b 100644 --- a/doc/user/content/self-managed-deployments/configuration-system-parameters.md +++ b/doc/user/content/self-managed-deployments/configuration-system-parameters.md @@ -67,6 +67,24 @@ kubectl apply -f system-params-configmap.yaml Reference the ConfigMap in your Materialize custom resource by setting the `systemParameterConfigmapName` field to the name of your ConfigMap: +{{< tabs >}} +{{< tab "v1 (v26.29+)" >}} + +```yaml {hl_lines="9"} +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:v26.0.0 + backendSecretName: materialize-backend + systemParameterConfigmapName: mz-system-params +``` + +{{< /tab >}} +{{< tab "v1alpha1 (before v26.29)" >}} + ```yaml {hl_lines="9-10"} apiVersion: materialize.cloud/v1alpha1 kind: Materialize @@ -80,6 +98,9 @@ spec: requestRollout: 00000000-0000-0000-0000-000000000003 # Changing the CR requires a rollout ``` +{{< /tab >}} +{{< /tabs >}} + Apply the updated Materialize resource: ```shell @@ -129,6 +150,24 @@ Alternatively, you can 
add the `configmap-reload-trigger` annotation to your Materialize custom resource YAML and update it whenever you need to force a ConfigMap reload: +{{< tabs >}} +{{< tab "v1 (v26.29+)" >}} + +```yaml +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment + annotations: + configmap-reload-trigger: "1234567890" # Update this value to force reload +spec: + # ... rest of spec +``` + +{{< /tab >}} +{{< tab "v1alpha1 (before v26.29)" >}} + ```yaml apiVersion: materialize.cloud/v1alpha1 kind: Materialize @@ -141,6 +180,9 @@ spec: # ... rest of spec ``` +{{< /tab >}} +{{< /tabs >}} + {{< note >}} Even after the ConfigMap is synced, some system parameters may require a restart to take effect. diff --git a/doc/user/content/self-managed-deployments/installation/install-on-local-kind.md b/doc/user/content/self-managed-deployments/installation/install-on-local-kind.md index b0d9fee2a1c55..5c30e1cd52c8e 100644 --- a/doc/user/content/self-managed-deployments/installation/install-on-local-kind.md +++ b/doc/user/content/self-managed-deployments/installation/install-on-local-kind.md @@ -107,6 +107,22 @@ Starting in v26.0, Self-Managed Materialize requires a license key. kubectl get nodes --show-labels ``` +1. Recommended: Install cert-manager + + Cert-manager is used for generating TLS certificates needed by the Materialize Operator + for CRD conversion webhooks. It is currently only required if you enable the v1 + version of the Materialize CRD by setting `operator.args.installV1CRD=true` + when installing the operator, but certificates will become required in a + future version of Materialize. + + ```shell + helm install cert-manager oci://quay.io/jetstack/charts/cert-manager \ + --version v1.19.2 \ + --namespace cert-manager \ + --create-namespace \ + --set crds.enabled=true + ``` + 1. 
To help you get started for local evaluation/testing, Materialize provides some sample configuration files. Download the sample configuration files from the Materialize repo: diff --git a/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions.md b/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/_index.md similarity index 52% rename from doc/user/content/self-managed-deployments/materialize-crd-field-descriptions.md rename to doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/_index.md index bca28ab7a50d5..c5a0d1d4c80df 100644 --- a/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions.md +++ b/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/_index.md @@ -1,6 +1,7 @@ --- title: "Materialize CRD Field Descriptions" description: "Reference page on Materialize CRD Fields" +disable_list: true menu: main: parent: "sm-deployments" @@ -10,4 +11,7 @@ aliases: - /installation/appendix-materialize-crd-field-descriptions/ --- -{{% self-managed/materialize-crd-descriptions %}} +Select the CRD API version for your Materialize deployment: + +- [v1 (v26.29+)](/self-managed-deployments/materialize-crd-field-descriptions/v1/) +- [v1alpha1 (before v26.29)](/self-managed-deployments/materialize-crd-field-descriptions/v1alpha1/) diff --git a/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/v1.md b/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/v1.md new file mode 100644 index 0000000000000..c842075cff71f --- /dev/null +++ b/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/v1.md @@ -0,0 +1,10 @@ +--- +title: "v1" +description: "Reference page on Materialize CRD Fields for the v1 API (v26.29+)" +menu: + main: + parent: "materialize-crd-field-descriptions" + weight: 10 +--- + +{{% self-managed/materialize-crd-descriptions-v1 %}} diff --git 
a/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/v1alpha1.md b/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/v1alpha1.md new file mode 100644 index 0000000000000..070f43c2b83bc --- /dev/null +++ b/doc/user/content/self-managed-deployments/materialize-crd-field-descriptions/v1alpha1.md @@ -0,0 +1,10 @@ +--- +title: "v1alpha1" +description: "Reference page on Materialize CRD Fields for the v1alpha1 API (before v26.29)" +menu: + main: + parent: "materialize-crd-field-descriptions" + weight: 20 +--- + +{{% self-managed/materialize-crd-descriptions-v1alpha1 %}} diff --git a/doc/user/content/self-managed-deployments/upgrading/_index.md b/doc/user/content/self-managed-deployments/upgrading/_index.md index fe19470c8dc43..92c3a0c56deaa 100644 --- a/doc/user/content/self-managed-deployments/upgrading/_index.md +++ b/doc/user/content/self-managed-deployments/upgrading/_index.md @@ -93,211 +93,16 @@ helm upgrade -n materialize my-demo materialize/operator \ ## Upgrading Materialize Instances -**After** you have upgraded your Materialize Operator, upgrade your Materialize -instance(s) to the **APP Version** of the Operator. To find the version of your -currently deployed Materialize Operator: +Select the instructions for your CRD API version: -```shell -helm list -n materialize -``` - -You will use the returned **App Version** for the updated `environmentdImageRef` -value. Specifically, for your Materialize instance(s), set -`environmentdImageRef` value to use the new version: - -``` -spec: - environmentdImageRef: docker.io/materialize/environmentd: -``` - -To minimize unexpected downtime and avoid connection drops at critical -periods for your application, the upgrade process involves two steps: - -- First, stage the changes (update the `environmentdImageRef` with the new - version) to the Materialize custom resource. The Operator watches for changes - but does not automatically roll out the changes. 
- -- Second, roll out the changes by specifying a new UUID for `requestRollout`. - -### Stage the Materialize instance version change - -To stage the Materialize instances version upgrade, update the -`environmentdImageRef` field in the Materialize custom resource spec to the -compatible version of your currently deployed Materialize Operator. - -To stage, but **not** rollout, the Materialize instance version upgrade, you can -use the `kubectl patch` command; for example, if the **App Version** is {{< self-managed/versions/get-latest-version >}}: - -```shell -kubectl patch materialize \ - -n \ - --type='merge' \ - -p "{\"spec\": {\"environmentdImageRef\": \"docker.io/materialize/environmentd:{{< self-managed/versions/get-latest-version >}}\"}}" -``` - -{{< note >}} -Until you specify a new `requestRollout`, the Operator watches for updates but -does not roll out the changes. -{{< /note >}} - - -### Applying the changes via `requestRollout` - -To apply chang Materialize instance upgrade, you must update the `requestRollout` field in the Materialize custom resource spec to a new UUID. -Be sure to consult the [Rollout Configurations](#rollout-configuration) to ensure you've selected the correct rollout behavior. 
-```shell -# Then trigger the rollout with a new UUID -kubectl patch materialize \ - -n \ - --type='merge' \ - -p "{\"spec\": {\"requestRollout\": \"$(uuidgen)\"}}" -``` - -### Staging and applying in a single command - -Although separating the staging and rollout of the changes into two steps can -minimize unexpected downtime and avoid connection drops at critical periods, you -can, if preferred, combine both operations in a single command - -```shell -kubectl patch materialize \ - -n materialize-environment \ - --type='merge' \ - -p "{\"spec\": {\"environmentdImageRef\": \"docker.io/materialize/environmentd:{{< self-managed/versions/get-latest-version >}}\", \"requestRollout\": \"$(uuidgen)\"}}" -``` - -#### Using YAML Definition - -Alternatively, you can update your Materialize custom resource definition directly: - -```yaml -apiVersion: materialize.cloud/v1alpha1 -kind: Materialize -metadata: - name: 12345678-1234-1234-1234-123456789012 - namespace: materialize-environment -spec: - environmentdImageRef: materialize/environmentd:{{< self-managed/versions/get-latest-version >}} # Update version as needed - requestRollout: 22222222-2222-2222-2222-222222222222 # Use a new UUID - forceRollout: 33333333-3333-3333-3333-333333333333 # Optional: for forced rollouts - inPlaceRollout: false # In Place rollout is deprecated and ignored. Please use rolloutStrategy - rolloutStrategy: WaitUntilReady # The mechanism to use when rolling out the new version. Can be WaitUntilReady or ImmediatelyPromoteCausingDowntime - backendSecretName: materialize-backend -``` - -Apply the updated definition: - -```shell -kubectl apply -f materialize.yaml -``` - -## Rollout Configuration - -### `requestRollout` - -Specify a new `UUID` value for the `requestRollout` to roll out the changes to -the Materialize instance. - -{{< note >}} - -`requestRollout` without the `forcedRollout` field only rolls out if changes -exist to the Materialize instance. 
To roll out even if there are no changes to -the instance, use with `forcedRollouts`. +- [v1 (v26.29+)](/self-managed-deployments/upgrading/materialize-instances/v1/) +- [v1alpha1 (before v26.29)](/self-managed-deployments/upgrading/materialize-instances/v1alpha1/) -{{< /note >}} - -```shell -# Only rolls out if there are changes -kubectl patch materialize \ - -n \ - --type='merge' \ - -p "{\"spec\": {\"requestRollout\": \"$(uuidgen)\"}}" -``` -#### `requestRollout` with `forcedRollouts` - -Specify a new `UUID` value for `forcedRollout` to roll out even when there are -no changes to the instance. Use `forcedRollout` with `requestRollout`. - -```shell -kubectl patch materialize \ - -n materialize-environment \ - --type='merge' \ - -p "{\"spec\": {\"requestRollout\": \"$(uuidgen)\", \"forceRollout\": \"$(uuidgen)\"}}" -``` - -### Rollout strategies - -Rollout strategies control how Materialize transitions from the current generation to a new generation during an upgrade. - -The behavior of the new version rollout follows your `rolloutStrategy` setting. - -#### *WaitUntilReady* - ***Default*** - -`WaitUntilReady` creates a new generation of pods and automatically cuts over to them as soon as they catch up to the old generation and become `ReadyToPromote`. This strategy temporarily doubles the required resources to run Materialize. -{{< warning >}} `WaitUntilReady` waits up to 72 hours (configurable by the `with_0dt_deployment_max_wait` flag) for the new pods to become ready. If the promotion has not occurred by then, the new pods are automatically promoted. {{< /warning >}} - -#### *ImmediatelyPromoteCausingDowntime* -{{< warning >}} Using the `ImmediatelyPromoteCausingDowntime` rollout flag will cause downtime. {{< /warning >}} - -`ImmediatelyPromoteCausingDowntime` tears down the prior generation, and immediately promotes the new generation without waiting for it to hydrate. This causes downtime until the new generation has hydrated. 
However, it does not require additional resources. - -#### *ManuallyPromote* - -`ManuallyPromote` allows you to choose when to promote the new generation. This means you can time the promotion for periods when load is low, minimizing the impact of potential downtime for any clients connected to Materialize. This strategy temporarily doubles the required resources to run Materialize. - -To minimize downtime, wait until the new generation has fully hydrated and caught up to the prior generation before promoting. To check hydration status, inspect the `UpToDate` condition in the Materialize resource status. When hydration completes, the condition will be `ReadyToPromote`. - -To promote, update the `forcePromote` field to match the `requestRollout` field in the Materialize spec. If you need to promote before hydration completes, you can set `forcePromote` immediately, but clients may experience downtime. - -{{< warning >}} Leaving a new generation unpromoted for over 6 hours may cause downtime. {{< /warning >}} - -**Do not leave new generations unpromoted indefinitely**. They should either be promoted or canceled. New generations open a read hold on the metadata database that prevents compaction. This hold is only released when the generation is promoted or canceled. If left open too long, promoting or canceling can trigger a spike in deletion load on the metadata database, potentially causing downtime. It is not recommended to leave generations unpromoted for over 6 hours. - -#### *inPlaceRollout* - ***Deprecated*** - -The setting is ignored. - -## Verifying the Upgrade - -After initiating the rollout, you can monitor the status field of the Materialize custom resource to check on the upgrade. 
- -```shell -# Watch the status of your Materialize environment -kubectl get materialize -n materialize-environment -w - -# Check the logs of the operator -kubectl logs -l app.kubernetes.io/name=materialize-operator -n materialize -``` - -## Cancelling the Upgrade - -You may want to cancel an in-progress rollout if the upgrade has failed. This may be indicated by new pods not being healthy. Before cancelling, verify that the upgrade has not already completed by checking that the deploy generation (found via `status.activeGeneration`) is still the one from before the upgrade. Once an upgrade has already happened, you cannot revert using this method. - -To cancel an in-progress rollout and revert to the last completed rollout -state, revert both `requestRollout` and `environmentdImageRef` back to the -values from the last completed rollout. Reverting `environmentdImageRef` -alongside `requestRollout` keeps the spec aligned with what is actually -running, so a later rollout doesn't accidentally pick up the previously -attempted upgrade image. 
- -First, retrieve the last completed rollout request ID and the matching -environmentd image ref from your Materialize CR: - -```shell -kubectl get materialize -n materialize-environment \ - -o jsonpath='{.status.lastCompletedRolloutRequest} {.status.lastCompletedRolloutEnvironmentdImageRef}' -``` +## Version Specific Upgrade Notes -Then, set both fields back to these values in a single patch: +### Upgrading to `v26.29` and later versions -```shell -kubectl patch materialize \ - -n materialize-environment \ - --type='merge' \ - -p "{\"spec\": {\"requestRollout\": \"\", \"environmentdImageRef\": \"\"}}" -``` - -## Version Specific Upgrade Notes +{{< include-md file="shared-content/self-managed/upgrade-notes/v26.29.md" >}} ### Upgrading to `v26.1` and later versions diff --git a/doc/user/content/self-managed-deployments/upgrading/materialize-instances/_index.md b/doc/user/content/self-managed-deployments/upgrading/materialize-instances/_index.md new file mode 100644 index 0000000000000..8acf866cbcd7d --- /dev/null +++ b/doc/user/content/self-managed-deployments/upgrading/materialize-instances/_index.md @@ -0,0 +1,86 @@ +--- +title: "Upgrading Materialize Instances" +description: "Upgrading Materialize instances for Self-Managed deployments." +disable_list: true +menu: + main: + parent: "upgrading" + identifier: "upgrading-materialize-instances" + weight: 20 +--- + +{{< important >}} + +When upgrading Materialize, always upgrade the Helm Chart and Materialize +Operator first. See [Upgrading the Helm Chart and Materialize Operator](/self-managed-deployments/upgrading/#upgrading-the-helm-chart-and-materialize-operator). + +{{< /important >}} + +## CRD API Versions + +Starting in v26.29, the Materialize Operator supports two CRD API versions: + +- **v1** simplifies the upgrade process. Rollouts trigger automatically when spec fields change, removing the need to manually set a `requestRollout` UUID.
+- **v1alpha1** uses the original two-step upgrade process: first stage changes, then trigger a rollout with a new `requestRollout` UUID. + +Switching to v1 is **opt-in**. Upgrading the operator to v26.29+ does not change your existing v1alpha1 CRs or their behavior. You can continue using v1alpha1 indefinitely. When you are ready, you can switch individual instances to v1 at your own pace. + +{{< note >}} +We recommend opting in to v1 at your convenience, as v1 behavior will become the default in the next major release. +{{< /note >}} + +Select the instructions for your CRD API version: + +- [v1 (v26.29+)](/self-managed-deployments/upgrading/materialize-instances/v1/) +- [v1alpha1 (before v26.29)](/self-managed-deployments/upgrading/materialize-instances/v1alpha1/) + +## Switching from v1alpha1 to v1 + +Switching to v1 is opt-in and does not trigger a rollout on its own. Before switching, ensure you have completed the prerequisites in the [v26.29 upgrade notes](/self-managed-deployments/upgrading/#upgrading-to-v2629-and-later-versions) (cert-manager, network/firewall changes), and have enabled the v1 CRD by setting the Helm value `operator.args.installV1CRD=true` on the operator. Without this value, the operator only installs the v1alpha1 CRD version, and the Kubernetes API server rejects v1 CRs. + +### How it works + +The v1alpha1 CRD remains the storage version. When you submit a v1 CR, the operator's conversion webhook automatically converts it to v1alpha1 for storage. During conversion, the webhook computes a SHA256 hash of the spec and derives a deterministic `requestRollout` UUID from it. This means: + +- If the spec hasn't changed, the same UUID is generated, so **no unintended rollout is triggered** by switching API versions alone. +- If the spec has changed, a different UUID is produced, automatically triggering a rollout.
+ +### Using kubectl + +To switch an existing instance to v1, apply your CR with the updated `apiVersion` and remove the `requestRollout` field: + +```shell +kubectl apply -f - <<EOF +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: <instance-name> + namespace: <namespace> +spec: + environmentdImageRef: <environmentd-image-ref> + backendSecretName: <backend-secret-name> + # ... other spec fields (copy from your existing CR, removing requestRollout) +EOF +``` + +Or patch the API version on an existing CR: + +```shell +kubectl patch materialize <instance-name> \ + -n <namespace> \ + --type='merge' \ + -p '{"apiVersion":"materialize.cloud/v1"}' +``` + +### Using Terraform + +If you are managing your Materialize instance with the [Materialize Terraform modules](https://github.com/MaterializeInc/materialize-terraform-self-managed), set: + +```hcl +crd_version = "v1" +request_rollout = null +``` + +### Switching back to v1alpha1 + +You can switch back to v1alpha1 at any time by reapplying your CR with `apiVersion: materialize.cloud/v1alpha1` and an explicit `requestRollout` UUID. diff --git a/doc/user/content/self-managed-deployments/upgrading/materialize-instances/v1.md b/doc/user/content/self-managed-deployments/upgrading/materialize-instances/v1.md new file mode 100644 index 0000000000000..927fbe94f278e --- /dev/null +++ b/doc/user/content/self-managed-deployments/upgrading/materialize-instances/v1.md @@ -0,0 +1,156 @@ +--- +title: "v1" +description: "Upgrading Self-Managed Materialize instances using the v1 CRD API (v26.29+)." +menu: + main: + parent: "upgrading-materialize-instances" + identifier: "upgrading-v1" + weight: 10 +--- + +{{< note >}} +If you are currently using v1alpha1 and want to switch, see [Switching from v1alpha1 to v1](/self-managed-deployments/upgrading/materialize-instances/#switching-from-v1alpha1-to-v1). +{{< /note >}} + +## Upgrading Materialize Instances + +With the `v1` CRD, rollouts trigger automatically when spec fields change. +Unlike `v1alpha1`, there is no `requestRollout` field. You only need to update the spec and the operator handles the rest.
+ +**After** you have upgraded your Materialize Operator, upgrade your Materialize +instance(s) to the **APP Version** of the Operator. To find the version of your +currently deployed Materialize Operator: + +```shell +helm list -n materialize +``` + +You will use the returned **App Version** for the updated `environmentdImageRef` +value. Specifically, for your Materialize instance(s), set +`environmentdImageRef` value to use the new version: + +``` +spec: + environmentdImageRef: docker.io/materialize/environmentd: +``` + +#### Using `kubectl patch` + +To initiate the Materialize instance version upgrade, you can +use the `kubectl patch` command; for example, if the **App Version** is {{< self-managed/versions/get-latest-version >}}: + +```shell +kubectl patch materialize \ + -n \ + --type='merge' \ + -p "{\"apiVersion\": \"materialize.cloud/v1\", \"spec\": {\"environmentdImageRef\": \"docker.io/materialize/environmentd:{{< self-managed/versions/get-latest-version >}}\"}}" +``` + + +#### Using YAML Definition + +Alternatively, you can update your Materialize custom resource definition directly: + +```yaml +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:{{< self-managed/versions/get-latest-version >}} # Update version as needed + forceRollout: 33333333-3333-3333-3333-333333333333 # Optional: for forced rollouts + rolloutStrategy: WaitUntilReady # The mechanism to use when rolling out the new version. 
Can be WaitUntilReady, ImmediatelyPromoteCausingDowntime, or ManuallyPromote + backendSecretName: materialize-backend +``` + +Apply the updated definition: + +```shell +kubectl apply -f materialize.yaml +``` + +#### Using Terraform + +If you are managing your Materialize instance with the +[Materialize Terraform modules](https://github.com/MaterializeInc/materialize-terraform-self-managed), +update the `environmentd_version` variable: + +```hcl +module "materialize_instance" { + source = "../kubernetes/modules/materialize-instance" + + crd_version = "v1" + request_rollout = null + environmentd_version = "v26.29.0" # Update to the new version + # ... other configuration +} +``` + +## Rollout Configuration + +With `v1`, the operator computes a hash of the spec fields and automatically triggers a rollout when the hash changes. The `requestRollout` field from `v1alpha1` is not used. + +### `forceRollout` + +Specify a new `UUID` value for `forceRollout` to trigger a rollout even when there are +no other changes to the instance. The `forceRollout` value is included in the +hash calculation, so changing it produces a new hash and triggers a rollout. + +```shell +kubectl patch materialize <instance-name> \ + -n materialize-environment \ + --type='merge' \ + -p "{\"apiVersion\": \"materialize.cloud/v1\", \"spec\": {\"forceRollout\": \"$(uuidgen)\"}}" +``` + +### Rollout strategies + +Rollout strategies control how Materialize transitions from the current generation to a new generation during an upgrade. + +The behavior of the new version rollout follows your `rolloutStrategy` setting. + +#### *WaitUntilReady* - ***Default*** + +`WaitUntilReady` creates a new generation of pods and automatically cuts over to them as soon as they catch up to the old generation and become `ReadyToPromote`. This strategy temporarily doubles the required resources to run Materialize. +{{< warning >}} `WaitUntilReady` waits up to 72 hours (configurable by the `with_0dt_deployment_max_wait` flag) for the new pods to become ready.
If the promotion has not occurred by then, the new pods are automatically promoted. {{< /warning >}} + +#### *ImmediatelyPromoteCausingDowntime* +{{< warning >}} Using the `ImmediatelyPromoteCausingDowntime` rollout flag will cause downtime. {{< /warning >}} + +`ImmediatelyPromoteCausingDowntime` tears down the prior generation, and immediately promotes the new generation without waiting for it to hydrate. This causes downtime until the new generation has hydrated. However, it does not require additional resources. + +#### *ManuallyPromote* + +`ManuallyPromote` allows you to choose when to promote the new generation. This means you can time the promotion for periods when load is low, minimizing the impact of potential downtime for any clients connected to Materialize. This strategy temporarily doubles the required resources to run Materialize. + +To minimize downtime, wait until the new generation has fully hydrated and caught up to the prior generation before promoting. To check hydration status, inspect the `UpToDate` condition in the Materialize resource status. When hydration completes, the condition will be `ReadyToPromote`. + +To promote, update the `forcePromote` field to match the `requestedRolloutHash` field in the Materialize status. If you need to promote before hydration completes, you can set `forcePromote` immediately, but clients may experience downtime. + +{{< warning >}} Leaving a new generation unpromoted for over 6 hours may cause downtime. {{< /warning >}} + +**Do not leave new generations unpromoted indefinitely**. They should either be promoted or canceled. New generations open a read hold on the metadata database that prevents compaction. This hold is only released when the generation is promoted or canceled. If left open too long, promoting or canceling can trigger a spike in deletion load on the metadata database, potentially causing downtime. It is not recommended to leave generations unpromoted for over 6 hours. 
+ + +## Verifying the Upgrade + +After initiating the rollout, you can monitor the status field of the Materialize custom resource to check on the upgrade. + +```shell +# Watch the status of your Materialize environment +kubectl get materialize -n materialize-environment -w + +# Check the logs of the operator +kubectl logs -l app.kubernetes.io/name=materialize-operator -n materialize +``` + +## Cancelling the Upgrade + +You may want to cancel an in-progress rollout if the upgrade has failed. This may be indicated by new pods not being healthy. Before cancelling, verify that the upgrade has not already completed by checking that the deploy generation (found via `status.activeGeneration`) is still the one from before the upgrade. Once an upgrade has already happened, you cannot revert using this method. + +To cancel an in-progress rollout and revert to the last completed rollout state, you must revert the Materialize resource to the contents it had before triggering the rollout: + +```shell +kubectl apply -f previous_materialize_configuration.yaml +``` diff --git a/doc/user/content/self-managed-deployments/upgrading/materialize-instances/v1alpha1.md b/doc/user/content/self-managed-deployments/upgrading/materialize-instances/v1alpha1.md new file mode 100644 index 0000000000000..dc956847ece31 --- /dev/null +++ b/doc/user/content/self-managed-deployments/upgrading/materialize-instances/v1alpha1.md @@ -0,0 +1,217 @@ +--- +title: "v1alpha1" +description: "Upgrading Self-Managed Materialize instances using the v1alpha1 CRD API (before v26.29)." +menu: + main: + parent: "upgrading-materialize-instances" + identifier: "upgrading-v1alpha1" + weight: 20 +aliases: + - /self-managed-deployments/upgrading/v1alpha1-upgrade-instructions/ +--- + +## Upgrading Materialize Instances + +**After** you have upgraded your Materialize Operator, upgrade your Materialize +instance(s) to the **APP Version** of the Operator. 
To find the version of your +currently deployed Materialize Operator: + +```shell +helm list -n materialize +``` + +You will use the returned **App Version** for the updated `environmentdImageRef` +value. Specifically, for your Materialize instance(s), set +`environmentdImageRef` value to use the new version: + +``` +spec: + environmentdImageRef: docker.io/materialize/environmentd: +``` + +To minimize unexpected downtime and avoid connection drops at critical +periods for your application, the upgrade process involves two steps: + +- First, stage the changes (update the `environmentdImageRef` with the new + version) to the Materialize custom resource. The Operator watches for changes + but does not automatically roll out the changes. + +- Second, roll out the changes by specifying a new UUID for `requestRollout`. + +### Stage the Materialize instance version change + +To stage the Materialize instances version upgrade, update the +`environmentdImageRef` field in the Materialize custom resource spec to the +compatible version of your currently deployed Materialize Operator. + +To stage, but **not** rollout, the Materialize instance version upgrade, you can +use the `kubectl patch` command; for example, if the **App Version** is {{< self-managed/versions/get-latest-version >}}: + +```shell +kubectl patch materialize \ + -n \ + --type='merge' \ + -p "{\"spec\": {\"environmentdImageRef\": \"docker.io/materialize/environmentd:{{< self-managed/versions/get-latest-version >}}\"}}" +``` + +{{< note >}} +Until you specify a new `requestRollout`, the Operator watches for updates but +does not roll out the changes. +{{< /note >}} + + +### Applying the changes via `requestRollout` + +To apply the Materialize instance upgrade, you must update the `requestRollout` field in the Materialize custom resource spec to a new UUID. +Be sure to consult the [Rollout Configurations](#rollout-configuration) to ensure you've selected the correct rollout behavior. 
+```shell +# Then trigger the rollout with a new UUID +kubectl patch materialize <instance-name> \ + -n <namespace> \ + --type='merge' \ + -p "{\"spec\": {\"requestRollout\": \"$(uuidgen)\"}}" +``` + +### Staging and applying in a single command + +Although separating the staging and rollout of the changes into two steps can +minimize unexpected downtime and avoid connection drops at critical periods, you +can, if preferred, combine both operations in a single command: + +```shell +kubectl patch materialize <instance-name> \ + -n materialize-environment \ + --type='merge' \ + -p "{\"spec\": {\"environmentdImageRef\": \"docker.io/materialize/environmentd:{{< self-managed/versions/get-latest-version >}}\", \"requestRollout\": \"$(uuidgen)\"}}" +``` + +#### Using YAML Definition + +Alternatively, you can update your Materialize custom resource definition directly: + +```yaml +apiVersion: materialize.cloud/v1alpha1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:{{< self-managed/versions/get-latest-version >}} # Update version as needed + requestRollout: 22222222-2222-2222-2222-222222222222 # Use a new UUID + forceRollout: 33333333-3333-3333-3333-333333333333 # Optional: for forced rollouts + inPlaceRollout: false # In Place rollout is deprecated and ignored. Please use rolloutStrategy + rolloutStrategy: WaitUntilReady # The mechanism to use when rolling out the new version. Can be WaitUntilReady, ImmediatelyPromoteCausingDowntime, or ManuallyPromote + backendSecretName: materialize-backend +``` + +Apply the updated definition: + +```shell +kubectl apply -f materialize.yaml +``` + +## Rollout Configuration + +### `requestRollout` + +Specify a new `UUID` value for the `requestRollout` to roll out the changes to +the Materialize instance. + +{{< note >}} + +`requestRollout` without the `forceRollout` field only rolls out if changes +exist to the Materialize instance.
To roll out even if there are no changes to +the instance, use it together with `forceRollout`. + +{{< /note >}} + +```shell +# Only rolls out if there are changes +kubectl patch materialize <instance-name> \ + -n <namespace> \ + --type='merge' \ + -p "{\"spec\": {\"requestRollout\": \"$(uuidgen)\"}}" +``` +#### `requestRollout` with `forceRollout` + +Specify a new `UUID` value for `forceRollout` to roll out even when there are +no changes to the instance. Use `forceRollout` with `requestRollout`. + +```shell +kubectl patch materialize <instance-name> \ + -n materialize-environment \ + --type='merge' \ + -p "{\"spec\": {\"requestRollout\": \"$(uuidgen)\", \"forceRollout\": \"$(uuidgen)\"}}" +``` + +### Rollout strategies + +Rollout strategies control how Materialize transitions from the current generation to a new generation during an upgrade. + +The behavior of the new version rollout follows your `rolloutStrategy` setting. + +#### *WaitUntilReady* - ***Default*** + +`WaitUntilReady` creates a new generation of pods and automatically cuts over to them as soon as they catch up to the old generation and become `ReadyToPromote`. This strategy temporarily doubles the required resources to run Materialize. +{{< warning >}} `WaitUntilReady` waits up to 72 hours (configurable by the `with_0dt_deployment_max_wait` flag) for the new pods to become ready. If the promotion has not occurred by then, the new pods are automatically promoted. {{< /warning >}} + +#### *ImmediatelyPromoteCausingDowntime* +{{< warning >}} Using the `ImmediatelyPromoteCausingDowntime` rollout flag will cause downtime. {{< /warning >}} + +`ImmediatelyPromoteCausingDowntime` tears down the prior generation, and immediately promotes the new generation without waiting for it to hydrate. This causes downtime until the new generation has hydrated. However, it does not require additional resources. + +#### *ManuallyPromote* + +`ManuallyPromote` allows you to choose when to promote the new generation.
This means you can time the promotion for periods when load is low, minimizing the impact of potential downtime for any clients connected to Materialize. This strategy temporarily doubles the required resources to run Materialize. + +To minimize downtime, wait until the new generation has fully hydrated and caught up to the prior generation before promoting. To check hydration status, inspect the `UpToDate` condition in the Materialize resource status. When hydration completes, the condition will be `ReadyToPromote`. + +To promote, update the `forcePromote` field to match the `requestRollout` field in the Materialize spec. If you need to promote before hydration completes, you can set `forcePromote` immediately, but clients may experience downtime. + +{{< warning >}} Leaving a new generation unpromoted for over 6 hours may cause downtime. {{< /warning >}} + +**Do not leave new generations unpromoted indefinitely**. They should either be promoted or canceled. New generations open a read hold on the metadata database that prevents compaction. This hold is only released when the generation is promoted or canceled. If left open too long, promoting or canceling can trigger a spike in deletion load on the metadata database, potentially causing downtime. It is not recommended to leave generations unpromoted for over 6 hours. + +#### *inPlaceRollout* - ***Deprecated*** + +The setting is ignored. + +## Verifying the Upgrade + +After initiating the rollout, you can monitor the status field of the Materialize custom resource to check on the upgrade. + +```shell +# Watch the status of your Materialize environment +kubectl get materialize -n materialize-environment -w + +# Check the logs of the operator +kubectl logs -l app.kubernetes.io/name=materialize-operator -n materialize +``` + +## Cancelling the Upgrade + +You may want to cancel an in-progress rollout if the upgrade has failed. This may be indicated by new pods not being healthy. 
Before cancelling, verify that the upgrade has not already completed by checking that the deploy generation (found via `status.activeGeneration`) is still the one from before the upgrade. Once an upgrade has already happened, you cannot revert using this method. + +To cancel an in-progress rollout and revert to the last completed rollout +state, revert both `requestRollout` and `environmentdImageRef` back to the +values from the last completed rollout. Reverting `environmentdImageRef` +alongside `requestRollout` keeps the spec aligned with what is actually +running, so a later rollout doesn't accidentally pick up the previously +attempted upgrade image. + +First, retrieve the last completed rollout request ID and the matching +environmentd image ref from your Materialize CR: + +```shell +kubectl get materialize -n materialize-environment \ + -o jsonpath='{.status.lastCompletedRolloutRequest} {.status.lastCompletedRolloutEnvironmentdImageRef}' +``` + +Then, set both fields back to these values in a single patch: + +```shell +kubectl patch materialize \ + -n materialize-environment \ + --type='merge' \ + -p "{\"spec\": {\"requestRollout\": \"\", \"environmentdImageRef\": \"\"}}" +``` diff --git a/doc/user/data/self_managed/materialize_crd_descriptions_v1.json b/doc/user/data/self_managed/materialize_crd_descriptions_v1.json new file mode 100644 index 0000000000000..13769128d68be --- /dev/null +++ b/doc/user/data/self_managed/materialize_crd_descriptions_v1.json @@ -0,0 +1,558 @@ +[ + [ + "MaterializeSpec", + [ + { + "name": "backendSecretName", + "type": "String", + "description": "The name of a secret containing `metadata_backend_url` and `persist_backend_url`.\nIt may also contain `external_login_password_mz_system`, which will be used as\nthe password for the `mz_system` user if `authenticatorKind` is `Password`.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "environmentdImageRef", + "type": "String", + "description": "The 
environmentd image to run.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "authenticatorKind", + "type": "Enum", + "description": "How to authenticate with Materialize.\n\nValid values:\n- `Frontegg`:
Authenticate users using Frontegg.\n- `Password`:
Authenticate users using internally stored password hashes.\n The backend secret must contain external_login_password_mz_system.\n- `Sasl`:
Authenticate users using SASL.\n- `Oidc`:
Authenticate users using OIDC (JWT tokens).\n- `None` (default):
Do not authenticate users. Trust they are who they say they are without verification.", + "default": "None", + "required": false, + "deprecated": false + }, + { + "name": "balancerdExternalCertificateSpec", + "type": "MaterializeCertSpec", + "description": "The configuration for generating an x509 certificate using cert-manager for balancerd\nto present to incoming connections.\nThe `dnsNames` and `issuerRef` fields are required.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "balancerdReplicas", + "type": "Integer", + "description": "Number of balancerd pods to create.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "balancerdResourceRequirements", + "type": "io.k8s.api.core.v1.ResourceRequirements", + "description": "Resource requirements for the balancerd pod.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "consoleExternalCertificateSpec", + "type": "MaterializeCertSpec", + "description": "The configuration for generating an x509 certificate using cert-manager for the console\nto present to incoming connections.\nThe `dnsNames` and `issuerRef` fields are required.\nNot yet implemented.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "consoleReplicas", + "type": "Integer", + "description": "Number of console pods to create.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "consoleResourceRequirements", + "type": "io.k8s.api.core.v1.ResourceRequirements", + 
"description": "Resource requirements for the console pod.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "enableRbac", + "type": "Bool", + "description": "Whether to enable role based access control. Defaults to false.", + "default": false, + "required": false, + "deprecated": false + }, + { + "name": "environmentId", + "type": "Uuid", + "description": "The value used by environmentd (via the --environment-id flag) to\nuniquely identify this instance. Must be globally unique, and\nis required if a license key is not provided.\nNOTE: This value MUST NOT be changed in an existing instance,\nsince it affects things like the way data is stored in the persist\nbackend.", + "default": "00000000-0000-0000-0000-000000000000", + "required": false, + "deprecated": false + }, + { + "name": "environmentdConnectionRoleArn", + "type": "String", + "description": "If running in AWS, override the IAM role to use to support\nthe CREATE CONNECTION feature.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "environmentdExtraArgs", + "type": "Array", + "description": "Extra args to pass to the environmentd binary.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "environmentdExtraEnv", + "type": "Array", + "description": "Extra environment variables to pass to the environmentd binary.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "environmentdResourceRequirements", + "type": "io.k8s.api.core.v1.ResourceRequirements", + "description": "Resource requirements for the environmentd pod.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "environmentdScratchVolumeStorageRequirement", + "type": "io.k8s.apimachinery.pkg.api.resource.Quantity", + "description": "Amount of disk to allocate, if a storage class is provided.", + "default": 
null, + "required": false, + "deprecated": false + }, + { + "name": "forcePromote", + "type": "String", + "description": "If `forcePromote` is set to the same value as `status.requestedRolloutHash`, the\ncurrent rollout will skip waiting for clusters in the new\ngeneration to rehydrate before promoting the new environmentd to\nleader.\n\nThis field is excluded from the rollout hash and changes will not trigger a rollout.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "forceRollout", + "type": "Uuid", + "description": "This value will force the controller to detect the spec as changed\neven if no other changes happened. This can be used to force a rollout\nto a new generation even without making any meaningful changes.", + "default": "00000000-0000-0000-0000-000000000000", + "required": false, + "deprecated": false + }, + { + "name": "internalCertificateSpec", + "type": "MaterializeCertSpec", + "description": "The cert-manager Issuer or ClusterIssuer to use for database internal communication.\nThe `issuerRef` field is required.\nThis currently is only used for environmentd, but will eventually support clusterd.\nNot yet implemented.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "podAnnotations", + "type": "Map", + "description": "Annotations to apply to the pods.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "podLabels", + "type": "Map", + "description": "Labels to apply to the pods.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "rolloutRequestTimeout", + "type": "RolloutRequestTimeout", + "description": "The maximum amount of time a rollout may remain in progress before\nit is automatically cancelled.\n\nWhile a rollout is in progress, the new generation of `environmentd`\nruns in a read-only, un-promoted state and holds back compaction via\nread holds.
Leaving it in this state for too long can cause\nincident-inducing load when it is eventually promoted, so the\noperator cancels the rollout once this timeout is exceeded: the new\ngeneration is torn down and the previously-active generation\ncontinues serving. A new rollout can then be triggered by setting\n`forceRollout` to a new value.\n\nThis does not apply to the `ImmediatelyPromoteCausingDowntime`\nrollout strategy or to force-promoted rollouts, since by the time\nthose are in progress the old generation may already be gone.\n\nThe value is parsed as a human-readable duration, e.g. `24h`,\n`90m`, or `1h 30m`. Defaults to `24h`\nwhen omitted (the API server fills it in); an unparseable value also\nfalls back to that default.", + "default": "24h", + "required": false, + "deprecated": false + }, + { + "name": "rolloutStrategy", + "type": "Enum", + "description": "Rollout strategy to use when upgrading this Materialize instance.\n\nValid values:\n- `WaitUntilReady` (default):
Create a new generation of pods, leaving the old generation around until the\n new ones are ready to take over.\n This minimizes downtime, and is what almost everyone should use.\n- `ManuallyPromote`:
Create a new generation of pods, leaving the old generation as the serving generation\n until the user manually promotes the new generation.\n \n When using `ManuallyPromote`, the new generation can be promoted at any\n time, even if it has dataflows that are not fully caught up, by setting\n `forcePromote` in the Materialize spec to the same value as `status.requestedRolloutHash`.\n \n To minimize downtime, promotion should occur when the new generation\n has caught up to the prior generation. To determine if the new\n generation has caught up, consult the `UpToDate` condition in the\n status of the Materialize Resource. If the condition's reason is\n `ReadyToPromote`, the new generation is ready to promote.\n \n {{}}\n Do not leave new generations unpromoted indefinitely.\n \n The new generation keeps open read holds which prevent compaction. Once promoted or\n cancelled, those read holds are released. If left unpromoted for an extended time, this\n data can build up, and can cause extreme deletion load on the metadata backend database\n when finally promoted or cancelled.\n \n To guard against this, a rollout that remains in progress longer\n than `rolloutRequestTimeout` (default 24h) is automatically\n cancelled.\n {{}}\n- `ImmediatelyPromoteCausingDowntime`:
{{}}\n THIS WILL CAUSE YOUR MATERIALIZE INSTANCE TO BE UNAVAILABLE FOR SOME TIME!!!\n \n This strategy should ONLY be used by customers with physical hardware who do not have\n enough hardware for the `WaitUntilReady` strategy. If you think you want this, please\n consult with Materialize engineering to discuss your situation.\n {{}}\n \n Tear down the old generation of pods and promote the new generation of pods immediately,\n without waiting for the new generation of pods to be ready.", + "default": "WaitUntilReady", + "required": false, + "deprecated": false + }, + { + "name": "serviceAccountAnnotations", + "type": "Map", + "description": "Annotations to apply to the service account.\n\nAnnotations on service accounts are commonly used by cloud providers for IAM.\nAWS uses \"eks.amazonaws.com/role-arn\".\nAzure uses \"azure.workload.identity/client-id\", but\nadditionally requires \"azure.workload.identity/use\": \"true\" on the pods.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "serviceAccountLabels", + "type": "Map", + "description": "Labels to apply to the service account.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "serviceAccountName", + "type": "String", + "description": "Name of the kubernetes service account to use.\nIf not set, we will create one with the same name as this Materialize object.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "systemParameterConfigmapName", + "type": "String", + "description": "The name of a ConfigMap containing system parameters in JSON format.\nThe ConfigMap must contain a `system-params.json` key whose value\nis a valid JSON object containing valid system parameters.\n\nRun `SHOW ALL` in SQL to see a subset of configurable system parameters.\n\nExample ConfigMap:\n```yaml\ndata:\n system-params.json: |\n {\n \"max_connections\": 1000\n }\n```", + "default": null, + "required": false, + "deprecated": false + } 
+ ] + ], + [ + "MaterializeCertSpec", + [ + { + "name": "dnsNames", + "type": "Array", + "description": "Additional DNS names the certificate will be valid for.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "duration", + "type": "String", + "description": "Duration the certificate will be requested for.\nValue must be in units accepted by Go\n[`time.ParseDuration`](https://golang.org/pkg/time/#ParseDuration).", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "issuerRef", + "type": "CertificateIssuerRef", + "description": "Reference to an `Issuer` or `ClusterIssuer` that will generate the certificate.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "privateKeyAlgorithm", + "type": "CertificatePrivateKeyAlgorithm", + "description": "Optional algorithm to use for the private key. If not specified, a recommended default will be chosen.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "privateKeySize", + "type": "Integer", + "description": "Optional size for the private key.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "renewBefore", + "type": "String", + "description": "Duration before expiration the certificate will be renewed.\nValue must be in units accepted by Go\n[`time.ParseDuration`](https://golang.org/pkg/time/#ParseDuration).", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "secretTemplate", + "type": "CertificateSecretTemplate", + "description": "Additional annotations and labels to include in the Certificate object.", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "CertificateSecretTemplate", + [ + { + "name": "annotations", + "type": "Map", + "description": "Annotations is a key value map to be copied to the target Kubernetes Secret.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": 
"labels", + "type": "Map", + "description": "Labels is a key value map to be copied to the target Kubernetes Secret.", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "CertificateIssuerRef", + [ + { + "name": "name", + "type": "String", + "description": "Name of the resource being referred to.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "group", + "type": "String", + "description": "Group of the resource being referred to.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "kind", + "type": "String", + "description": "Kind of the resource being referred to.", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.ResourceRequirements", + [ + { + "name": "claims", + "type": "Array", + "description": "Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container.\n\nThis is an alpha field and requires enabling the DynamicResourceAllocation feature gate.\n\nThis field is immutable. It can only be set for containers.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "limits", + "type": "Map", + "description": "Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "requests", + "type": "Map", + "description": "Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.ResourceClaim", + [ + { + "name": "name", + "type": "String", + "description": "Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "request", + "type": "String", + "description": "Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request.", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.EnvVar", + [ + { + "name": "name", + "type": "String", + "description": "Name of the environment variable. Must be a C_IDENTIFIER.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "value", + "type": "String", + "description": "Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. \"$$(VAR_NAME)\" will produce the string literal \"$(VAR_NAME)\". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to \"\".", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "valueFrom", + "type": "io.k8s.api.core.v1.EnvVarSource", + "description": "Source for the environment variable's value. 
Cannot be used if value is not empty.", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.EnvVarSource", + [ + { + "name": "configMapKeyRef", + "type": "io.k8s.api.core.v1.ConfigMapKeySelector", + "description": "Selects a key of a ConfigMap.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "fieldRef", + "type": "io.k8s.api.core.v1.ObjectFieldSelector", + "description": "Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "resourceFieldRef", + "type": "io.k8s.api.core.v1.ResourceFieldSelector", + "description": "Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "secretKeyRef", + "type": "io.k8s.api.core.v1.SecretKeySelector", + "description": "Selects a key of a secret in the pod's namespace", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.SecretKeySelector", + [ + { + "name": "key", + "type": "String", + "description": "The key of the secret to select from. Must be a valid secret key.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "name", + "type": "String", + "description": "Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "optional", + "type": "Bool", + "description": "Specify whether the Secret or its key must be defined", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.ResourceFieldSelector", + [ + { + "name": "resource", + "type": "String", + "description": "Required: resource to select", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "containerName", + "type": "String", + "description": "Container name: required for volumes, optional for env vars", + "default": null, + "required": false, + "deprecated": false + }, + { + "name": "divisor", + "type": "io.k8s.apimachinery.pkg.api.resource.Quantity", + "description": "Specifies the output format of the exposed resources, defaults to \"1\"", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.ObjectFieldSelector", + [ + { + "name": "fieldPath", + "type": "String", + "description": "Path of the field to select in the specified API version.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "apiVersion", + "type": "String", + "description": "Version of the schema the FieldPath is written in terms of, defaults to \"v1\".", + "default": null, + "required": false, + "deprecated": false + } + ] + ], + [ + "io.k8s.api.core.v1.ConfigMapKeySelector", + [ + { + "name": "key", + "type": "String", + "description": "The key to select.", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "name", + "type": "String", + "description": "Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names", + "default": null, + "required": true, + "deprecated": false + }, + { + "name": "optional", + "type": "Bool", + "description": "Specify whether the ConfigMap or its key must be defined", + "default": null, + "required": false, + "deprecated": false + } + ] + ] +] diff --git a/doc/user/data/self_managed/materialize_crd_descriptions.json b/doc/user/data/self_managed/materialize_crd_descriptions_v1alpha1.json similarity index 99% rename from doc/user/data/self_managed/materialize_crd_descriptions.json rename to doc/user/data/self_managed/materialize_crd_descriptions_v1alpha1.json index 98fa14c717950..a87d9961edd19 100644 --- a/doc/user/data/self_managed/materialize_crd_descriptions.json +++ b/doc/user/data/self_managed/materialize_crd_descriptions_v1alpha1.json @@ -132,9 +132,9 @@ }, { "name": "forcePromote", - "type": "Uuid", + "type": "String", "description": "If `forcePromote` is set to the same value as `requestRollout`, the\ncurrent rollout will skip waiting for clusters in the new\ngeneration to rehydrate before promoting the new environmentd to\nleader.", - "default": "00000000-0000-0000-0000-000000000000", + "default": "", "required": false, "deprecated": false }, diff --git a/doc/user/data/self_managed/upgrades.yml b/doc/user/data/self_managed/upgrades.yml index d53015f6ee8d2..693f9c72459b1 100644 --- a/doc/user/data/self_managed/upgrades.yml +++ b/doc/user/data/self_managed/upgrades.yml @@ -152,7 +152,7 @@ The **APP VERSION** will be the value that you will use for upgrading Materialize instances. -- name: upgrade-materialize-instance +- name: upgrade-materialize-instance-v1alpha1 content: | **After** you have upgraded your Materialize Operator, upgrade your @@ -166,7 +166,7 @@ - Second, rolling out the changes via a `requestRollout` flag. 1. Find the name of the Materialize instance to upgrade. 
The sample example - deployment using the unified Terraform module deploys the Materialie + deployment using the unified Terraform module deploys the Materialize instance in the`materialize-environment` namespace. ```shell @@ -203,3 +203,39 @@ ```bash kubectl -n materialize-environment describe pod -l app=environmentd ``` + +- name: upgrade-materialize-instance + content: | + + **After** you have upgraded your Materialize Operator, upgrade your + Materialize instance(s) to the **APP Version** of the Operator. + + 1. Find the name of the Materialize instance to upgrade. The sample example + deployment using the unified Terraform module deploys the Materialize + instance in the `materialize-environment` namespace. + + ```shell + kubectl get materialize -n materialize-environment + ``` + + In the example deployment, the name of the instance is `main`. + + ```none + NAME + main + ``` + + 1. Apply the Materialize instance version upgrade. + + ```shell + kubectl patch materialize main\ + -n materialize-environment \ + --type='merge' \ + -p "{\"spec\": {\"environmentdImageRef\": \"docker.io/materialize/environmentd:{{< self-managed/versions/get-latest-version >}}\"}}" + ``` + + 1.
Verify the upgrade by checking the `environmentd` events: + + ```bash + kubectl -n materialize-environment describe pod -l app=environmentd + ``` diff --git a/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions.html b/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions-v1.html similarity index 89% rename from doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions.html rename to doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions-v1.html index 5c09e0cb47f88..66b194694e322 100644 --- a/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions.html +++ b/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions-v1.html @@ -1,10 +1,10 @@ {{ $types := dict }} -{{ range $.Site.Data.self_managed.materialize_crd_descriptions }} +{{ range $.Site.Data.self_managed.materialize_crd_descriptions_v1 }} {{ $types = merge $types (dict (index . 0) true) }} {{ end }} -{{ range $.Site.Data.self_managed.materialize_crd_descriptions }} +{{ range $.Site.Data.self_managed.materialize_crd_descriptions_v1 }} #### {{ index . 0 }} diff --git a/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions-v1alpha1.html b/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions-v1alpha1.html new file mode 100644 index 0000000000000..de77f8a4e1a95 --- /dev/null +++ b/doc/user/layouts/shortcodes/self-managed/materialize-crd-descriptions-v1alpha1.html @@ -0,0 +1,57 @@ + +{{ $types := dict }} +{{ range $.Site.Data.self_managed.materialize_crd_descriptions_v1alpha1 }} +{{ $types = merge $types (dict (index . 0) true) }} +{{ end }} + +{{ range $.Site.Data.self_managed.materialize_crd_descriptions_v1alpha1 }} +#### {{ index . 0 }} +
+ + + + + + + + +{{- range (index . 1) }} + {{- if not .deprecated }} + + + + + + {{- end }} +{{- end }} + +
Field NameRequiredDescription
{{ .name }}{{ if .required }}✅{{ end }} + + {{- $matches := findRESubmatch `(Array<|Map]+)(>*)` .type 1 -}} + {{- $captures := index $matches 0 -}} + + {{- $typePrefix := index $captures 1 -}} + {{- $baseType := index $captures 2 -}} + {{- $typeSuffix := index $captures 3 -}} + + {{- $typePrefix -}} + {{- if (index $types $baseType) -}} + + {{- end -}} + {{ $baseType }} + {{- if (index $types $baseType) -}} + + {{- end -}} + {{- $typeSuffix -}} + + + +

{{- .description | markdownify -}}

+ + + {{- if .default -}} +

Default: {{ .default }}

+ {{- end -}} + +
+{{ end }} diff --git a/doc/user/layouts/shortcodes/self-managed/versions/upgrade/upgrade-steps-local-kind.html b/doc/user/layouts/shortcodes/self-managed/versions/upgrade/upgrade-steps-local-kind.html index 94ee296c4aac3..145cd47fb2974 100644 --- a/doc/user/layouts/shortcodes/self-managed/versions/upgrade/upgrade-steps-local-kind.html +++ b/doc/user/layouts/shortcodes/self-managed/versions/upgrade/upgrade-steps-local-kind.html @@ -63,6 +63,33 @@ 1. Create a new `upgrade-materialize.yaml` file, updating the following fields: +
+ +
+
+ + | Field | Description | + |-------|-------------| + | `environmentdImageRef` | Update the version to the new version. This should be the same as the operator version: `{{ $operator_version }}`. | + | `forceRollout`| Enter a new UUID. Can be generated with `uuidgen`.
  • `forceRollout` triggers a rollout even if no other changes exist.
| + + + ```yaml + apiVersion: materialize.cloud/v1 + kind: Materialize + metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment + spec: + environmentdImageRef: materialize/environmentd:{{ $environmentd_version }} # Update version + # forceRollout: 33333333-3333-3333-3333-333333333333 # For forced rollouts + rolloutStrategy: WaitUntilReady # The mechanism to use when rolling out the new version. + backendSecretName: materialize-backend + ``` + +
+
+ | Field | Description | |-------|-------------| | `environmentdImageRef` | Update the version to the new version. This should be the same as the operator version: `{{ $operator_version }}`. | @@ -83,6 +110,10 @@ backendSecretName: materialize-backend ``` +
+
+
+ 1. Apply the upgrade-materialize.yaml file to your Materialize instance: ```shell diff --git a/doc/user/shared-content/self-managed/upgrade-notes/v26.0.md b/doc/user/shared-content/self-managed/upgrade-notes/v26.0.md index a59c43a810a08..a740e3c7e9224 100644 --- a/doc/user/shared-content/self-managed/upgrade-notes/v26.0.md +++ b/doc/user/shared-content/self-managed/upgrade-notes/v26.0.md @@ -8,7 +8,7 @@ - `ImmediatelyPromoteCausingDowntime` For more information, see - [`rolloutStrategy`](/self-managed-deployments/upgrading/#rollout-strategies). + [`rolloutStrategy`](/self-managed-deployments/upgrading/materialize-instances/v1/#rollout-strategies). - New requirements were introduced for [license keys](/releases/#license-key). To upgrade, you will first need to add a license key to the `backendSecret` diff --git a/doc/user/shared-content/self-managed/upgrade-notes/v26.29.md b/doc/user/shared-content/self-managed/upgrade-notes/v26.29.md new file mode 100644 index 0000000000000..be010a8d5f8d5 --- /dev/null +++ b/doc/user/shared-content/self-managed/upgrade-notes/v26.29.md @@ -0,0 +1,287 @@ +- v26.29.0 introduces a new version of the Materialize CRD, `v1`. The new CRD simplifies rollouts by automatically detecting changes. This means you will no longer need to manually rotate a UUID to trigger a rollout. 
+ +Before, on `v1alpha1`: +```yaml +apiVersion: materialize.cloud/v1alpha1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:v26.16.0 + requestRollout: 22222222-2222-2222-2222-222222222222 # ← MUST set a new UUID every upgrade +# forceRollout: 33333333-3333-3333-3333-333333333333 # ← for forced rollouts + rolloutStrategy: WaitUntilReady + backendSecretName: materialize-backend +``` + +After, on `v1`: +```yaml +apiVersion: materialize.cloud/v1 +kind: Materialize +metadata: + name: 12345678-1234-1234-1234-123456789012 + namespace: materialize-environment +spec: + environmentdImageRef: materialize/environmentd:v26.16.0 # ← just change this +# forceRollout: 33333333-3333-3333-3333-333333333333 # ← only for forced rollouts + rolloutStrategy: WaitUntilReady + backendSecretName: materialize-backend +``` + +With the new change, the `requestRollout` field will be removed, along with all previously deprecated fields. + +Switching to `v1` is **opt-in**. You may continue to apply `v1alpha1` CRs, and your existing instances will behave exactly as before. We recommend you opt in to `v1` at your convenience, as we will migrate to the `v1` behavior in the next major release. For step-by-step migration instructions, see [Switching from v1alpha1 to v1](/self-managed-deployments/upgrading/materialize-instances/#switching-from-v1alpha1-to-v1). + + +{{< important >}} + +Using the `v1` CRD requires infrastructure changes and an additional Helm +value, `operator.args.installV1CRD=true`. If you continue to use `v1alpha1`, +no infrastructure changes are required, but we recommend making them anyway, +as TLS certificates will become required in a future version of Materialize. + +{{< /important >}} + +Materialize uses conversion webhooks to allow you to gracefully migrate from +`v1alpha1` to `v1`.
The `v1` CRD and its conversion webhook are only installed +when you set the Helm value `operator.args.installV1CRD=true`. Before enabling +it, you need to install cert-manager (or provide your own certificate) and +allow internal network ingress on port `8001`. + +Choose the tab that matches your deployment method: + +
+ +
+
+ +If you are using the [supported Terraform +modules](https://github.com/MaterializeInc/materialize-terraform-self-managed), +the required infrastructure changes (cert-manager and network ingress) will be +handled for you automatically. + +To upgrade, update each module's `source` to point to the new release tag and +run `terraform init -upgrade && terraform plan && terraform apply`. To enable +the `v1` CRD, also set the Helm value `operator.args.installV1CRD=true` in the +values passed to the operator module. + +The key modules and their dependency chain are shown below. Your configuration +may include additional modules (networking, storage, database, node pools, etc.) +— update those to the same release tag as well. + +
+ +
+
+ +```hcl +module "eks" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//aws/modules/eks?ref=" + # ... your existing configuration ... +} + +module "cert_manager" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//kubernetes/modules/cert-manager?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.eks] +} + +module "operator" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//aws/modules/operator?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.cert_manager] +} + +module "materialize_instance" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//kubernetes/modules/materialize-instance?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.operator] +} +``` + +For a complete example, see +[`aws/examples/simple/main.tf`](https://github.com/MaterializeInc/materialize-terraform-self-managed/blob/main/aws/examples/simple/main.tf). + +
+
+ +```hcl +module "gke" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//gcp/modules/gke?ref=" + # ... your existing configuration ... +} + +module "cert_manager" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//kubernetes/modules/cert-manager?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.gke] +} + +module "operator" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//gcp/modules/operator?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.cert_manager] +} + +module "materialize_instance" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//kubernetes/modules/materialize-instance?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.operator] +} +``` + +For a complete example, see +[`gcp/examples/simple/main.tf`](https://github.com/MaterializeInc/materialize-terraform-self-managed/blob/main/gcp/examples/simple/main.tf). + +
+
+ +```hcl +module "aks" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//azure/modules/aks?ref=" + # ... your existing configuration ... +} + +module "cert_manager" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//kubernetes/modules/cert-manager?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.aks] +} + +module "operator" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//azure/modules/operator?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.cert_manager] +} + +module "materialize_instance" { + source = "github.com/MaterializeInc/materialize-terraform-self-managed//kubernetes/modules/materialize-instance?ref=" + # ... your existing configuration ... + + # Your configuration may have additional dependencies here. + depends_on = [module.operator] +} +``` + +For a complete example, see +[`azure/examples/simple/main.tf`](https://github.com/MaterializeInc/materialize-terraform-self-managed/blob/main/azure/examples/simple/main.tf). + +
+
+
+ +
+
+ +If you are using the legacy Terraform modules +([AWS](https://github.com/MaterializeInc/terraform-aws-materialize), +[GCP](https://github.com/MaterializeInc/terraform-gcp-materialize), or +[Azure](https://github.com/MaterializeInc/terraform-azure-materialize)), +we recommend migrating to the [new supported Terraform +modules](https://github.com/MaterializeInc/materialize-terraform-self-managed) +before opting in to the `v1` CRD. + +The new modules include built-in support for the conversion webhooks used by +the `v1` CRD, including cert-manager installation and network policy +configuration. The legacy modules do not include these changes, so you would +need to apply them manually (see the **Manual** tab). + +For migration guidance, see the documentation for your cloud provider: + +- [AWS migration guide](https://github.com/MaterializeInc/materialize-terraform-self-managed/tree/main/aws/examples/migration) +- [GCP migration guide](https://github.com/MaterializeInc/materialize-terraform-self-managed/tree/main/gcp/examples/migration) +- [Azure migration guide](https://github.com/MaterializeInc/materialize-terraform-self-managed/tree/main/azure/examples/migration) + +
+
+ +If you are not using our Terraform modules, you **must** complete the following +steps before enabling the `v1` CRD: + +**1. Install cert-manager** + +The conversion webhook requires a TLS certificate. +The Helm chart defaults to using [cert-manager](https://cert-manager.io/) +to automatically create and manage this certificate. cert-manager must be +installed **before** enabling the `v1` CRD. + +If you prefer to provide your own certificate instead of using cert-manager, +set the following Helm values: +- `operator.certificate.source`: `secret` +- `operator.certificate.secretName`: the name of the Kubernetes Secret + containing `ca.crt`, `tls.crt`, and `tls.key` entries. + +**2. Allow network access to the webhook port** + +The conversion webhooks require the Kubernetes API server to reach the +`orchestratord` pod on port `8001`. If your cluster enforces network policies +or cloud-level firewall rules, you must allow ingress traffic on TCP port +`8001` from the API server to pods with the label +`app.kubernetes.io/name: materialize-operator`. + +**Kubernetes NetworkPolicy:** Add a policy that allows ingress from the +Kubernetes API server on port `8001` to the `materialize-operator` pods in the +namespace where the operator is deployed: + +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-api-server-ingress-to-conversion-webhook + namespace: materialize # the namespace where the operator runs +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: materialize-operator + policyTypes: + - Ingress + ingress: + - ports: + - protocol: TCP + port: 8001 +``` + +**Cloud firewall rules (e.g., AWS security groups, GCP firewall rules):** +Ensure the node security group or firewall allows inbound TCP traffic on +port `8001` from the Kubernetes control plane. For example, on AWS, add an +ingress rule to the EKS node security group allowing port `8001` from the +cluster security group. 
On GCP with private clusters, add a firewall rule +allowing port `8001` from the GKE control plane CIDR. + +For a complete example of the required changes across AWS, Azure, and GCP, +see [this pull request](https://github.com/MaterializeInc/materialize-terraform-self-managed/pull/160). + +**3. Enable the v1 CRD** + +Once the prerequisites above are in place, set the following Helm value when +installing or upgrading the operator: + +```yaml +operator: + args: + installV1CRD: true +``` + +This installs the `v1` version of the Materialize CRD and the conversion +webhook that converts between `v1` and `v1alpha1`. + +
+
+
diff --git a/misc/helm-charts/operator/README.md b/misc/helm-charts/operator/README.md index b18af7d4e74e5..7c6d072b6fd98 100644 --- a/misc/helm-charts/operator/README.md +++ b/misc/helm-charts/operator/README.md @@ -143,7 +143,13 @@ The following table lists the configurable parameters of the Materialize operato | `operator.affinity` | Affinity to use for the operator pod | ``{}`` | | `operator.args.enableInternalStatementLogging` | | ``true`` | | `operator.args.enableLicenseKeyChecks` | | ``false`` | +| `operator.args.installV1CRD` | Whether to install the v1 version of the Materialize CRD and the conversion webhook that converts between v1 and v1alpha1. When false, only the v1alpha1 CRD version is installed and no webhook serving certificate or service is created. | ``false`` | | `operator.args.startupLogFilter` | Log filtering settings for startup logs | ``"INFO,mz_orchestratord=TRACE"`` | +| `operator.args.webhookCertReloadInterval` | How often orchestratord reloads its webhook TLS certificate from disk and, when the CA changes, refreshes the conversion webhook's CA bundle. Must be shorter than the certificate's lifetime. Accepts a humantime duration (e.g. "1h", "30m"). Leave null to use the binary default. Only used if `installV1CRD` is true. | ``nil`` | +| `operator.certificate.caDuration` | Lifetime of the root CA that signs the webhook serving certificate, when `source` is "cert-manager". The serving certificate is signed by this CA, so the CA outlives individual serving-certificate rotations. | ``"87600h"`` | +| `operator.certificate.caRenewBefore` | How long before the root CA expires to renew it. Must be less than `caDuration`. | ``"8760h"`` | +| `operator.certificate.secretName` | Name of a secret in the operator's namespace containing ca.crt, tls.crt, and tls.key entries. Only used if `source` is "secret". | ``nil`` | +| `operator.certificate.source` | Where to obtain the certificate for orchestratord. Valid values are 'cert-manager' and 'secret'. 
Only used if `operator.args.installV1CRD` is true. | ``"cert-manager"`` | | `operator.cloudProvider.providers.aws.accountID` | When using AWS, accountID is required | ``""`` | | `operator.cloudProvider.providers.aws.enabled` | | ``false`` | | `operator.cloudProvider.providers.aws.iam.roles.connection` | ARN for CREATE CONNECTION feature | ``""`` | diff --git a/misc/helm-charts/operator/templates/certificate.yaml b/misc/helm-charts/operator/templates/certificate.yaml new file mode 100644 index 0000000000000..a267f81a4cc82 --- /dev/null +++ b/misc/helm-charts/operator/templates/certificate.yaml @@ -0,0 +1,88 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +{{- if and .Values.operator.args.installV1CRD (eq .Values.operator.certificate.source "cert-manager") -}} +# We provision the webhook serving certificate from a stable root CA rather +# than as a bare self-signed certificate. The serving certificate rotates +# frequently, but it is always signed by the same long-lived CA, so the +# `ca.crt` that orchestratord registers as the conversion webhook's caBundle +# stays valid across serving-certificate rotations. orchestratord refreshes the +# caBundle if the CA itself ever changes, e.g. on the rare root CA renewal, so a +# routine serving-certificate rotation leaves the webhook undisturbed. +--- +# Bootstrap issuer used only to sign the root CA below. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "materialize-operator.fullname" . }}-self-signed + namespace: {{ .Release.Namespace }} + labels: + {{- include "materialize-operator.labels" . | nindent 4 }} +spec: + selfSigned: {} + +--- +# Long-lived root CA. 
Stays stable across serving-certificate rotations so the +# conversion webhook's caBundle remains valid without changing every rotation. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "materialize-operator.fullname" . }}-ca + namespace: {{ .Release.Namespace }} + labels: + {{- include "materialize-operator.labels" . | nindent 4 }} +spec: + isCA: true + commonName: {{ include "materialize-operator.fullname" . }}-ca + secretName: {{ include "materialize-operator.fullname" . }}-ca + duration: {{ .Values.operator.certificate.caDuration }} + renewBefore: {{ .Values.operator.certificate.caRenewBefore }} + privateKey: + algorithm: ECDSA + rotationPolicy: Always + issuerRef: + name: {{ include "materialize-operator.fullname" . }}-self-signed + kind: Issuer + group: cert-manager.io + +--- +# CA issuer that signs the serving certificate using the stable root CA. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "materialize-operator.fullname" . }}-ca + namespace: {{ .Release.Namespace }} + labels: + {{- include "materialize-operator.labels" . | nindent 4 }} +spec: + ca: + secretName: {{ include "materialize-operator.fullname" . }}-ca + +--- +# Webhook serving certificate. Rotates frequently; its `ca.crt` is the stable +# root CA above. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "materialize-operator.fullname" . }}-cert + namespace: {{ .Release.Namespace }} + labels: + {{- include "materialize-operator.labels" . | nindent 4 }} +spec: + dnsNames: + - {{ include "materialize-operator.fullname" . }}.{{ .Release.Namespace }}.svc + secretName: {{ include "materialize-operator.fullname" . }}-cert + privateKey: + algorithm: ECDSA + rotationPolicy: Always + issuerRef: + name: {{ include "materialize-operator.fullname" . 
}}-ca + kind: Issuer + group: cert-manager.io +{{- end -}} diff --git a/misc/helm-charts/operator/templates/clusterrole.yaml b/misc/helm-charts/operator/templates/clusterrole.yaml index ce265f5d89ebc..b981930372a82 100644 --- a/misc/helm-charts/operator/templates/clusterrole.yaml +++ b/misc/helm-charts/operator/templates/clusterrole.yaml @@ -76,6 +76,7 @@ rules: - apiGroups: ["apiextensions.k8s.io"] resources: - customresourcedefinitions + - customresourcedefinitions/status verbs: - create - update diff --git a/misc/helm-charts/operator/templates/deployment.yaml b/misc/helm-charts/operator/templates/deployment.yaml index 50c5f0823ee20..dc2cc0eca936d 100644 --- a/misc/helm-charts/operator/templates/deployment.yaml +++ b/misc/helm-charts/operator/templates/deployment.yaml @@ -66,6 +66,12 @@ spec: {{- if not .Values.operator.args.enableLicenseKeyChecks }} - "--disable-license-key-checks" {{- end }} + {{- if .Values.operator.args.installV1CRD }} + - "--install-v1-crd" + {{- if .Values.operator.args.webhookCertReloadInterval }} + - "--webhook-cert-reload-interval={{ .Values.operator.args.webhookCertReloadInterval }}" + {{- end }} + {{- end }} {{/* AWS Configuration */}} {{- if eq .Values.operator.cloudProvider.type "aws" }} @@ -242,9 +248,19 @@ spec: - > --additional-crd-columns={{ toJson .Values.operator.additionalMaterializeCRDColumns }} {{- end }} + {{- if .Values.operator.args.installV1CRD }} + - "--webhook-service-name" + - {{ include "materialize-operator.fullname" . 
}} + - "--webhook-service-namespace" + - {{ .Release.Namespace }} + {{- end }} ports: - containerPort: 3100 name: metrics + {{- if .Values.operator.args.installV1CRD }} + - containerPort: 8001 + name: webhook + {{- end }} resources: {{- toYaml .Values.operator.resources | nindent 10 }} securityContext: @@ -256,3 +272,29 @@ spec: runAsNonRoot: true seccompProfile: type: RuntimeDefault + {{- if .Values.operator.args.installV1CRD }} + livenessProbe: + httpGet: + path: /healthz + port: webhook + scheme: HTTPS + failureThreshold: 3 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: webhook + scheme: HTTPS + failureThreshold: 1 + periodSeconds: 10 + volumeMounts: + - mountPath: /etc/tls + name: certificate + readOnly: true + volumes: + - name: certificate + secret: + defaultMode: 256 + optional: false + secretName: {{ if eq .Values.operator.certificate.source "cert-manager" }}{{ include "materialize-operator.fullname" . }}-cert{{ else }}{{ .Values.operator.certificate.secretName }}{{ end }} + {{- end }} diff --git a/misc/helm-charts/operator/templates/service.yaml b/misc/helm-charts/operator/templates/service.yaml new file mode 100644 index 0000000000000..b02d58d057bdd --- /dev/null +++ b/misc/helm-charts/operator/templates/service.yaml @@ -0,0 +1,27 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +{{- if .Values.operator.args.installV1CRD -}} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ include "materialize-operator.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "materialize-operator.labels" . 
| nindent 4 }} +spec: + selector: + {{- include "materialize-operator.selectorLabels" . | nindent 4 }} + ports: + - name: webhook + protocol: TCP + port: 8001 + targetPort: 8001 +{{- end -}} diff --git a/misc/helm-charts/operator/values.yaml b/misc/helm-charts/operator/values.yaml index 0b8712d6a994a..510e5486eb097 100644 --- a/misc/helm-charts/operator/values.yaml +++ b/misc/helm-charts/operator/values.yaml @@ -24,6 +24,17 @@ operator: enableInternalStatementLogging: true # Newer versions ignore this setting and always enforce license key checks. enableLicenseKeyChecks: false + # -- Whether to install the v1 version of the Materialize CRD and the + # conversion webhook that converts between v1 and v1alpha1. When false, + # only the v1alpha1 CRD version is installed and no webhook serving + # certificate or service is created. + installV1CRD: false + # -- (string) How often orchestratord reloads its webhook TLS certificate + # from disk and, when the CA changes, refreshes the conversion webhook's CA + # bundle. Must be shorter than the certificate's lifetime. Accepts a + # humantime duration (e.g. "1h", "30m"). Leave null to use the binary + # default. Only used if `installV1CRD` is true. + webhookCertReloadInterval: null # -- Additional columns to display when printing the Materialize CRD in table format. additionalMaterializeCRDColumns: {} @@ -34,6 +45,23 @@ operator: # priority: 2 # type: "string" + # Webhook serving certificate configuration. Only used if + # `operator.args.installV1CRD` is true. + certificate: + # -- (string) Where to obtain the certificate for orchestratord. Valid values are 'cert-manager' and 'secret'. Only used if `operator.args.installV1CRD` is true. + source: cert-manager + # -- (string) Name of a secret in the operator's namespace containing ca.crt, tls.crt, and tls.key entries. Only used if `source` is "secret". 
+ secretName: null + # -- (string) Lifetime of the root CA that signs the webhook serving + # certificate, when `source` is "cert-manager". The serving certificate is + # signed by this CA, so the CA outlives individual serving-certificate + # rotations. + caDuration: 87600h + # -- (string) How long before the root CA expires to renew it. Must be less + # than `caDuration`. + caRenewBefore: 8760h + + # Cloud provider configuration cloudProvider: # -- Specifies cloud provider. Valid values are 'aws', 'gcp', 'azure' , 'generic', or 'local' diff --git a/src/cloud-resources/Cargo.toml b/src/cloud-resources/Cargo.toml index 731e324f1a24c..94a78f6e23737 100644 --- a/src/cloud-resources/Cargo.toml +++ b/src/cloud-resources/Cargo.toml @@ -28,8 +28,9 @@ schemars.workspace = true semver.workspace = true serde.workspace = true serde_json.workspace = true +sha2.workspace = true tracing.workspace = true -uuid = { workspace = true, features = ["serde", "v4"] } +uuid = { workspace = true, features = ["serde", "v4", "v5"] } async-trait = { workspace = true, optional = true } mz-repr = { path = "../repr", default-features = false, optional = true } diff --git a/src/cloud-resources/src/bin/crd_writer.rs b/src/cloud-resources/src/bin/crd_writer.rs index 7a51c8386ba4e..b2355d5656751 100644 --- a/src/cloud-resources/src/bin/crd_writer.rs +++ b/src/cloud-resources/src/bin/crd_writer.rs @@ -8,9 +8,11 @@ // by the Apache License, Version 2.0. 
use std::cmp::Ordering; +use std::env; use indexmap::{IndexMap, IndexSet}; -use mz_cloud_resources::crd::materialize::v1alpha1::MaterializeSpec; +use mz_cloud_resources::crd::materialize::v1; +use mz_cloud_resources::crd::materialize::v1alpha1; use schemars::schema_for; use serde::Serialize; @@ -328,7 +330,14 @@ fn format_enum_variants_from_json( } fn main() { - let root_schema = schema_for!(MaterializeSpec); + let args: Vec = env::args().collect(); + let version = args.get(1).expect("usage: crd-writer "); + + let root_schema = match version.as_str() { + "v1alpha1" => schema_for!(v1alpha1::MaterializeSpec), + "v1" => schema_for!(v1::MaterializeSpec), + other => panic!("unknown version: {other}, expected v1alpha1 or v1"), + }; // Convert all to JSON for easier merging let schema_json = root_schema.to_value(); diff --git a/src/cloud-resources/src/crd.rs b/src/cloud-resources/src/crd.rs index 971939859658e..62878e8c23bab 100644 --- a/src/cloud-resources/src/crd.rs +++ b/src/cloud-resources/src/crd.rs @@ -13,7 +13,9 @@ use std::collections::BTreeMap; use std::time::Duration; use futures::future::join_all; -use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomResourceDefinition; +use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::{ + CustomResourceConversion, CustomResourceDefinition, +}; use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference; use kube::{ Api, Client, Resource, ResourceExt, @@ -105,6 +107,9 @@ fn owner_reference>(t: &T) -> OwnerReference { pub struct VersionedCrd { pub crds: Vec, pub stored_version: String, + /// Conversion configuration to apply after merging CRDs. + /// `merge_crds` drops the conversion field, so we must set it after merging. 
+ pub conversion: Option, } pub async fn register_versioned_crds( @@ -158,7 +163,10 @@ async fn register_custom_resource( let crd_name = format!("{}.{}", &crds[0].spec.names.plural, &crds[0].spec.group); info!("Registering {} crd", &crd_name); let crd_api = Api::::all(kube_client); - let crd = merge_crds(crds, &versioned_crds.stored_version).unwrap(); + let mut crd = merge_crds(crds, &versioned_crds.stored_version).unwrap(); + if let Some(conversion) = versioned_crds.conversion { + crd.spec.conversion = Some(conversion); + } let crd_json = serde_json::to_string(&serde_json::json!(&crd))?; info!(crd_json = %crd_json); crd_api diff --git a/src/cloud-resources/src/crd/materialize.rs b/src/cloud-resources/src/crd/materialize.rs index 91d79dd686024..afe22d675accd 100644 --- a/src/cloud-resources/src/crd/materialize.rs +++ b/src/cloud-resources/src/crd/materialize.rs @@ -22,6 +22,7 @@ use kube::{CustomResource, Resource, ResourceExt}; use schemars::JsonSchema; use semver::Version; use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; use uuid::Uuid; use crate::crd::{ManagedResource, MaterializeCertSpec, new_resource_id}; @@ -29,83 +30,848 @@ use mz_server_core::listeners::AuthenticatorKind; pub const LAST_KNOWN_ACTIVE_GENERATION_ANNOTATION: &str = "materialize.cloud/last-known-active-generation"; +pub const FORCE_ROLLOUT_ANNOTATION: &str = "materialize.cloud/force-rollout"; + +#[derive(Clone, Debug, Default, PartialEq, Deserialize, Serialize, JsonSchema)] +pub enum MaterializeRolloutStrategy { + /// Create a new generation of pods, leaving the old generation around until the + /// new ones are ready to take over. + /// This minimizes downtime, and is what almost everyone should use. + #[default] + WaitUntilReady, + + /// Create a new generation of pods, leaving the old generation as the serving generation + /// until the user manually promotes the new generation. 
+ /// + /// When using `ManuallyPromote`, the new generation can be promoted at any + /// time, even if it has dataflows that are not fully caught up, by setting + /// `forcePromote` to the same value as `requestRollout` in the Materialize spec. + /// + /// To minimize downtime, promotion should occur when the new generation + /// has caught up to the prior generation. To determine if the new + /// generation has caught up, consult the `UpToDate` condition in the + /// status of the Materialize Resource. If the condition's reason is + /// `ReadyToPromote`, the new generation is ready to promote. + /// + /// {{<warning>}} + /// Do not leave new generations unpromoted indefinitely. + /// + /// The new generation keeps open read holds which prevent compaction. Once promoted or + /// cancelled, those read holds are released. If left unpromoted for an extended time, this + /// data can build up, and can cause extreme deletion load on the metadata backend database + /// when finally promoted or cancelled. + /// + /// To guard against this, a rollout that remains in progress longer + /// than `rolloutRequestTimeout` (default 24h) is automatically + /// cancelled. + /// {{</warning>}} + ManuallyPromote, + + /// {{<warning>}} + /// THIS WILL CAUSE YOUR MATERIALIZE INSTANCE TO BE UNAVAILABLE FOR SOME TIME!!! + /// + /// This strategy should ONLY be used by customers with physical hardware who do not have + /// enough hardware for the `WaitUntilReady` strategy. If you think you want this, please + /// consult with Materialize engineering to discuss your situation. + /// {{</warning>}} + /// + /// Tear down the old generation of pods and promote the new generation of pods immediately, + /// without waiting for the new generation of pods to be ready. + ImmediatelyPromoteCausingDowntime, +} + +/// Default for [`RolloutRequestTimeout`].
A new generation that sits +/// un-promoted holds back compaction via read holds, and promoting it +/// after a long delay can cause incident-inducing load; 24h is a +/// conservative upper bound on how long any rollout should take. +pub const DEFAULT_ROLLOUT_REQUEST_TIMEOUT: &str = "24h"; + +/// The maximum time [`v1alpha1::MaterializeSpec::rollout_request_timeout`] allows a +/// rollout to remain in progress. +/// +/// A transparent wrapper around the duration string whose [`Default`] is +/// [`DEFAULT_ROLLOUT_REQUEST_TIMEOUT`]. Routing the default through `Default` +/// keeps a single source of truth: the derived `Default` for +/// [`v1alpha1::MaterializeSpec`], serde's `#[serde(default)]` (applied when the field +/// is omitted on deserialize), and the schema default surfaced in the +/// generated CRD (so the API server fills it in and `kubectl explain` shows +/// it) all resolve to the same value. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] +#[serde(transparent)] +pub struct RolloutRequestTimeout(pub String); + +impl Default for RolloutRequestTimeout { + fn default() -> Self { + RolloutRequestTimeout(DEFAULT_ROLLOUT_REQUEST_TIMEOUT.to_owned()) + } +} pub mod v1alpha1 { use super::*; - #[derive(Clone, Debug, Default, PartialEq, Deserialize, Serialize, JsonSchema)] - pub enum MaterializeRolloutStrategy { - /// Create a new generation of pods, leaving the old generation around until the - /// new ones are ready to take over. - /// This minimizes downtime, and is what almost everyone should use. 
- #[default] - WaitUntilReady, + #[derive( + CustomResource, + Clone, + Debug, + Default, + PartialEq, + Deserialize, + Serialize, + JsonSchema + )] + #[serde(rename_all = "camelCase")] + #[kube( + namespaced, + group = "materialize.cloud", + version = "v1alpha1", + kind = "Materialize", + singular = "materialize", + plural = "materializes", + shortname = "mzs", + status = "MaterializeStatus", + printcolumn = r#"{"name": "ImageRefRunning", "type": "string", "description": "Reference to the Docker image that is currently in use.", "jsonPath": ".status.lastCompletedRolloutEnvironmentdImageRef", "priority": 1}"#, + printcolumn = r#"{"name": "ImageRefToDeploy", "type": "string", "description": "Reference to the Docker image which will be deployed on the next rollout.", "jsonPath": ".spec.environmentdImageRef", "priority": 1}"#, + printcolumn = r#"{"name": "UpToDate", "type": "string", "description": "Whether the spec has been applied", "jsonPath": ".status.conditions[?(@.type==\"UpToDate\")].status", "priority": 1}"# + )] + pub struct MaterializeSpec { + /// The environmentd image to run. + pub environmentd_image_ref: String, + /// Extra args to pass to the environmentd binary. + pub environmentd_extra_args: Option>, + /// Extra environment variables to pass to the environmentd binary. + pub environmentd_extra_env: Option>, + /// {{}} + /// Deprecated. + /// + /// Use `service_account_annotations` to set "eks.amazonaws.com/role-arn" instead. + /// {{}} + /// + /// If running in AWS, override the IAM role to use to give + /// environmentd access to the persist S3 bucket. + #[kube(deprecated)] + pub environmentd_iam_role_arn: Option, + /// If running in AWS, override the IAM role to use to support + /// the CREATE CONNECTION feature. + pub environmentd_connection_role_arn: Option, + /// Resource requirements for the environmentd pod. + pub environmentd_resource_requirements: Option, + /// Amount of disk to allocate, if a storage class is provided. 
+ pub environmentd_scratch_volume_storage_requirement: Option, + /// Resource requirements for the balancerd pod. + pub balancerd_resource_requirements: Option, + /// Resource requirements for the console pod. + pub console_resource_requirements: Option, + /// Number of balancerd pods to create. + pub balancerd_replicas: Option, + /// Number of console pods to create. + pub console_replicas: Option, + + /// Name of the kubernetes service account to use. + /// If not set, we will create one with the same name as this Materialize object. + pub service_account_name: Option, + /// Annotations to apply to the service account. + /// + /// Annotations on service accounts are commonly used by cloud providers for IAM. + /// AWS uses "eks.amazonaws.com/role-arn". + /// Azure uses "azure.workload.identity/client-id", but + /// additionally requires "azure.workload.identity/use": "true" on the pods. + pub service_account_annotations: Option>, + /// Labels to apply to the service account. + pub service_account_labels: Option>, + /// Annotations to apply to the pods. + pub pod_annotations: Option>, + /// Labels to apply to the pods. + pub pod_labels: Option>, + + /// When changes are made to the environmentd resources (either via + /// modifying fields in the spec here or by deploying a new + /// orchestratord version which changes how resources are generated), + /// existing environmentd processes won't be automatically restarted. + /// In order to trigger a restart, the request_rollout field should be + /// set to a new (random) value. Once the rollout completes, the value + /// of `status.lastCompletedRolloutRequest` will be set to this value + /// to indicate completion. + /// + /// Defaults to a random value in order to ensure that the first + /// generation rollout is automatically triggered. 
+ #[serde(default)] + pub request_rollout: Uuid, + /// If `forcePromote` is set to the same value as `requestRollout`, the + /// current rollout will skip waiting for clusters in the new + /// generation to rehydrate before promoting the new environmentd to + /// leader. + #[serde(default)] + pub force_promote: String, + /// This value will be written to an annotation in the generated + /// environmentd statefulset, in order to force the controller to + /// detect the generated resources as changed even if no other changes + /// happened. This can be used to force a rollout to a new generation + /// even without making any meaningful changes, by setting it to the + /// same value as `requestRollout`. + #[serde(default)] + pub force_rollout: Uuid, + /// {{}} + /// Deprecated and ignored. Use `rolloutStrategy` instead. + /// {{}} + #[kube(deprecated)] + #[serde(default)] + pub in_place_rollout: bool, + /// Rollout strategy to use when upgrading this Materialize instance. + #[serde(default)] + pub rollout_strategy: MaterializeRolloutStrategy, + /// The maximum amount of time a rollout may remain in progress before + /// it is automatically cancelled. + /// + /// While a rollout is in progress, the new generation of `environmentd` + /// runs in a read-only, un-promoted state and holds back compaction via + /// read holds. Leaving it in this state for too long can cause + /// incident-inducing load when it is eventually promoted, so the + /// operator cancels the rollout once this timeout is exceeded: the new + /// generation is torn down and the previously-active generation + /// continues serving. A new rollout can then be triggered by setting + /// `requestRollout` to a new value. + /// + /// This does not apply to the `ImmediatelyPromoteCausingDowntime` + /// rollout strategy or to force-promoted rollouts, since by the time + /// those are in progress the old generation may already be gone. + /// + /// The value is parsed as a human-readable duration, e.g. 
`24h`, + /// `90m`, or `1h 30m`. Defaults to [`DEFAULT_ROLLOUT_REQUEST_TIMEOUT`] + /// when omitted (the API server fills it in); an unparseable value also + /// falls back to that default. + #[serde(default)] + pub rollout_request_timeout: RolloutRequestTimeout, + /// The name of a secret containing `metadata_backend_url` and `persist_backend_url`. + /// It may also contain `external_login_password_mz_system`, which will be used as + /// the password for the `mz_system` user if `authenticatorKind` is `Password`, + /// `Sasl`, or `Oidc`. + pub backend_secret_name: String, + /// How to authenticate with Materialize. + #[serde(default)] + pub authenticator_kind: AuthenticatorKind, + /// Whether to enable role based access control. Defaults to false. + #[serde(default)] + pub enable_rbac: bool, + + /// The value used by environmentd (via the --environment-id flag) to + /// uniquely identify this instance. Must be globally unique, and + /// is required if a license key is not provided. + /// NOTE: This value MUST NOT be changed in an existing instance, + /// since it affects things like the way data is stored in the persist + /// backend. + #[serde(default)] + pub environment_id: Uuid, + + /// The name of a ConfigMap containing system parameters in JSON format. + /// The ConfigMap must contain a `system-params.json` key whose value + /// is a valid JSON object containing valid system parameters. + /// + /// Run `SHOW ALL` in SQL to see a subset of configurable system parameters. + /// + /// Example ConfigMap: + /// ```yaml + /// data: + /// system-params.json: | + /// { + /// "max_connections": 1000 + /// } + /// ``` + pub system_parameter_configmap_name: Option, + + /// The configuration for generating an x509 certificate using cert-manager for balancerd + /// to present to incoming connections. + /// The `dnsNames` and `issuerRef` fields are required. 
+ pub balancerd_external_certificate_spec: Option, + /// The configuration for generating an x509 certificate using cert-manager for the console + /// to present to incoming connections. + /// The `dnsNames` and `issuerRef` fields are required. + /// Not yet implemented. + pub console_external_certificate_spec: Option, + /// The cert-manager Issuer or ClusterIssuer to use for database internal communication. + /// The `issuerRef` field is required. + /// This currently is only used for environmentd, but will eventually support clusterd. + /// Not yet implemented. + pub internal_certificate_spec: Option, + } + + impl Materialize { + pub fn backend_secret_name(&self) -> String { + self.spec.backend_secret_name.clone() + } + + pub fn namespace(&self) -> String { + self.meta().namespace.clone().unwrap() + } + + pub fn create_service_account(&self) -> bool { + self.spec.service_account_name.is_none() + } + + pub fn service_account_name(&self) -> String { + self.spec + .service_account_name + .clone() + .unwrap_or_else(|| self.name_unchecked()) + } + + pub fn role_name(&self) -> String { + self.name_unchecked() + } + + pub fn role_binding_name(&self) -> String { + self.name_unchecked() + } + + pub fn environmentd_statefulset_name(&self, generation: u64) -> String { + self.name_prefixed(&format!("environmentd-{generation}")) + } + + pub fn environmentd_app_name(&self) -> String { + "environmentd".to_owned() + } + + pub fn environmentd_service_name(&self) -> String { + self.name_prefixed("environmentd") + } + + pub fn environmentd_service_internal_fqdn(&self) -> String { + format!( + "{}.{}.svc.cluster.local", + self.environmentd_service_name(), + self.meta().namespace.as_ref().unwrap() + ) + } + + pub fn environmentd_generation_service_name(&self, generation: u64) -> String { + self.name_prefixed(&format!("environmentd-{generation}")) + } + + pub fn balancerd_app_name(&self) -> String { + "balancerd".to_owned() + } + + pub fn environmentd_certificate_name(&self) -> String 
{ + self.name_prefixed("environmentd-external") + } + + pub fn environmentd_certificate_secret_name(&self) -> String { + self.name_prefixed("environmentd-tls") + } + + pub fn balancerd_deployment_name(&self) -> String { + self.name_prefixed("balancerd") + } + + pub fn balancerd_service_name(&self) -> String { + self.name_prefixed("balancerd") + } + + pub fn console_app_name(&self) -> String { + "console".to_owned() + } + + pub fn balancerd_external_certificate_name(&self) -> String { + self.name_prefixed("balancerd-external") + } + + pub fn balancerd_external_certificate_secret_name(&self) -> String { + self.name_prefixed("balancerd-external-tls") + } + + pub fn balancerd_replicas(&self) -> i32 { + self.spec.balancerd_replicas.unwrap_or(2) + } + + pub fn console_replicas(&self) -> i32 { + self.spec.console_replicas.unwrap_or(2) + } + + pub fn console_configmap_name(&self) -> String { + self.name_prefixed("console") + } + + pub fn console_deployment_name(&self) -> String { + self.name_prefixed("console") + } + + pub fn console_service_name(&self) -> String { + self.name_prefixed("console") + } + + pub fn console_external_certificate_name(&self) -> String { + self.name_prefixed("console-external") + } + + pub fn console_external_certificate_secret_name(&self) -> String { + self.name_prefixed("console-external-tls") + } + + pub fn persist_pubsub_service_name(&self, generation: u64) -> String { + self.name_prefixed(&format!("persist-pubsub-{generation}")) + } + + pub fn listeners_configmap_name(&self, generation: u64) -> String { + self.name_prefixed(&format!("listeners-{generation}")) + } + + pub fn name_prefixed(&self, suffix: &str) -> String { + format!("mz{}-{}", self.resource_id(), suffix) + } + + pub fn resource_id(&self) -> &str { + &self.status.as_ref().unwrap().resource_id + } + + pub fn system_parameter_configmap_name(&self) -> Option { + self.spec.system_parameter_configmap_name.clone() + } + + pub fn environmentd_scratch_volume_storage_requirement(&self) -> 
Quantity { + self.spec + .environmentd_scratch_volume_storage_requirement + .clone() + .unwrap_or_else(|| { + self.spec + .environmentd_resource_requirements + .as_ref() + .and_then(|requirements| { + requirements + .requests + .as_ref() + .or(requirements.limits.as_ref()) + }) + // TODO: in cloud, we've been defaulting to twice the + // memory limit, but k8s-openapi doesn't seem to + // provide any way to parse Quantity values, so there + // isn't an easy way to do arithmetic on it + .and_then(|requirements| requirements.get("memory").cloned()) + // TODO: is there a better default to use here? + .unwrap_or_else(|| Quantity("4096Mi".to_string())) + }) + } + + pub fn environment_id(&self, cloud_provider: &str, region: &str) -> String { + format!( + "{}-{}-{}-0", + cloud_provider, region, self.spec.environment_id, + ) + } + + pub fn requested_reconciliation_id(&self) -> Uuid { + self.spec.request_rollout + } + + pub fn rollout_requested(&self) -> bool { + self.requested_reconciliation_id() + != self + .status + .as_ref() + .map_or_else(Uuid::nil, |status| status.last_completed_rollout_request) + } + + /// The maximum amount of time a rollout may remain in progress before + /// it is automatically cancelled. Parsed from + /// [`MaterializeSpec::rollout_request_timeout`], falling back to + /// [`DEFAULT_ROLLOUT_REQUEST_TIMEOUT`] when unset or unparseable. + pub fn rollout_request_timeout(&self) -> Duration { + let timeout = &self.spec.rollout_request_timeout.0; + humantime::parse_duration(timeout) + .or_else(|e| { + tracing::warn!( + rollout_request_timeout = %timeout, + "failed to parse rolloutRequestTimeout, using default: {e}", + ); + humantime::parse_duration(DEFAULT_ROLLOUT_REQUEST_TIMEOUT) + }) + .expect("DEFAULT_ROLLOUT_REQUEST_TIMEOUT must be a valid duration") + } + + /// If a timeout-eligible rollout is currently in progress, returns the + /// time at which it entered the in-progress (`Unknown`) state. Used to + /// enforce the rollout timeout. 
+ /// + /// The `Applying` and `ReadyToPromote` phases are both reported as a + /// single in-progress window: [`Self::up_to_date_transition_time`] + /// carries the timestamp forward across them (they share the `Unknown` + /// status), so the timeout spans the whole pre-promotion rollout rather + /// than resetting at each phase. + /// + /// The `Promoting` phase is deliberately excluded even though it is + /// also `Unknown`: once a rollout has reached promotion it must never + /// be cancelled by the timeout, since the previously-active generation + /// may already be torn down, leaving nothing to fall back to. (The + /// controller also never reaches the timeout check while promoting, + /// because `is_promoting` takes priority; this is belt-and-suspenders.) + pub fn rollout_in_progress_since(&self) -> Option { + self.status + .as_ref()? + .conditions + .iter() + .find_map(|condition| { + if condition.type_ == "UpToDate" + && condition.status == "Unknown" + && condition.reason != "Promoting" + { + Some(condition.last_transition_time.0) + } else { + None + } + }) + } + + /// The `last_transition_time` to record for a new `UpToDate` condition + /// with `new_status`, following the Kubernetes convention that + /// `last_transition_time` marks when the condition's *status* last + /// changed — not its reason or message. While the status is unchanged + /// the existing timestamp is carried forward; it only resets to `now` + /// when the status actually changes (or there is no prior condition). + /// + /// This is what lets a rollout that moves through several same-status + /// phases (`Applying` -> `ReadyToPromote`, both `Unknown`) be measured + /// from when it first entered that status, so the rollout timeout + /// covers the phases together instead of restarting at each one. 
+ pub fn up_to_date_transition_time(&self, new_status: &str, now: Timestamp) -> Timestamp { + self.status + .as_ref() + .and_then(|status| { + status + .conditions + .iter() + .find(|condition| condition.type_ == "UpToDate") + }) + .filter(|condition| condition.status == new_status) + .map_or(now, |condition| condition.last_transition_time.0) + } + + /// Returns the environmentd image ref of the currently-active + /// generation: the image of the last completed rollout, falling back + /// to the spec image when no rollout has completed yet. Downstream + /// resources (balancerd, console) should track this rather than + /// [`MaterializeSpec::environmentd_image_ref`] so they stay aligned + /// with the running environmentd when the spec is mid-rollout or has + /// been partially reverted (DEP-42). + pub fn active_environmentd_image_ref(&self) -> &str { + self.status + .as_ref() + .and_then(|s| s.last_completed_rollout_environmentd_image_ref.as_deref()) + .unwrap_or(&self.spec.environmentd_image_ref) + } + + pub fn set_force_promote(&mut self) { + self.spec.force_promote = self.spec.request_rollout.hyphenated().to_string(); + } + + pub fn should_force_promote(&self) -> bool { + self.spec.force_promote == self.spec.request_rollout.hyphenated().to_string() + || self.spec.force_promote + == super::v1::Materialize::from(self.clone()).generate_rollout_hash() + || self.spec.rollout_strategy + == MaterializeRolloutStrategy::ImmediatelyPromoteCausingDowntime + } + + pub fn conditions_need_update(&self) -> bool { + let Some(status) = self.status.as_ref() else { + return true; + }; + if status.conditions.is_empty() { + return true; + } + for condition in &status.conditions { + if condition.observed_generation != self.meta().generation { + return true; + } + } + false + } + + pub fn is_ready_to_promote(&self, resources_hash: &str) -> bool { + let Some(status) = self.status.as_ref() else { + return false; + }; + if status.conditions.is_empty() { + return false; + } + status + 
.conditions + .iter() + .any(|condition| condition.reason == "ReadyToPromote") + && &status.resources_hash == resources_hash + } + + pub fn is_promoting(&self) -> bool { + let Some(status) = self.status.as_ref() else { + return false; + }; + if status.conditions.is_empty() { + return false; + } + status + .conditions + .iter() + .any(|condition| condition.reason == "Promoting") + } + + pub fn update_in_progress(&self) -> bool { + let Some(status) = self.status.as_ref() else { + return false; + }; + if status.conditions.is_empty() { + return false; + } + for condition in &status.conditions { + if condition.type_ == "UpToDate" && condition.status == "Unknown" { + return true; + } + } + false + } + + /// Checks that the existing version is greater than or equal + /// to the given minimum version, if the existing version + /// can be parsed. + pub fn meets_minimum_version(&self, minimum: &Version) -> bool { + let version = parse_image_ref(&self.spec.environmentd_image_ref); + match version { + // Use cmp_precedence() to ignore build metadata per SemVer 2.0.0 spec + Some(version) => version.cmp_precedence(minimum).is_ge(), + // In the rare case that we see an image reference + // that we can't parse, we assume that it satisfies all + // version checks. Usually these are custom images that have + // been built by a developer on a branch forked from a recent copy + // of main, and so this works out reasonably well in practice. + None => { + tracing::warn!( + image_ref = %self.spec.environmentd_image_ref, + "failed to parse image ref", + ); + true + } + } + } + + /// This check isn't strictly required since environmentd will still be able to determine + /// if the upgrade is allowed or not. However, doing this check allows us to provide + /// the error as soon as possible and in a more user friendly way.
+ pub fn is_valid_upgrade_version(active_version: &Version, next_version: &Version) -> bool { + // Don't allow rolling back + // Note: semver comparison handles RC versions correctly: + // v26.0.0-rc.1 < v26.0.0-rc.2 < v26.0.0 + // Use cmp_precedence() to ignore build metadata + if next_version.cmp_precedence(active_version) == std::cmp::Ordering::Less { + return false; + } + + if active_version.major == 0 { + if next_version.major != active_version.major { + if next_version.major == 26 { + // We require customers to upgrade from 0.147.20 (Self Managed 25.2) or v0.164.X (Cloud) + // before upgrading to 26.0.0 + return (active_version.minor == 147 && active_version.patch >= 20) + || active_version.minor >= 164; + } else { + return false; + } + } + // Self managed 25.1 to 25.2 + if next_version.minor == 147 && active_version.minor == 130 { + return true; + } + // only allow upgrading a single minor version at a time + return next_version.minor <= active_version.minor + 1; + } else if active_version.major >= 26 { + // For versions 26.X.X and onwards, we deny upgrades past 1 major version of the active version + return next_version.major <= active_version.major + 1; + } + + true + } + + /// Checks if the current environmentd image ref is within the upgrade window of the last + /// successful rollout. 
+ pub fn within_upgrade_window(&self) -> bool { + let active_environmentd_version = self + .status + .as_ref() + .and_then(|status| { + status + .last_completed_rollout_environmentd_image_ref + .as_ref() + }) + .and_then(|image_ref| parse_image_ref(image_ref)); + + if let (Some(next_environmentd_version), Some(active_environmentd_version)) = ( + parse_image_ref(&self.spec.environmentd_image_ref), + active_environmentd_version, + ) { + Self::is_valid_upgrade_version( + &active_environmentd_version, + &next_environmentd_version, + ) + } else { + // If we fail to parse either version, + // we still allow the upgrade since environmentd will still error if the upgrade is not allowed. + true + } + } + + pub fn status(&self) -> MaterializeStatus { + self.status.clone().unwrap_or_else(|| { + let mut status = MaterializeStatus::default(); + + status.resource_id = new_resource_id(); + + // If we're creating the initial status on an un-soft-deleted + // Environment we need to ensure that the last active generation + // is restored, otherwise the env will crash loop indefinitely + // as its catalog would have durably recorded a greater generation + if let Some(last_active_generation) = self + .annotations() + .get(LAST_KNOWN_ACTIVE_GENERATION_ANNOTATION) + { + status.active_generation = last_active_generation + .parse() + .expect("valid int generation"); + } + + // Initialize the last completed rollout environmentd image ref to + // the current image ref if not already set. + status.last_completed_rollout_environmentd_image_ref = + Some(self.spec.environmentd_image_ref.clone()); + + status + }) + } + } - /// Create a new generation of pods, leaving the old generation as the serving generation - /// until the user manually promotes the new generation. 
- /// - /// When using `ManuallyPromote`, the new generation can be promoted at any - /// time, even if it has dataflows that are not fully caught up, by setting - /// `forcePromote` to the same value as `requestRollout` in the Materialize spec. - /// - /// To minimize downtime, promotion should occur when the new generation - /// has caught up to the prior generation. To determine if the new - /// generation has caught up, consult the `UpToDate` condition in the - /// status of the Materialize Resource. If the condition's reason is - /// `ReadyToPromote` the new generation is ready to promote. - /// - /// {{}} - /// Do not leave new generations unpromoted indefinitely. - /// - /// The new generation keeps open read holds which prevent compaction. Once promoted or - /// cancelled, those read holds are released. If left unpromoted for an extended time, this - /// data can build up, and can cause extreme deletion load on the metadata backend database - /// when finally promoted or cancelled. - /// - /// To guard against this, a rollout that remains in progress longer - /// than `rolloutRequestTimeout` (default 24h) is automatically - /// cancelled. - /// {{}} - ManuallyPromote, + #[derive(Clone, Debug, Default, Deserialize, Serialize, JsonSchema, PartialEq)] + #[serde(rename_all = "camelCase")] + pub struct MaterializeStatus { + /// Resource identifier used as a name prefix to avoid pod name collisions. + pub resource_id: String, + /// The generation of Materialize pods actively capable of servicing requests. + pub active_generation: u64, + /// The UUID of the last successfully completed rollout. + pub last_completed_rollout_request: Uuid, + /// The image ref of the environmentd image that was last successfully rolled out. + /// Used to deny upgrades past 1 major version from the last successful rollout. + /// When None, we upgrade anyways. 
+ pub last_completed_rollout_environmentd_image_ref: Option, + /// A hash calculated from the spec of resources to be created based on this Materialize + /// spec. This is used for detecting when the existing resources are up to date. + /// If you want to trigger a rollout without making other changes that would cause this + /// hash to change, you must set forceRollout to the same UUID as requestRollout. + pub resources_hash: String, + /// The last completed rollout hash from v1. + /// This exists on this older version only for round-trip conversion support. + pub last_completed_rollout_hash: Option, + pub conditions: Vec, + } - /// {{}} - /// THIS WILL CAUSE YOUR MATERIALIZE INSTANCE TO BE UNAVAILABLE FOR SOME TIME!!! - /// - /// This strategy should ONLY be used by customers with physical hardware who do not have - /// enough hardware for the `WaitUntilReady` strategy. If you think you want this, please - /// consult with Materialize engineering to discuss your situation. - /// {{}} - /// - /// Tear down the old generation of pods and promote the new generation of pods immediately, - /// without waiting for the new generation of pods to be ready. - ImmediatelyPromoteCausingDowntime, + impl MaterializeStatus { + pub fn needs_update(&self, other: &Self) -> bool { + let now = Timestamp::now(); + let mut a = self.clone(); + for condition in &mut a.conditions { + condition.last_transition_time = Time(now); + } + let mut b = other.clone(); + for condition in &mut b.conditions { + condition.last_transition_time = Time(now); + } + a != b + } } - /// Default for [`RolloutRequestTimeout`]. A new generation that sits - /// un-promoted holds back compaction via read holds, and promoting it - /// after a long delay can cause incident-inducing load; 24h is a - /// conservative upper bound on how long any rollout should take. 
- pub const DEFAULT_ROLLOUT_REQUEST_TIMEOUT: &str = "24h"; + impl ManagedResource for Materialize { + fn default_labels(&self) -> BTreeMap { + BTreeMap::from_iter([ + ( + "materialize.cloud/organization-name".to_owned(), + self.name_unchecked(), + ), + ( + "materialize.cloud/organization-namespace".to_owned(), + self.namespace(), + ), + ( + "materialize.cloud/mz-resource-id".to_owned(), + self.resource_id().to_owned(), + ), + ]) + } + } - /// The maximum time [`MaterializeSpec::rollout_request_timeout`] allows a - /// rollout to remain in progress. - /// - /// A transparent wrapper around the duration string whose [`Default`] is - /// [`DEFAULT_ROLLOUT_REQUEST_TIMEOUT`]. Routing the default through `Default` - /// keeps a single source of truth: the derived `Default` for - /// [`MaterializeSpec`], serde's `#[serde(default)]` (applied when the field - /// is omitted on deserialize), and the schema default surfaced in the - /// generated CRD (so the API server fills it in and `kubectl explain` shows - /// it) all resolve to the same value. - #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] - #[serde(transparent)] - pub struct RolloutRequestTimeout(pub String); - - impl Default for RolloutRequestTimeout { - fn default() -> Self { - RolloutRequestTimeout(DEFAULT_ROLLOUT_REQUEST_TIMEOUT.to_owned()) + impl From for Materialize { + fn from(value: v1::Materialize) -> Self { + let rollout_hash = value.generate_rollout_hash(); + // Derive a deterministic UUID from the rollout hash so that the + // same v1 spec always produces the same requestRollout, + // making re-applies of an unchanged spec idempotent. 
+ let request_rollout = Uuid::new_v5(&Uuid::NAMESPACE_OID, rollout_hash.as_bytes()); + Materialize { + metadata: value.metadata, + spec: MaterializeSpec { + environmentd_image_ref: value.spec.environmentd_image_ref, + environmentd_extra_args: value.spec.environmentd_extra_args, + environmentd_extra_env: value.spec.environmentd_extra_env, + environmentd_iam_role_arn: None, + environmentd_connection_role_arn: value.spec.environmentd_connection_role_arn, + environmentd_resource_requirements: value + .spec + .environmentd_resource_requirements, + environmentd_scratch_volume_storage_requirement: value + .spec + .environmentd_scratch_volume_storage_requirement, + balancerd_resource_requirements: value.spec.balancerd_resource_requirements, + console_resource_requirements: value.spec.console_resource_requirements, + balancerd_replicas: value.spec.balancerd_replicas, + console_replicas: value.spec.console_replicas, + service_account_name: value.spec.service_account_name, + service_account_annotations: value.spec.service_account_annotations, + service_account_labels: value.spec.service_account_labels, + pod_annotations: value.spec.pod_annotations, + pod_labels: value.spec.pod_labels, + force_promote: value.spec.force_promote.unwrap_or_default(), + force_rollout: value.spec.force_rollout, + rollout_strategy: value.spec.rollout_strategy, + rollout_request_timeout: value.spec.rollout_request_timeout, + backend_secret_name: value.spec.backend_secret_name, + authenticator_kind: value.spec.authenticator_kind, + enable_rbac: value.spec.enable_rbac, + environment_id: value.spec.environment_id, + system_parameter_configmap_name: value.spec.system_parameter_configmap_name, + balancerd_external_certificate_spec: value + .spec + .balancerd_external_certificate_spec, + console_external_certificate_spec: value.spec.console_external_certificate_spec, + internal_certificate_spec: value.spec.internal_certificate_spec, + request_rollout, + in_place_rollout: false, + }, + status: 
value.status.map(|status| MaterializeStatus { + resource_id: status.resource_id, + active_generation: status.active_generation, + last_completed_rollout_environmentd_image_ref: status + .last_completed_rollout_environmentd_image_ref, + conditions: status.conditions, + // Derive the same deterministic UUID from the last + // completed hash so that request_rollout == this value + // when the spec hasn't changed (no rollout needed). + last_completed_rollout_request: status + .last_completed_rollout_hash + .as_ref() + .map(|hash| Uuid::new_v5(&Uuid::NAMESPACE_OID, hash.as_bytes())) + .unwrap_or(Uuid::nil()), + last_completed_rollout_hash: status.last_completed_rollout_hash, + resources_hash: "".to_owned(), + }), + } } } +} + +pub mod v1 { + use super::*; #[derive( CustomResource, @@ -121,7 +887,7 @@ pub mod v1alpha1 { #[kube( namespaced, group = "materialize.cloud", - version = "v1alpha1", + version = "v1", kind = "Materialize", singular = "materialize", plural = "materializes", @@ -138,16 +904,6 @@ pub mod v1alpha1 { pub environmentd_extra_args: Option>, /// Extra environment variables to pass to the environmentd binary. pub environmentd_extra_env: Option>, - /// {{}} - /// Deprecated. - /// - /// Use `service_account_annotations` to set "eks.amazonaws.com/role-arn" instead. - /// {{}} - /// - /// If running in AWS, override the IAM role to use to give - /// environmentd access to the persist S3 bucket. - #[kube(deprecated)] - pub environmentd_iam_role_arn: Option, /// If running in AWS, override the IAM role to use to support /// the CREATE CONNECTION feature. pub environmentd_connection_role_arn: Option, @@ -156,12 +912,20 @@ pub mod v1alpha1 { /// Amount of disk to allocate, if a storage class is provided. pub environmentd_scratch_volume_storage_requirement: Option, /// Resource requirements for the balancerd pod. + /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. 
pub balancerd_resource_requirements: Option, /// Resource requirements for the console pod. + /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. pub console_resource_requirements: Option, /// Number of balancerd pods to create. + /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. pub balancerd_replicas: Option, /// Number of console pods to create. + /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. pub console_replicas: Option, /// Name of the kubernetes service account to use. @@ -181,39 +945,18 @@ pub mod v1alpha1 { /// Labels to apply to the pods. pub pod_labels: Option>, - /// When changes are made to the environmentd resources (either via - /// modifying fields in the spec here or by deploying a new - /// orchestratord version which changes how resources are generated), - /// existing environmentd processes won't be automatically restarted. - /// In order to trigger a restart, the request_rollout field should be - /// set to a new (random) value. Once the rollout completes, the value - /// of `status.lastCompletedRolloutRequest` will be set to this value - /// to indicate completion. - /// - /// Defaults to a random value in order to ensure that the first - /// generation rollout is automatically triggered. - #[serde(default)] - pub request_rollout: Uuid, - /// If `forcePromote` is set to the same value as `requestRollout`, the + /// If `forcePromote` is set to the same value as the `status.requestedRolloutHash`, /// current rollout will skip waiting for clusters in the new /// generation to rehydrate before promoting the new environmentd to /// leader. - #[serde(default)] - pub force_promote: Uuid, - /// This value will be written to an annotation in the generated - /// environmentd statefulset, in order to force the controller to - /// detect the generated resources as changed even if no other changes - /// happened. 
This can be used to force a rollout to a new generation - /// even without making any meaningful changes, by setting it to the - /// same value as `requestRollout`. + /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. + pub force_promote: Option, + /// This value will force the controller to detect the spec as changed + /// even if no other changes happened. This can be used to force a rollout + /// to a new generation even without making any meaningful changes. #[serde(default)] pub force_rollout: Uuid, - /// {{}} - /// Deprecated and ignored. Use `rolloutStrategy` instead. - /// {{}} - #[kube(deprecated)] - #[serde(default)] - pub in_place_rollout: bool, /// Rollout strategy to use when upgrading this Materialize instance. #[serde(default)] pub rollout_strategy: MaterializeRolloutStrategy, @@ -241,8 +984,7 @@ pub mod v1alpha1 { pub rollout_request_timeout: RolloutRequestTimeout, /// The name of a secret containing `metadata_backend_url` and `persist_backend_url`. /// It may also contain `external_login_password_mz_system`, which will be used as - /// the password for the `mz_system` user if `authenticatorKind` is `Password`, - /// `Sasl`, or `Oidc`. + /// the password for the `mz_system` user if `authenticatorKind` is `Password`. pub backend_secret_name: String, /// How to authenticate with Materialize. #[serde(default)] @@ -279,11 +1021,15 @@ pub mod v1alpha1 { /// The configuration for generating an x509 certificate using cert-manager for balancerd /// to present to incoming connections. /// The `dnsNames` and `issuerRef` fields are required. + /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. pub balancerd_external_certificate_spec: Option, /// The configuration for generating an x509 certificate using cert-manager for the console /// to present to incoming connections. /// The `dnsNames` and `issuerRef` fields are required. /// Not yet implemented. 
+ /// + /// This field is excluded from the rollout hash and changes will not trigger a rollout. pub console_external_certificate_spec: Option, /// The cert-manager Issuer or ClusterIssuer to use for database internal communication. /// The `issuerRef` field is required. @@ -293,6 +1039,60 @@ pub mod v1alpha1 { } impl Materialize { + pub fn generate_rollout_hash(&self) -> String { + let mut hasher = Sha256::new(); + // Remove fields that don't affect the resources generated per generation, + // and we don't want to trigger a rollout from. + let spec = MaterializeSpec { + environmentd_image_ref: self.spec.environmentd_image_ref.clone(), + environmentd_extra_args: self.spec.environmentd_extra_args.clone(), + environmentd_extra_env: self.spec.environmentd_extra_env.clone(), + environmentd_connection_role_arn: self + .spec + .environmentd_connection_role_arn + .clone(), + environmentd_resource_requirements: self + .spec + .environmentd_resource_requirements + .clone(), + environmentd_scratch_volume_storage_requirement: self + .spec + .environmentd_scratch_volume_storage_requirement + .clone(), + balancerd_resource_requirements: None, + console_resource_requirements: None, + balancerd_replicas: None, + console_replicas: None, + service_account_name: self.spec.service_account_name.clone(), + service_account_annotations: self.spec.service_account_annotations.clone(), + service_account_labels: self.spec.service_account_labels.clone(), + pod_annotations: self.spec.pod_annotations.clone(), + pod_labels: self.spec.pod_labels.clone(), + force_promote: None, + force_rollout: self.spec.force_rollout, + rollout_strategy: self.spec.rollout_strategy.clone(), + rollout_request_timeout: self.spec.rollout_request_timeout.clone(), + backend_secret_name: self.spec.backend_secret_name.clone(), + authenticator_kind: self.spec.authenticator_kind, + enable_rbac: self.spec.enable_rbac, + environment_id: self.spec.environment_id, + system_parameter_configmap_name: 
self.spec.system_parameter_configmap_name.clone(), + balancerd_external_certificate_spec: None, + console_external_certificate_spec: None, + internal_certificate_spec: self.spec.internal_certificate_spec.clone(), + }; + hasher.update(&serde_json::to_vec(&spec).unwrap()); + if let Some(annotation) = self + .metadata + .annotations + .as_ref() + .and_then(|annotations| annotations.get(FORCE_ROLLOUT_ANNOTATION)) + { + hasher.update(annotation); + } + format!("{:x}", hasher.finalize()) + } + pub fn backend_secret_name(&self) -> String { self.spec.backend_secret_name.clone() } @@ -455,112 +1255,23 @@ pub mod v1alpha1 { ) } - pub fn requested_reconciliation_id(&self) -> Uuid { - self.spec.request_rollout - } - pub fn rollout_requested(&self) -> bool { - self.requested_reconciliation_id() - != self - .status - .as_ref() - .map_or_else(Uuid::nil, |status| status.last_completed_rollout_request) - } - - /// The maximum amount of time a rollout may remain in progress before - /// it is automatically cancelled. Parsed from - /// [`MaterializeSpec::rollout_request_timeout`], falling back to - /// [`DEFAULT_ROLLOUT_REQUEST_TIMEOUT`] when unset or unparseable. - pub fn rollout_request_timeout(&self) -> Duration { - let timeout = &self.spec.rollout_request_timeout.0; - humantime::parse_duration(timeout) - .or_else(|e| { - tracing::warn!( - rollout_request_timeout = %timeout, - "failed to parse rolloutRequestTimeout, using default: {e}", - ); - humantime::parse_duration(DEFAULT_ROLLOUT_REQUEST_TIMEOUT) - }) - .expect("DEFAULT_ROLLOUT_REQUEST_TIMEOUT must be a valid duration") - } - - /// If a timeout-eligible rollout is currently in progress, returns the - /// time at which it entered the in-progress (`Unknown`) state. Used to - /// enforce the rollout timeout. 
- /// - /// The `Applying` and `ReadyToPromote` phases are both reported as a - /// single in-progress window: [`Self::up_to_date_transition_time`] - /// carries the timestamp forward across them (they share the `Unknown` - /// status), so the timeout spans the whole pre-promotion rollout rather - /// than resetting at each phase. - /// - /// The `Promoting` phase is deliberately excluded even though it is - /// also `Unknown`: once a rollout has reached promotion it must never - /// be cancelled by the timeout, since the previously-active generation - /// may already be torn down, leaving nothing to fall back to. (The - /// controller also never reaches the timeout check while promoting, - /// because `is_promoting` takes priority; this is belt-and-suspenders.) - pub fn rollout_in_progress_since(&self) -> Option { - self.status - .as_ref()? - .conditions - .iter() - .find_map(|condition| { - if condition.type_ == "UpToDate" - && condition.status == "Unknown" - && condition.reason != "Promoting" - { - Some(condition.last_transition_time.0) - } else { - None - } - }) - } - - /// The `last_transition_time` to record for a new `UpToDate` condition - /// with `new_status`, following the Kubernetes convention that - /// `last_transition_time` marks when the condition's *status* last - /// changed — not its reason or message. While the status is unchanged - /// the existing timestamp is carried forward; it only resets to `now` - /// when the status actually changes (or there is no prior condition). - /// - /// This is what lets a rollout that moves through several same-status - /// phases (`Applying` -> `ReadyToPromote`, both `Unknown`) be measured - /// from when it first entered that status, so the rollout timeout - /// covers the phases together instead of restarting at each one. 
- pub fn up_to_date_transition_time(&self, new_status: &str, now: Timestamp) -> Timestamp { - self.status - .as_ref() - .and_then(|status| { - status - .conditions - .iter() - .find(|condition| condition.type_ == "UpToDate") - }) - .filter(|condition| condition.status == new_status) - .map_or(now, |condition| condition.last_transition_time.0) - } - - /// Returns the environmentd image ref of the currently-active - /// generation: the image of the last completed rollout, falling back - /// to the spec image when no rollout has completed yet. Downstream - /// resources (balancerd, console) should track this rather than - /// [`MaterializeSpec::environmentd_image_ref`] so they stay aligned - /// with the running environmentd when the spec is mid-rollout or has - /// been partially reverted (DEP-42). - pub fn active_environmentd_image_ref(&self) -> &str { self.status .as_ref() - .and_then(|s| s.last_completed_rollout_environmentd_image_ref.as_deref()) - .unwrap_or(&self.spec.environmentd_image_ref) + .map(|status| status.last_completed_rollout_hash != status.requested_rollout_hash) + .unwrap_or(false) } pub fn set_force_promote(&mut self) { - self.spec.force_promote = self.spec.request_rollout; + self.spec.force_promote = Some(self.generate_rollout_hash()); } pub fn should_force_promote(&self) -> bool { - self.spec.force_promote == self.spec.request_rollout + self.spec.force_promote.as_ref() + == self + .status + .as_ref() + .and_then(|status| status.requested_rollout_hash.as_ref()) || self.spec.rollout_strategy == MaterializeRolloutStrategy::ImmediatelyPromoteCausingDowntime } @@ -580,7 +1291,7 @@ pub mod v1alpha1 { false } - pub fn is_ready_to_promote(&self, resources_hash: &str) -> bool { + pub fn is_ready_to_promote(&self, rollout_hash: &str) -> bool { let Some(status) = self.status.as_ref() else { return false; }; @@ -591,7 +1302,7 @@ pub mod v1alpha1 { .conditions .iter() .any(|condition| condition.reason == "ReadyToPromote") - && &status.resources_hash == 
resources_hash + && status.requested_rollout_hash.as_deref() == Some(rollout_hash) } pub fn is_promoting(&self) -> bool { @@ -747,17 +1458,15 @@ pub mod v1alpha1 { pub resource_id: String, /// The generation of Materialize pods actively capable of servicing requests. pub active_generation: u64, - /// The UUID of the last successfully completed rollout. - pub last_completed_rollout_request: Uuid, /// The image ref of the environmentd image that was last successfully rolled out. /// Used to deny upgrades past 1 major version from the last successful rollout. /// When None, we upgrade anyways. pub last_completed_rollout_environmentd_image_ref: Option, - /// A hash calculated from the spec of resources to be created based on this Materialize - /// spec. This is used for detecting when the existing resources are up to date. - /// If you want to trigger a rollout without making other changes that would cause this - /// hash to change, you must set forceRollout to the same UUID as requestRollout. - pub resources_hash: String, + /// The last completed rollout's requestedRolloutHash. + pub last_completed_rollout_hash: Option, + /// Hash of a subset of the Materialize spec and other fields. + /// This is used to determine when the spec has changed and we need to rollout. 
+ pub requested_rollout_hash: Option, pub conditions: Vec, } @@ -798,6 +1507,109 @@ pub mod v1alpha1 { Some("environmentd") } } + + impl From for Materialize { + fn from(value: v1alpha1::Materialize) -> Self { + let is_promoting = value.is_promoting(); + let service_account_annotations = if let Some(environmentd_iam_role_arn) = + value.spec.environmentd_iam_role_arn + { + let mut annotations = value.spec.service_account_annotations.unwrap_or_default(); + annotations + .entry("eks.amazonaws.com/role-arn".to_owned()) + .or_insert(environmentd_iam_role_arn); + Some(annotations) + } else { + value.spec.service_account_annotations + }; + let mut mz = Materialize { + metadata: value.metadata, + spec: MaterializeSpec { + environmentd_image_ref: value.spec.environmentd_image_ref, + environmentd_extra_args: value.spec.environmentd_extra_args, + environmentd_extra_env: value.spec.environmentd_extra_env, + environmentd_connection_role_arn: value.spec.environmentd_connection_role_arn, + environmentd_resource_requirements: value + .spec + .environmentd_resource_requirements, + environmentd_scratch_volume_storage_requirement: value + .spec + .environmentd_scratch_volume_storage_requirement, + balancerd_resource_requirements: value.spec.balancerd_resource_requirements, + console_resource_requirements: value.spec.console_resource_requirements, + balancerd_replicas: value.spec.balancerd_replicas, + console_replicas: value.spec.console_replicas, + service_account_name: value.spec.service_account_name, + service_account_annotations, + service_account_labels: value.spec.service_account_labels, + pod_annotations: value.spec.pod_annotations, + pod_labels: value.spec.pod_labels, + force_promote: if value.spec.force_promote.is_empty() + || &value.spec.force_promote == "00000000-0000-0000-0000-000000000000" + { + None + } else { + Some(value.spec.force_promote.to_string()) + }, + force_rollout: value.spec.force_rollout, + rollout_strategy: value.spec.rollout_strategy, + 
rollout_request_timeout: value.spec.rollout_request_timeout, + backend_secret_name: value.spec.backend_secret_name, + authenticator_kind: value.spec.authenticator_kind, + enable_rbac: value.spec.enable_rbac, + environment_id: value.spec.environment_id, + system_parameter_configmap_name: value.spec.system_parameter_configmap_name, + balancerd_external_certificate_spec: value + .spec + .balancerd_external_certificate_spec, + console_external_certificate_spec: value.spec.console_external_certificate_spec, + internal_certificate_spec: value.spec.internal_certificate_spec, + }, + status: None, + }; + let calculated_rollout_hash = mz.generate_rollout_hash(); + let last_completed_rollout_hash = match value + .status + .as_ref() + .and_then(|status| status.last_completed_rollout_hash.to_owned()) + { + Some(last_completed_rollout_hash) => Some(last_completed_rollout_hash), + None => { + let currently_rolling_out = value + .status + .as_ref() + .map(|status| { + status.last_completed_rollout_request != value.spec.request_rollout + // If this is the first apply, + // these could both be nil and we still need to do a rollout. + || status.last_completed_rollout_request.is_nil() + }) + .unwrap_or(true); + if currently_rolling_out { + // If they store a change, we're going to start over on a new rollout. 
+ None + } else { + Some(calculated_rollout_hash.clone()) + } + } + }; + let requested_rollout_hash = if is_promoting { + None + } else { + Some(calculated_rollout_hash) + }; + mz.status = value.status.map(|status| MaterializeStatus { + resource_id: status.resource_id, + active_generation: status.active_generation, + last_completed_rollout_environmentd_image_ref: status + .last_completed_rollout_environmentd_image_ref, + last_completed_rollout_hash, + requested_rollout_hash, + conditions: status.conditions, + }); + mz + } + } } fn parse_image_ref(image_ref: &str) -> Option { @@ -823,10 +1635,8 @@ mod tests { use kube::core::ObjectMeta; use semver::Version; - use super::v1alpha1::{ - DEFAULT_ROLLOUT_REQUEST_TIMEOUT, Materialize, MaterializeSpec, MaterializeStatus, - RolloutRequestTimeout, - }; + use super::v1alpha1::{Materialize, MaterializeSpec, MaterializeStatus}; + use super::{DEFAULT_ROLLOUT_REQUEST_TIMEOUT, RolloutRequestTimeout}; #[mz_ore::test] fn meets_minimum_version() { @@ -875,8 +1685,6 @@ mod tests { #[mz_ore::test] fn within_upgrade_window() { - use super::v1alpha1::MaterializeStatus; - let mut mz = Materialize { spec: MaterializeSpec { environmentd_image_ref: "materialize/environmentd:v26.0.0".to_owned(), @@ -1144,8 +1952,6 @@ mod tests { #[mz_ore::test] fn active_environmentd_image_ref() { - use super::v1alpha1::MaterializeStatus; - const OLD: &str = "materialize/environmentd:v26.0.0"; const NEW: &str = "materialize/environmentd:v27.0.0"; diff --git a/src/mz-debug/src/k8s_dumper.rs b/src/mz-debug/src/k8s_dumper.rs index f00a3e9345c3d..5ce9fb39c7298 100644 --- a/src/mz-debug/src/k8s_dumper.rs +++ b/src/mz-debug/src/k8s_dumper.rs @@ -38,7 +38,7 @@ use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions::v1::CustomRe use kube::api::{ListParams, LogParams}; use kube::{Api, Client}; use mz_cloud_resources::crd::generated::cert_manager::certificates::Certificate; -use mz_cloud_resources::crd::materialize::v1alpha1::Materialize; +use 
mz_cloud_resources::crd::materialize::v1::Materialize; use serde::{Serialize, de::DeserializeOwned}; use tracing::{info, warn}; diff --git a/src/mz-debug/src/utils.rs b/src/mz-debug/src/utils.rs index 0857447560697..cbd8be6f38d60 100644 --- a/src/mz-debug/src/utils.rs +++ b/src/mz-debug/src/utils.rs @@ -21,7 +21,7 @@ use std::str::FromStr; use chrono::{DateTime, Utc}; use kube::{Api, Client}; -use mz_cloud_resources::crd::materialize::v1alpha1::Materialize; +use mz_cloud_resources::crd::materialize::v1::Materialize; use mz_server_core::listeners::AuthenticatorKind; use zip::ZipWriter; use zip::write::SimpleFileOptions; diff --git a/src/orchestratord/Cargo.toml b/src/orchestratord/Cargo.toml index b7f034f442928..0533f87cab0c3 100644 --- a/src/orchestratord/Cargo.toml +++ b/src/orchestratord/Cargo.toml @@ -13,6 +13,7 @@ workspace = true anyhow.workspace = true async-trait.workspace = true axum.workspace = true +axum-server.workspace = true clap.workspace = true futures.workspace = true http.workspace = true diff --git a/src/orchestratord/src/bin/orchestratord.rs b/src/orchestratord/src/bin/orchestratord.rs index c25cfd424bd6c..8db7441a11d8a 100644 --- a/src/orchestratord/src/bin/orchestratord.rs +++ b/src/orchestratord/src/bin/orchestratord.rs @@ -11,8 +11,10 @@ use std::{ future, net::SocketAddr, sync::{Arc, LazyLock}, + time::Duration, }; +use axum_server::tls_openssl::OpenSSLConfig; use http::HeaderValue; use k8s_openapi::{ api::{ @@ -37,9 +39,10 @@ use mz_orchestrator_kubernetes::{KubernetesImagePullPolicy, util::create_client} use mz_orchestrator_tracing::{StaticTracingConfig, TracingCliArgs}; use mz_orchestratord::{ controller, - k8s::register_crds, + k8s::{ConversionWebhookConfig, register_crds}, metrics::{self, Metrics}, tls::DefaultCertificateSpecs, + webhook, }; use mz_ore::{ cli::{self, CliConfig, KeyValueArg}, @@ -60,6 +63,35 @@ pub struct Args { profiling_listen_address: SocketAddr, #[clap(long, default_value = "[::]:3100")] metrics_listen_address: 
SocketAddr, + #[clap(long, default_value = "[::]:8001")] + webhook_listen_address: SocketAddr, + + /// Whether to install the v1 version of the Materialize CRD and the + /// conversion webhook between v1 and v1alpha1. When false, only the + /// v1alpha1 version is installed and the webhook server is not started. + #[clap(long)] + install_v1_crd: bool, + /// Required when --install-v1-crd is set. + #[clap(long, required_if_eq("install_v1_crd", "true"))] + webhook_service_name: Option, + /// Required when --install-v1-crd is set. + #[clap(long, required_if_eq("install_v1_crd", "true"))] + webhook_service_namespace: Option, + #[clap(long, default_value = "8001")] + webhook_service_port: u16, + #[clap(long, default_value = "/etc/tls/ca.crt")] + tls_ca: String, + #[clap(long, default_value = "/etc/tls/tls.crt")] + tls_cert: String, + #[clap(long, default_value = "/etc/tls/tls.key")] + tls_key: String, + /// How often to reload the webhook TLS serving certificate from disk and, + /// when the CA changes, refresh the conversion webhook's CA bundle. The + /// certificate is rotated out-of-band (e.g. by cert-manager), so this must + /// be short enough that rotations are picked up before the old certificate + /// expires. 
+ #[clap(long, default_value = "1h", value_parser = humantime::parse_duration)] + webhook_cert_reload_interval: Duration, #[clap(long)] cloud_provider: CloudProvider, @@ -265,13 +297,112 @@ async fn run(args: Args) -> Result<(), anyhow::Error> { let metrics = Arc::new(Metrics::register_into(&metrics_registry)); + let tls_cert = args.tls_cert; + let tls_key = args.tls_key; + let tls_ca = args.tls_ca; + let reload_config = if args.install_v1_crd { + let config = OpenSSLConfig::from_pem_file(&tls_cert, &tls_key).unwrap(); + let reload_config = config.clone(); + let webhook_listen_address = args.webhook_listen_address; + + mz_ore::task::spawn(|| "webhook server", async move { + if let Err(e) = axum_server::bind_openssl(webhook_listen_address, config) + .serve(webhook::router().into_make_service()) + .await + { + panic!("webhook server failed: {}", e.display_with_causes()); + } + }); + + Some(reload_config) + } else { + None + }; + let (client, namespace) = create_client(args.kubernetes_context.clone()).await?; + let additional_crd_columns = args.additional_crd_columns.unwrap_or_default(); + let conversion_webhook = args.install_v1_crd.then(|| ConversionWebhookConfig { + service_name: args + .webhook_service_name + .expect("clap requires --webhook-service-name with --install-v1-crd"), + service_namespace: args + .webhook_service_namespace + .expect("clap requires --webhook-service-namespace with --install-v1-crd"), + service_port: args.webhook_service_port, + ca_cert_path: tls_ca.clone(), + }); register_crds( client.clone(), - args.additional_crd_columns.unwrap_or_default(), + additional_crd_columns.clone(), + conversion_webhook.clone(), ) .await?; + // Periodically reload the webhook serving certificate from disk, and + // refresh the conversion webhook's CA bundle whenever the CA changes. + // + // The certificate is rotated out-of-band (e.g. by cert-manager). 
The + // serving certificate is signed by a stable root CA, so routine rotations + // reuse the same CA and the CA bundle registered into the CRD at startup + // keeps working. But if the CA itself rotates (e.g. on root CA renewal), + // that startup CA bundle would not trust the served certificate, and the + // Kubernetes API server would reject every conversion request. Refreshing + // the CA bundle when the CA changes keeps the webhook working across CA + // rotations. + if let Some(reload_config) = reload_config { + let conversion_webhook = conversion_webhook + .expect("conversion webhook config is set whenever the webhook server is started"); + let reload_interval = args.webhook_cert_reload_interval; + let client = client.clone(); + let additional_crd_columns = additional_crd_columns.clone(); + mz_ore::task::spawn(|| "webhook certificate reload", async move { + let mut last_ca = tokio::fs::read(&tls_ca).await.ok(); + let mut interval = tokio::time::interval(reload_interval); + // The first tick completes immediately; skip it so we don't + // re-register the CRDs we just registered above. + interval.tick().await; + loop { + interval.tick().await; + if let Err(err) = reload_config.reload_from_pem_file(&tls_cert, &tls_key) { + tracing::error!("failed to reload webhook TLS certificate: {err}"); + continue; + } + let current_ca = match tokio::fs::read(&tls_ca).await { + Ok(ca) => ca, + Err(err) => { + tracing::error!("failed to read webhook CA certificate: {err}"); + continue; + } + }; + if last_ca.as_deref() == Some(current_ca.as_slice()) { + continue; + } + // The CA changed, meaning the certificate was rotated. Re-register + // the CRDs so the conversion webhook's caBundle matches the + // newly-served certificate. 
+ match register_crds( + client.clone(), + additional_crd_columns.clone(), + Some(conversion_webhook.clone()), + ) + .await + { + Ok(()) => { + tracing::info!( + "refreshed conversion webhook CA bundle after certificate rotation" + ); + last_ca = Some(current_ca); + } + Err(err) => { + tracing::error!( + "failed to refresh conversion webhook CA bundle after rotation: {err}" + ); + } + } + } + }); + } + let crd_api: Api = Api::all(client.clone()); let crds = crd_api.list(&ListParams::default()).await?; let has_cert_manager = crds @@ -546,3 +677,42 @@ async fn run(args: Args) -> Result<(), anyhow::Error> { future::pending().await } + +#[cfg(test)] +mod tests { + use clap::Parser; + + use super::Args; + + const REQUIRED_ARGS: &[&str] = &[ + "orchestratord", + "--cloud-provider=local", + "--region=kind", + "--console-image-tag-default=latest", + ]; + + #[mz_ore::test] + fn webhook_service_args_required_with_install_v1_crd() { + let args = Args::try_parse_from(REQUIRED_ARGS).expect("parses without webhook args"); + assert!(!args.install_v1_crd); + + assert!( + Args::try_parse_from(REQUIRED_ARGS.iter().copied().chain(["--install-v1-crd"])) + .is_err(), + "--install-v1-crd should require the webhook service args" + ); + + let args = Args::try_parse_from(REQUIRED_ARGS.iter().copied().chain([ + "--install-v1-crd", + "--webhook-service-name=orchestratord", + "--webhook-service-namespace=materialize", + ])) + .expect("parses with webhook args"); + assert!(args.install_v1_crd); + assert_eq!(args.webhook_service_name.as_deref(), Some("orchestratord")); + assert_eq!( + args.webhook_service_namespace.as_deref(), + Some("materialize") + ); + } +} diff --git a/src/orchestratord/src/controller/materialize.rs b/src/orchestratord/src/controller/materialize.rs index 63cc5648dbed0..41b22d0462529 100644 --- a/src/orchestratord/src/controller/materialize.rs +++ b/src/orchestratord/src/controller/materialize.rs @@ -42,7 +42,8 @@ use mz_cloud_resources::crd::{ ManagedResource, 
balancer::v1alpha1::{Balancer, BalancerSpec}, console::v1alpha1::{BalancerdRef, Console, ConsoleSpec, HttpConnectionScheme}, - materialize::v1alpha1::{Materialize, MaterializeRolloutStrategy, MaterializeStatus}, + materialize::MaterializeRolloutStrategy, + materialize::v1alpha1::{Materialize, MaterializeStatus}, }; use mz_license_keys::validate; use mz_orchestrator_kubernetes::KubernetesImagePullPolicy; @@ -209,6 +210,7 @@ impl Context { ), resource_id: mz.status().resource_id, resources_hash, + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "True".into(), @@ -450,6 +452,7 @@ impl k8s_controller::Context for Context { .clone(), resource_id: status.resource_id.clone(), resources_hash: status.resources_hash.clone(), + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "False".into(), @@ -486,6 +489,7 @@ impl k8s_controller::Context for Context { last_completed_rollout_environmentd_image_ref.clone(), resource_id: status.resource_id, resources_hash: status.resources_hash, + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "False".into(), @@ -541,6 +545,7 @@ impl k8s_controller::Context for Context { .last_completed_rollout_environmentd_image_ref, resource_id: status.resource_id.clone(), resources_hash: String::new(), + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "Unknown".into(), @@ -596,6 +601,7 @@ impl k8s_controller::Context for Context { .last_completed_rollout_environmentd_image_ref, resource_id: status.resource_id, resources_hash, + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "Unknown".into(), @@ -642,6 +648,7 @@ impl k8s_controller::Context for Context { .last_completed_rollout_environmentd_image_ref, resource_id: status.resource_id, resources_hash: resources_hash.clone(), + last_completed_rollout_hash: None, 
conditions: vec![Condition { type_: "UpToDate".into(), status: "Unknown".into(), @@ -682,6 +689,7 @@ impl k8s_controller::Context for Context { .last_completed_rollout_environmentd_image_ref, resource_id: status.resource_id, resources_hash: status.resources_hash, + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "False".into(), @@ -721,6 +729,7 @@ impl k8s_controller::Context for Context { .last_completed_rollout_environmentd_image_ref, resource_id: status.resource_id.clone(), resources_hash: status.resources_hash, + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "False".into(), @@ -763,6 +772,7 @@ impl k8s_controller::Context for Context { .last_completed_rollout_environmentd_image_ref, resource_id: status.resource_id.clone(), resources_hash: status.resources_hash, + last_completed_rollout_hash: None, conditions: vec![Condition { type_: "UpToDate".into(), status: "True".into(), diff --git a/src/orchestratord/src/k8s.rs b/src/orchestratord/src/k8s.rs index 78d85582ac626..1215a78788666 100644 --- a/src/orchestratord/src/k8s.rs +++ b/src/orchestratord/src/k8s.rs @@ -9,8 +9,13 @@ use std::time::Duration; -use apiextensions::v1::CustomResourceColumnDefinition; -use k8s_openapi::apiextensions_apiserver::pkg::apis::apiextensions; +use k8s_openapi::{ + ByteString, + apiextensions_apiserver::pkg::apis::apiextensions::v1::{ + CustomResourceColumnDefinition, CustomResourceConversion, ServiceReference, + WebhookClientConfig, WebhookConversion, + }, +}; use kube::{ Api, Client, CustomResourceExt, Resource, ResourceExt, api::{DeleteParams, Patch, PatchParams, PostParams}, @@ -94,16 +99,57 @@ where } } +/// Configuration for the conversion webhook that serves the v1 version of the +/// Materialize CRD. When present, the v1 version is registered alongside +/// v1alpha1 with webhook conversion between them; when absent, only v1alpha1 +/// is registered. 
+#[derive(Debug, Clone)] +pub struct ConversionWebhookConfig { + pub service_name: String, + pub service_namespace: String, + pub service_port: u16, + pub ca_cert_path: String, +} + pub async fn register_crds( client: Client, additional_crd_columns: Vec, + conversion_webhook: Option, ) -> Result<(), anyhow::Error> { - let mut mz_crd = crd::materialize::v1alpha1::Materialize::crd(); - let default_columns = mz_crd.spec.versions[0] + let (mut mz_crds, mz_conversion) = match conversion_webhook { + Some(config) => { + let ca_bytes = tokio::fs::read(config.ca_cert_path).await?; + let conversion = CustomResourceConversion { + strategy: "Webhook".to_owned(), + webhook: Some(WebhookConversion { + client_config: Some(WebhookClientConfig { + ca_bundle: Some(ByteString(ca_bytes)), + service: Some(ServiceReference { + name: config.service_name, + namespace: config.service_namespace, + path: Some("/convert".to_owned()), + port: Some(config.service_port.into()), + }), + url: None, + }), + conversion_review_versions: vec!["v1".to_owned()], + }), + }; + ( + vec![ + crd::materialize::v1::Materialize::crd(), + crd::materialize::v1alpha1::Materialize::crd(), + ], + Some(conversion), + ) + } + None => (vec![crd::materialize::v1alpha1::Materialize::crd()], None), + }; + let default_columns = mz_crds[0].spec.versions[0] .additional_printer_columns .take() .expect("should contain ImageRef and UpToDate columns"); - mz_crd.spec.versions[0].additional_printer_columns = Some( + mz_crds[0].spec.versions[0].additional_printer_columns = Some( additional_crd_columns .into_iter() .chain(default_columns) @@ -115,20 +161,24 @@ pub async fn register_crds( client.clone(), vec![ VersionedCrd { - crds: vec![mz_crd], + crds: mz_crds, stored_version: String::from("v1alpha1"), + conversion: mz_conversion, }, VersionedCrd { crds: vec![crd::balancer::v1alpha1::Balancer::crd()], stored_version: String::from("v1alpha1"), + conversion: None, }, VersionedCrd { crds: vec![crd::console::v1alpha1::Console::crd()], 
stored_version: String::from("v1alpha1"), + conversion: None, }, VersionedCrd { crds: vec![crd::vpc_endpoint::v1::VpcEndpoint::crd()], stored_version: String::from("v1"), + conversion: None, }, ], FIELD_MANAGER, diff --git a/src/orchestratord/src/lib.rs b/src/orchestratord/src/lib.rs index 5f6ae0c5040e5..2f7d422fdad3f 100644 --- a/src/orchestratord/src/lib.rs +++ b/src/orchestratord/src/lib.rs @@ -13,6 +13,7 @@ pub mod controller; pub mod k8s; pub mod metrics; pub mod tls; +pub mod webhook; #[derive(Debug, thiserror::Error)] pub enum Error { diff --git a/src/orchestratord/src/webhook.rs b/src/orchestratord/src/webhook.rs new file mode 100644 index 0000000000000..9b11df365a28c --- /dev/null +++ b/src/orchestratord/src/webhook.rs @@ -0,0 +1,164 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use anyhow::anyhow; +use axum::routing::{get, post}; +use axum::{Json, Router}; +use http::StatusCode; +use kube::core::Status; +use kube::core::conversion::{ConversionRequest, ConversionResponse, ConversionReview}; +use kube::core::response::reason; + +use mz_cloud_resources::crd::materialize::{v1, v1alpha1}; +use tracing::{debug, warn}; + +pub fn router() -> Router { + Router::new() + .route("/convert", post(post_convert)) + .route("/healthz", get(get_health)) +} + +#[derive(Clone, Copy)] +enum SupportedVersion { + V1alpha1, + V1, +} + +impl TryFrom<&str> for SupportedVersion { + type Error = anyhow::Error; + + fn try_from(value: &str) -> Result { + match value { + "materialize.cloud/v1alpha1" => Ok(SupportedVersion::V1alpha1), + "materialize.cloud/v1" => Ok(SupportedVersion::V1), + _ => Err(anyhow!("unexpected version: {}", value)), + } + } +} + +fn version_label(v: SupportedVersion) -> &'static str { + match v { + SupportedVersion::V1alpha1 => "v1alpha1", + SupportedVersion::V1 => "v1", + } +} + +fn convert( + desired_version: SupportedVersion, + value: serde_json::Value, +) -> Result { + let from_version = SupportedVersion::try_from( + value + .get("apiVersion") + .and_then(|version| version.as_str()) + .ok_or_else(|| anyhow!("missing version"))?, + )?; + debug!( + from = version_label(from_version), + to = version_label(desired_version), + input_spec = ?value.get("spec"), + input_status = ?value.get("status"), + "conversion webhook called", + ); + let result = match (from_version, desired_version) { + (SupportedVersion::V1alpha1, SupportedVersion::V1alpha1) => Ok(value), + (SupportedVersion::V1alpha1, SupportedVersion::V1) => { + serde_json::from_value::(value) + .and_then(|mz_v1alpha1| serde_json::to_value(v1::Materialize::from(mz_v1alpha1))) + .map_err(|e| e.into()) + } + (SupportedVersion::V1, SupportedVersion::V1alpha1) => { + serde_json::from_value::(value) + .and_then(|mz_v1| serde_json::to_value(v1alpha1::Materialize::from(mz_v1))) + .map_err(|e| 
e.into()) + } + (SupportedVersion::V1, SupportedVersion::V1) => Ok(value), + }; + match &result { + Ok(converted) => { + debug!( + from = version_label(from_version), + to = version_label(desired_version), + output_spec = ?converted.get("spec"), + output_status = ?converted.get("status"), + "conversion webhook succeeded", + ); + } + Err(e) => { + warn!( + from = version_label(from_version), + to = version_label(desired_version), + error = ?e, + "conversion webhook failed", + ); + } + } + result +} + +async fn post_convert( + Json(conversion_review): Json, +) -> (StatusCode, Json) { + let Ok(request) = ConversionRequest::from_review(conversion_review) else { + warn!("missing request"); + return ( + StatusCode::UNPROCESSABLE_ENTITY, + Json( + ConversionResponse::invalid(Status::failure("missing request", reason::INVALID)) + .into_review(), + ), + ); + }; + + let desired_version = match SupportedVersion::try_from(request.desired_api_version.as_str()) { + Ok(v) => v, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + Json( + ConversionResponse::for_request(request) + .failure(Status::failure(&e.to_string(), reason::BAD_REQUEST)) + .into_review(), + ), + ); + } + }; + + let converted_objects: Result, anyhow::Error> = request + .objects + .iter() + .cloned() + .map(|value| convert(desired_version, value)) + .collect(); + match converted_objects { + Ok(converted_objects) => ( + StatusCode::OK, + Json( + ConversionResponse::for_request(request) + .success(converted_objects) + .into_review(), + ), + ), + Err(e) => { + warn!("error when converting: {:?}\n{:?}", &e, request.objects); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json( + ConversionResponse::for_request(request) + .failure(Status::failure(&e.to_string(), reason::UNKNOWN)) + .into_review(), + ), + ) + } + } +} + +async fn get_health() -> StatusCode { + StatusCode::OK +} diff --git a/test/orchestratord/mzcompose.py b/test/orchestratord/mzcompose.py index 904a9cddb22c0..68e734c088c52 100644 --- 
a/test/orchestratord/mzcompose.py +++ b/test/orchestratord/mzcompose.py @@ -20,6 +20,7 @@ import shutil import signal import subprocess +import tempfile import time import uuid from collections.abc import Callable, Iterator @@ -1828,6 +1829,233 @@ def validate(self, mods: dict[type[Modification], Any]) -> None: return +class MaterializeCRDVersion(Modification): + @classmethod + def values(cls, version: MzVersion) -> list[Any]: + return [ + "materialize.cloud/v1alpha1", + "materialize.cloud/v1", + ] + + @classmethod + def default(cls) -> Any: + return "materialize.cloud/v1" + + def modify(self, definition: dict[str, Any]) -> None: + if self.value == "materialize.cloud/v1" and operator_supports_v1(definition): + # The operator only installs and serves the v1 CRD version when + # explicitly asked to. + enable_v1_crd(definition) + definition["materialize"]["apiVersion"] = self.value + else: + # Older versions do not support v1, and without installV1CRD the + # operator does not serve it. + definition["materialize"]["apiVersion"] = "materialize.cloud/v1alpha1" + + def validate(self, mods: dict[type[Modification], Any]) -> None: + # This should be OK without additional validation, as we check we + # deployed in post_run_check and check the installed CRD versions in + # check_crd_versions. + return + + +class CertificateSource(Modification): + SECRET_NAME = "orchestratord-custom-cert" + + @classmethod + def values(cls, version: MzVersion) -> list[Any]: + return ["cert-manager", "secret"] + + @classmethod + def default(cls) -> Any: + return "cert-manager" + + def modify(self, definition: dict[str, Any]) -> None: + # The certificate is only used to serve the conversion webhook, which + # is only installed together with the v1 CRD. 
+ if operator_supports_v1(definition): + enable_v1_crd(definition) + definition["operator"]["operator"]["certificate"]["source"] = self.value + if self.value == "secret": + definition["operator"]["operator"]["certificate"][ + "secretName" + ] = self.SECRET_NAME + self._create_cert_secret() + + @classmethod + def _create_cert_secret(cls) -> None: + dns_name = "operator-materialize-operator.materialize.svc" + + with tempfile.TemporaryDirectory() as tmpdir: + ca_key = os.path.join(tmpdir, "ca.key") + ca_crt = os.path.join(tmpdir, "ca.crt") + tls_key = os.path.join(tmpdir, "tls.key") + tls_crt = os.path.join(tmpdir, "tls.crt") + csr_path = os.path.join(tmpdir, "server.csr") + ext_path = os.path.join(tmpdir, "ext.cnf") + + # Generate CA key and self-signed cert + spawn.runv( + [ + "openssl", + "req", + "-x509", + "-newkey", + "rsa:2048", + "-keyout", + ca_key, + "-out", + ca_crt, + "-days", + "1", + "-nodes", + "-subj", + "/CN=Test CA", + ] + ) + + # Generate server key + spawn.runv( + [ + "openssl", + "genpkey", + "-algorithm", + "rsa", + "-pkeyopt", + "rsa_keygen_bits:2048", + "-out", + tls_key, + ] + ) + + # Generate CSR + spawn.runv( + [ + "openssl", + "req", + "-new", + "-key", + tls_key, + "-out", + csr_path, + "-subj", + f"/CN={dns_name}", + ] + ) + + # Write extension file for SAN + with open(ext_path, "w") as f: + f.write(f"subjectAltName=DNS:{dns_name}\n") + + # Sign CSR with CA + spawn.runv( + [ + "openssl", + "x509", + "-req", + "-in", + csr_path, + "-CA", + ca_crt, + "-CAkey", + ca_key, + "-CAcreateserial", + "-out", + tls_crt, + "-days", + "1", + "-extfile", + ext_path, + ] + ) + + # Delete existing secret if present + try: + spawn.capture( + [ + "kubectl", + "delete", + "secret", + cls.SECRET_NAME, + "-n", + "materialize", + ], + stderr=subprocess.DEVNULL, + ) + except subprocess.CalledProcessError: + pass + + # Create the secret with ca.crt, tls.crt, and tls.key + spawn.runv( + [ + "kubectl", + "create", + "secret", + "generic", + cls.SECRET_NAME, + 
f"--from-file=ca.crt={ca_crt}", + f"--from-file=tls.crt={tls_crt}", + f"--from-file=tls.key={tls_key}", + "-n", + "materialize", + ] + ) + + def validate(self, mods: dict[type[Modification], Any]) -> None: + def check() -> None: + orchestratord = get_orchestratord_data() + container = orchestratord["items"][0]["spec"]["containers"][0] + if "--install-v1-crd" not in container["args"]: + # The operator does not serve the conversion webhook (e.g. it + # is too old to know the flag), so no certificate is mounted. + return + volumes = orchestratord["items"][0]["spec"].get("volumes") or [] + cert_volume = next( + (v for v in volumes if v.get("name") == "certificate"), + None, + ) + assert cert_volume is not None, f"Expected certificate volume in {volumes}" + + secret_name = cert_volume["secret"]["secretName"] + if self.value == "cert-manager": + expected = "operator-materialize-operator-cert" + else: + expected = self.SECRET_NAME + assert ( + secret_name == expected + ), f"Expected certificate secret name '{expected}', got '{secret_name}'" + + retry(check, 120) + + +def operator_supports_v1(definition: dict[str, Any]): + operator_version = MzVersion.parse( + definition["operator"]["operator"]["image"]["tag"] + ) + # v1 first ships in v26.29; no released self-managed operator serves + # the v1 CRD, so anything older must use v1alpha1. Keep this in sync + # with the release that actually introduces the v1 CRD. + return operator_version >= MzVersion.parse("v26.29.0-dev.0") + + +def operator_serves_v1(definition: dict[str, Any]) -> bool: + """Whether the operator in this definition installs and serves the v1 CRD. 
+ + Even operators that support v1 only install it when the installV1CRD helm + value is set.""" + return operator_supports_v1(definition) and bool( + definition["operator"]["operator"]["args"].get("installV1CRD") + ) + + +def enable_v1_crd(definition: dict[str, Any]) -> None: + """Make the operator install the v1 CRD and its conversion webhook.""" + assert operator_supports_v1( + definition + ), "the operator version is too old to install the v1 CRD" + definition["operator"]["operator"]["args"]["installV1CRD"] = True + + class Properties(Enum): Defaults = "defaults" Individual = "individual" @@ -1880,6 +2108,8 @@ def workflow_documentation_defaults( os.mkdir(dir) recreate_kind_cluster() + helm_install_cert_manager() + shutil.copyfile( "misc/helm-charts/operator/values.yaml", os.path.join(dir, "sample-values.yaml"), @@ -2245,7 +2475,8 @@ def measure_downtime() -> None: thread.start() time.sleep(10) # some time to make sure the thread runs fine request = str(uuid.uuid4()) - definition["materialize"]["spec"]["requestRollout"] = request + if definition["materialize"]["apiVersion"] == "materialize.cloud/v1alpha1": + definition["materialize"]["spec"]["requestRollout"] = request definition["materialize"]["spec"]["forceRollout"] = request run(definition, False) time.sleep(120) # some time to make sure there is no downtime later @@ -2293,6 +2524,563 @@ def workflow_balancer(c: Composition, parser: WorkflowArgumentParser) -> None: run_balancer(definition, False) +def get_materialize_v1alpha1() -> dict[str, Any]: + """Get the first Materialize resource at v1alpha1.""" + data = json.loads( + spawn.capture( + [ + "kubectl", + "get", + "materializes.v1alpha1.materialize.cloud", + "-n", + "materialize-environment", + "-o", + "json", + ], + stderr=subprocess.DEVNULL, + ) + ) + return data["items"][0] + + +def get_materialize_at_stored_version() -> dict[str, Any]: + """Get the first Materialize resource at the stored version (currently v1alpha1).""" + return 
get_materialize_v1alpha1() + + +def get_materialize_status_at_stored_version() -> dict[str, Any] | None: + """Get the status of the first Materialize resource at the stored version.""" + return get_materialize_at_stored_version().get("status") + + +def get_materialize_v1() -> dict[str, Any]: + """Get the first Materialize resource at v1.""" + data = json.loads( + spawn.capture( + [ + "kubectl", + "get", + "materializes.v1.materialize.cloud", + "-n", + "materialize-environment", + "-o", + "json", + ], + stderr=subprocess.DEVNULL, + ) + ) + return data["items"][0] + + +def workflow_v1_opt_in( + c: Composition, + parser: WorkflowArgumentParser, +) -> None: + """Test that applying a v1 resource triggers reconciliation only + when the spec has changed, and not when it is unchanged. + + The conversion webhook computes a rollout hash from the v1 spec. + When converting to v1alpha1 for storage, it derives a deterministic + requestRollout UUID from the hash. When the spec is unchanged, the + derived UUID matches lastCompletedRolloutRequest, so no rollout occurs. + When the spec changes, the derived UUID differs, triggering a rollout. + """ + parser.add_argument( + "--recreate-cluster", + action=argparse.BooleanOptionalAction, + help="Recreate cluster if it exists already", + ) + parser.add_argument( + "--tag", + type=str, + help="Custom version tag to use", + ) + parser.add_argument( + "--orchestratord-override", + default=True, + action=argparse.BooleanOptionalAction, + help="Override orchestratord tag", + ) + args = parser.parse_args() + + definition = setup(c, args) + enable_v1_crd(definition) + + # Step 1: Deploy with v1, complete initial rollout. + definition["materialize"]["apiVersion"] = "materialize.cloud/v1" + init(definition) + run(definition, False) + print("Initial v1 deployment completed") + + # Record the initial v1alpha1 status. 
+ mz_v1 = get_materialize_v1alpha1() + initial_request_rollout = mz_v1["spec"]["requestRollout"] + initial_last_completed_rollout_request = mz_v1["status"][ + "lastCompletedRolloutRequest" + ] + assert ( + initial_request_rollout == initial_last_completed_rollout_request + ), f"Expected completed rollout: requestRollout={initial_request_rollout} != lastCompletedRolloutRequest={initial_last_completed_rollout_request}" + print(f"Initial requestRollout: {initial_request_rollout}") + + # Step 2: Re-apply the same spec at v1 (no changes). + # The conversion webhook should compute the same hash, deriving the + # same requestRollout UUID, so no new rollout should occur. + print("Re-applying same spec at v1 (expecting no rollout)...") + apply_materialize(definition) + + # Wait a bit and verify that no new rollout was triggered. + time.sleep(30) + mz_v1 = get_materialize_v1alpha1() + noop_request_rollout = mz_v1["spec"]["requestRollout"] + assert ( + noop_request_rollout == initial_request_rollout + ), f"Expected requestRollout unchanged, but changed from {initial_request_rollout} to {noop_request_rollout}" + noop_last_completed_rollout_request = mz_v1["status"]["lastCompletedRolloutRequest"] + assert ( + noop_last_completed_rollout_request == initial_last_completed_rollout_request + ), f"Expected lastCompletedRolloutRequest unchanged, but changed from {initial_last_completed_rollout_request} to {noop_last_completed_rollout_request}" + print("Confirmed: no rollout triggered by v1 apply with no changes") + + # Step 3: Apply at v1 with a spec change (extra env var). + # The conversion webhook should compute a different hash, deriving a + # different requestRollout UUID, triggering a rollout. 
+ print("Applying v1 with changed environmentdExtraEnv (expecting rollout)...") + definition["materialize"]["spec"]["environmentdExtraEnv"] = [ + {"name": "V1_OPT_IN_TEST", "value": "true"}, + ] + run(definition, False) + + mz_v1 = get_materialize_v1alpha1() + changed_request_rollout = mz_v1["spec"]["requestRollout"] + assert ( + changed_request_rollout != initial_request_rollout + ), f"Expected requestRollout to change after spec change, but still {changed_request_rollout}" + + def check_rollout_complete(): + mz_v1 = get_materialize_v1alpha1() + changed_last_completed_rollout_request = mz_v1["status"][ + "lastCompletedRolloutRequest" + ] + assert ( + changed_last_completed_rollout_request == changed_request_rollout + ), f"Expected rollout to complete: requestRollout={changed_request_rollout} != lastCompletedRolloutRequest={changed_last_completed_rollout_request}" + + retry(check_rollout_complete, 120) + print( + f"Confirmed: rollout triggered by v1 spec change. " + f"requestRollout changed from {initial_request_rollout} to {changed_request_rollout}" + ) + print("v1 opt-in test PASSED") + + +OPERATOR_CERT_SECRET = "operator-materialize-operator-cert" +OPERATOR_CA_SECRET = "operator-materialize-operator-ca" +MATERIALIZE_CRD = "materializes.materialize.cloud" + + +def conversion_webhook_works() -> bool: + """Returns whether the conversion webhook is currently functional. + + Reading the Materialize resource at v1 forces the Kubernetes API + server to call the conversion webhook to convert the stored v1alpha1 + object. If the webhook's serving certificate isn't trusted by the + caBundle registered in the CRD, the API server rejects the call and this + returns False. 
+ """ + result = subprocess.run( + [ + "kubectl", + "get", + "materializes.v1.materialize.cloud", + "-n", + "materialize-environment", + "-o", + "json", + ], + capture_output=True, + ) + if result.returncode != 0: + print( + f"conversion webhook probe failed: {result.stderr.decode(errors='replace')}" + ) + return False + items = json.loads(result.stdout).get("items", []) + return len(items) > 0 + + +def get_crd_ca_bundle() -> str: + """The caBundle the API server uses to trust the conversion webhook.""" + return spawn.capture( + [ + "kubectl", + "get", + "crd", + MATERIALIZE_CRD, + "-o", + "jsonpath={.spec.conversion.webhook.clientConfig.caBundle}", + ] + ).strip() + + +def get_secret_field(secret: str, field: str) -> str: + """A base64-encoded field from a secret in the materialize namespace.""" + return spawn.capture( + [ + "kubectl", + "get", + "secret", + secret, + "-n", + "materialize", + "-o", + rf"jsonpath={{.data.{field}}}", + ] + ).strip() + + +def get_serving_cert_ca() -> str: + """The ca.crt in the mounted serving-certificate secret (the root CA).""" + return get_secret_field(OPERATOR_CERT_SECRET, r"ca\.crt") + + +def get_serving_cert_leaf() -> str: + """The tls.crt (leaf) in the mounted serving-certificate secret.""" + return get_secret_field(OPERATOR_CERT_SECRET, r"tls\.crt") + + +def workflow_webhook_cert_rotation( + c: Composition, + parser: WorkflowArgumentParser, +) -> None: + """Test that the conversion webhook keeps working as its TLS certificate is + rotated, i.e. once the original certificate would have expired. + + The webhook is served by orchestratord using a certificate that + cert-manager rotates out-of-band, and orchestratord reloads the serving + certificate from disk periodically. The serving certificate is signed by a + stable root CA, so there are two distinct rotation cases, both tested here + without restarting orchestratord: + + 1. 
Serving-certificate rotation (the common case): the leaf rotates but + the CA -- and therefore the caBundle registered on the CRD -- stays the + same. The webhook must keep working with no caBundle change. + + 2. Root CA rotation (rare): ca.crt changes, so orchestratord must + re-register the CRD's caBundle to match the newly-served certificate, + otherwise the API server would reject every conversion request. + + Rather than wait out real certificate lifetimes, this test deploys with a + very short reload interval and forces cert-manager to reissue certificates + by deleting their secrets. + """ + parser.add_argument( + "--recreate-cluster", + action=argparse.BooleanOptionalAction, + help="Recreate cluster if it exists already", + ) + parser.add_argument( + "--tag", + type=str, + help="Custom version tag to use", + ) + parser.add_argument( + "--orchestratord-override", + default=True, + action=argparse.BooleanOptionalAction, + help="Override orchestratord tag", + ) + args = parser.parse_args() + + definition = setup(c, args) + enable_v1_crd(definition) + + # Use cert-manager (the default) so we exercise the real rotation path, + # and reload the certificate aggressively so a rotation is picked up in + # seconds rather than the default hour. + assert definition["operator"]["operator"]["certificate"]["source"] == "cert-manager" + definition["operator"]["operator"]["args"]["webhookCertReloadInterval"] = "5s" + + # Deploy a v1 resource. The initial apply already goes through the + # conversion webhook (v1 -> stored v1alpha1), so this confirms the + # webhook works before any rotation. 
+ definition["materialize"]["apiVersion"] = "materialize.cloud/v1" + init(definition) + apply_materialize(definition) + + def webhook_works() -> None: + assert conversion_webhook_works(), "conversion webhook is not working" + + retry(webhook_works, 120) + print("Conversion webhook works before rotation") + + initial_ca_bundle = get_crd_ca_bundle() + assert initial_ca_bundle, "expected a caBundle to be registered on the CRD" + + # --- Case 1: serving-certificate rotation, CA unchanged ------------------ + # + # Deleting the serving-certificate secret makes cert-manager reissue the + # leaf (new key + new tls.crt) signed by the same stable root CA, so ca.crt + # is unchanged. The webhook must keep working and the caBundle must NOT + # change. + leaf_before = get_serving_cert_leaf() + ca_before = get_serving_cert_ca() + print("Rotating the serving certificate (deleting the serving cert secret)...") + spawn.runv( + ["kubectl", "delete", "secret", OPERATOR_CERT_SECRET, "-n", "materialize"] + ) + + def leaf_rotated() -> None: + leaf = get_serving_cert_leaf() + assert ( + leaf and leaf != leaf_before + ), "cert-manager has not reissued the leaf yet" + + retry(leaf_rotated, 120) + assert ( + get_serving_cert_ca() == ca_before + ), "root CA changed during a serving-certificate rotation; expected it to be stable" + print("Serving certificate rotated; root CA is unchanged") + + # Give orchestratord time to reload the new leaf (interval is 5s), then + # confirm the webhook still works and the caBundle was left untouched. + retry(webhook_works, 120) + assert ( + get_crd_ca_bundle() == initial_ca_bundle + ), "caBundle changed even though the CA did not" + apply_materialize(definition) + print("Conversion webhook still works after serving-certificate rotation") + + # --- Case 2: root CA rotation -------------------------------------------- + # + # Rotate the root CA, then reissue the leaf so its ca.crt reflects the new + # CA. 
orchestratord must notice ca.crt changed and re-register the CRD's + # caBundle, otherwise the API server would reject conversions. + print("Rotating the root CA (deleting the CA secret)...") + spawn.runv(["kubectl", "delete", "secret", OPERATOR_CA_SECRET, "-n", "materialize"]) + + def ca_secret_rotated() -> None: + ca = get_secret_field(OPERATOR_CA_SECRET, r"tls\.crt") + assert ca, "cert-manager has not reissued the root CA yet" + + retry(ca_secret_rotated, 120) + + # Force the leaf to be re-signed by the new CA so the mounted ca.crt updates. + spawn.runv( + ["kubectl", "delete", "secret", OPERATOR_CERT_SECRET, "-n", "materialize"] + ) + + def serving_ca_changed() -> None: + assert ( + get_serving_cert_ca() != ca_before + ), "serving cert's ca.crt has not picked up the new root CA yet" + + retry(serving_ca_changed, 180) + print("Root CA rotated and serving certificate re-signed by the new CA") + + # orchestratord must refresh the CRD's caBundle to match the new CA. Waiting + # for the caBundle to change proves the re-registration happened. + def ca_bundle_refreshed() -> None: + current = get_crd_ca_bundle() + assert ( + current and current != initial_ca_bundle + ), "orchestratord has not refreshed the conversion webhook caBundle yet" + + retry(ca_bundle_refreshed, 300) + print("orchestratord refreshed the conversion webhook caBundle after CA rotation") + + # Decisive check: the old CA is gone, so the webhook can only work if the + # newly-served certificate is trusted via the refreshed caBundle. We never + # restarted orchestratord, so this exercises the in-process reload + + # re-registration path that runs in production. 
+ retry(webhook_works, 120) + apply_materialize(definition) + print("Conversion webhook still works after root CA rotation") + print("webhook cert rotation test PASSED") + + +def workflow_manually_promote( + c: Composition, + parser: WorkflowArgumentParser, +) -> None: + """Test ManuallyPromote rollout strategy with both v1alpha1 and v1 + force-promote mechanisms. + + Verifies that promotion can be triggered by setting forcePromote to either: + - The v1alpha1 requestRollout UUID + - The v1 requestedRolloutHash + """ + parser.add_argument( + "--recreate-cluster", + action=argparse.BooleanOptionalAction, + help="Recreate cluster if it exists already", + ) + parser.add_argument( + "--tag", + type=str, + help="Custom version tag to use", + ) + parser.add_argument( + "--orchestratord-override", + default=True, + action=argparse.BooleanOptionalAction, + help="Override orchestratord tag", + ) + args = parser.parse_args() + + definition = setup(c, args) + enable_v1_crd(definition) + + # Deploy with v1 and ManuallyPromote strategy. + definition["materialize"]["apiVersion"] = "materialize.cloud/v1" + definition["materialize"]["spec"]["rolloutStrategy"] = "ManuallyPromote" + init(definition) + run(definition, False) + print("Initial deployment with ManuallyPromote completed") + + # --- Test 1: Promote using v1alpha1 requestRollout UUID --- + print("Test 1: Promote using v1alpha1 requestRollout UUID") + + # Make a spec change to trigger a new rollout. + definition["materialize"]["spec"]["environmentdExtraEnv"] = [ + {"name": "MANUALLY_PROMOTE_TEST_1", "value": "true"}, + ] + apply_materialize(definition) + + wait_for_ready_to_promote() + + # Promote using v1alpha1 requestRollout. 
+ mz_v1 = get_materialize_v1alpha1() + request_rollout = mz_v1["spec"]["requestRollout"] + mz_name = mz_v1["metadata"]["name"] + print(f"Promoting via v1alpha1 requestRollout: {request_rollout}") + spawn.runv( + [ + "kubectl", + "patch", + "materializes.v1alpha1.materialize.cloud", + mz_name, + "-n", + "materialize-environment", + "--type=merge", + "-p", + json.dumps({"spec": {"forcePromote": request_rollout}}), + ], + ) + wait_for_rollout_complete() + print("Test 1 PASSED: Promotion via v1alpha1 requestRollout succeeded") + + # --- Test 2: Promote using v1 requestedRolloutHash --- + print("Test 2: Promote using v1 requestedRolloutHash") + + # Make another spec change to trigger a new rollout. + definition["materialize"]["spec"]["environmentdExtraEnv"] = [ + {"name": "MANUALLY_PROMOTE_TEST_2", "value": "true"}, + ] + apply_materialize(definition) + + wait_for_ready_to_promote() + + # Read the v1 status to get the requestedRolloutHash. + mz_v2 = get_materialize_v1() + requested_rollout_hash = mz_v2["status"]["requestedRolloutHash"] + mz_name = mz_v2["metadata"]["name"] + print(f"Promoting via v1 requestedRolloutHash: {requested_rollout_hash}") + + # Patch at v1 using the hash as forcePromote. 
+ spawn.runv( + [ + "kubectl", + "patch", + "materializes.v1.materialize.cloud", + mz_name, + "-n", + "materialize-environment", + "--type=merge", + "-p", + json.dumps({"spec": {"forcePromote": requested_rollout_hash}}), + ], + ) + wait_for_rollout_complete() + print("Test 2 PASSED: Promotion via v1 requestedRolloutHash succeeded") + + print("workflow_manually_promote PASSED") + + +def apply_materialize(definition: dict[str, Any]) -> None: + """Apply the materialize resource definition via kubectl.""" + defs = [ + definition["namespace"], + definition["secret"], + definition["materialize"], + ] + if "materialize2" in definition: + defs.append(definition["materialize2"]) + if "system_params_configmap" in definition: + defs.append(definition["system_params_configmap"]) + yaml_str = yaml.dump_all(defs) + print(f"Attempting to apply:\n{yaml_str}") + max_attempts = 120 + for attempt in range(max_attempts): + result = subprocess.run( + ["kubectl", "apply", "-f", "-"], + input=yaml_str.encode(), + capture_output=True, + ) + if result.returncode == 0: + break + stderr_str = result.stderr.decode(errors="replace") + if attempt < max_attempts - 1 and "connection refused" in stderr_str: + print(f"Webhook not yet reachable (attempt {attempt + 1}), retrying...") + time.sleep(2) + continue + print(f"Failed to apply: {result.stdout}\nSTDERR:{result.stderr}") + raise subprocess.CalledProcessError( + result.returncode, + result.args, + output=result.stdout, + stderr=result.stderr, + ) + + +def wait_for_ready_to_promote() -> None: + """Wait for the Materialize resource to reach ReadyToPromote status.""" + for _ in range(900): + time.sleep(1) + if is_ready_to_manually_promote(): + break + else: + print(yaml.dump(get_materialize_at_stored_version())) + raise RuntimeError("Never became ready for manual promotion") + + # Verify it stays in ReadyToPromote (doesn't auto-promote). 
+ time.sleep(30) + if not is_ready_to_manually_promote(): + print(yaml.dump(get_materialize_at_stored_version())) + raise RuntimeError("Stopped being ready for manual promotion before promoting") + + +def wait_for_rollout_complete() -> None: + """Wait for the rollout to complete (UpToDate condition becomes True).""" + for _ in range(900): + time.sleep(1) + try: + status = get_materialize_status_at_stored_version() + if not status: + continue + conditions = status.get("conditions", []) + if ( + conditions + and conditions[0]["type"] == "UpToDate" + and conditions[0]["status"] == "True" + ): + return + except subprocess.CalledProcessError: + pass + print(yaml.dump(get_materialize_at_stored_version())) + raise RuntimeError("Rollout never completed") + + def workflow_orchestratord_upgrade( c: Composition, parser: WorkflowArgumentParser, @@ -2391,8 +3179,22 @@ def check(): versions = get_all_self_managed_versions() versions.append(get_version(args.tag)) + def set_latest_supported_crd_version(definition: dict[str, Any]): + if operator_supports_v1(definition): + enable_v1_crd(definition) + definition["materialize"]["apiVersion"] = "materialize.cloud/v1" + else: + definition["materialize"]["apiVersion"] = "materialize.cloud/v1alpha1" + + def request_rollout_if_needed(definition: dict[str, Any]): + if definition["materialize"]["apiVersion"] == "materialize.cloud/v1alpha1": + definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4()) + else: + definition["materialize"]["spec"].pop("requestRollout", None) + print(f"running orchestratord {versions[-3]}") definition["operator"]["operator"]["image"]["tag"] = str(versions[-3]) + set_latest_supported_crd_version(definition) init(definition) check_orchestratord_version(versions[-3]) @@ -2409,7 +3211,13 @@ def check(): for version in versions[-2:]: print(f"running orchestratord {version}") definition["operator"]["operator"]["image"]["tag"] = str(version) + # Set before the helm upgrade so that operators which support 
the v1 + # CRD are deployed with --install-v1-crd and can serve the v1 + # apiVersion used below. + set_latest_supported_crd_version(definition) helm_install_operator(definition["operator"], upgrade=True) + wait_for_crd_established() + check_crd_versions(definition) check_orchestratord_version(version) print(f"running environmentd {version}") @@ -2417,7 +3225,8 @@ def check(): c.compose["services"]["environmentd"]["image"], str(version), ) - definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4()) + set_latest_supported_crd_version(definition) + request_rollout_if_needed(definition) run(definition, False) check_environmentd_version(version) check_clusterd_version(version) @@ -2425,10 +3234,22 @@ def check(): if str(version) != "v26.4.0": check_balancerd_version(version) + # We cannot roll back orchestratord versions once the CRD is updated, + # so let's just get a clean cluster and start over. + spawn.runv( + [ + "kind", + "delete", + "cluster", + "--name", + "kind", + ] + ) definition = setup(c, args) print(f"running orchestratord {versions[-3]}") definition["operator"]["operator"]["image"]["tag"] = str(versions[-3]) + set_latest_supported_crd_version(definition) init(definition) check_orchestratord_version(versions[-3]) @@ -2444,7 +3265,13 @@ def check(): print(f"running orchestratord {versions[-1]}") definition["operator"]["operator"]["image"]["tag"] = str(versions[-1]) + # Set before the helm upgrade so that operators which support the v1 CRD + # are deployed with --install-v1-crd and can serve the v1 apiVersion used + # below. 
+ set_latest_supported_crd_version(definition) helm_install_operator(definition["operator"], upgrade=True) + wait_for_crd_established() + check_crd_versions(definition) check_orchestratord_version(versions[-1]) print(f"running environmentd {versions[-1]}") @@ -2452,7 +3279,8 @@ def check(): c.compose["services"]["environmentd"]["image"], str(versions[-1]), ) - definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4()) + set_latest_supported_crd_version(definition) + request_rollout_if_needed(definition) run(definition, False) check_environmentd_version(versions[-1]) check_clusterd_version(versions[-1]) @@ -2492,6 +3320,11 @@ def get_cr(plural: str) -> dict[str, Any]: # orchestratord creates the Balancer and Console CRs after writing # `lastCompletedRolloutRequest`, so `post_run_check` can return # before they exist. Retry until the resource shows up. + # + # Use this only for the Balancer/Console CRs. The Materialize CR must + # be read via `get_materialize_at_stored_version`, since a bare + # `materializes` resolves to the default-served v1 version, which + # lacks `requestRollout`/`lastCompletedRolloutRequest`. 
result: dict[str, Any] = {} def fetch() -> None: @@ -2538,7 +3371,7 @@ def fetch() -> None: init(definition) run(definition, False) - initial_mz = get_cr("materializes") + initial_mz = get_materialize_at_stored_version() initial_request = initial_mz["spec"]["requestRollout"] assert initial_mz["status"]["lastCompletedRolloutRequest"] == initial_request assert ( @@ -2587,7 +3420,7 @@ def fetch() -> None: ) raise RuntimeError("upgrade never became ready for manual promotion") - parked_mz = get_cr("materializes") + parked_mz = get_materialize_at_stored_version() assert parked_mz["status"]["lastCompletedRolloutRequest"] == initial_request assert ( parked_mz["status"]["lastCompletedRolloutEnvironmentdImageRef"] == initial_image @@ -2616,7 +3449,7 @@ def fetch() -> None: ) def check_revert_applied() -> None: - mz = get_cr("materializes") + mz = get_materialize_at_stored_version() assert mz["spec"]["requestRollout"] == initial_request assert mz["spec"]["environmentdImageRef"] == upgrade_image @@ -2625,7 +3458,7 @@ def check_revert_applied() -> None: # Let orchestratord reconcile several times after the revert. 
time.sleep(30) - final_mz = get_cr("materializes") + final_mz = get_materialize_at_stored_version() assert final_mz["status"]["lastCompletedRolloutRequest"] == initial_request assert ( final_mz["status"]["lastCompletedRolloutEnvironmentdImageRef"] == initial_image @@ -2678,22 +3511,6 @@ def workflow_rollout_timeout(c: Composition, parser: WorkflowArgumentParser) -> ) args = parser.parse_args() - def get_mz() -> dict[str, Any]: - return json.loads( - spawn.capture( - [ - "kubectl", - "get", - "materializes", - "-n", - "materialize-environment", - "-o", - "json", - ], - stderr=subprocess.DEVNULL, - ) - )["items"][0] - definition = setup(c, args) # Initial deploy on a prior released version so the upgrade target (current @@ -2717,11 +3534,9 @@ def get_mz() -> dict[str, Any]: init(definition) run(definition, False) - initial_mz = get_mz() - assert ( - initial_mz["status"]["lastCompletedRolloutEnvironmentdImageRef"] - == initial_image - ) + initial_status = get_materialize_status_at_stored_version() + assert initial_status is not None + assert initial_status["lastCompletedRolloutEnvironmentdImageRef"] == initial_image # Start a default (WaitUntilReady) upgrade with a tiny timeout. The new # generation cannot become ready within the timeout, so the rollout will be @@ -2744,7 +3559,7 @@ def get_mz() -> dict[str, Any]: # Wait for orchestratord to observe the timeout and cancel the rollout. 
def check_cancelled() -> None: - status = get_mz().get("status") or {} + status = get_materialize_status_at_stored_version() or {} conditions = status.get("conditions") or [] assert conditions, "no status conditions yet" condition = conditions[0] @@ -2935,6 +3750,8 @@ def setup(c: Composition, args) -> dict[str, Any]: if cluster not in clusters or args.recreate_cluster: recreate_kind_cluster() + helm_install_cert_manager() + spawn.runv(["kubectl", "create", "namespace", "materialize"]) spawn.runv( @@ -3067,7 +3884,13 @@ def run_scenario( values=definition["operator"], upgrade=True, ) - definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4()) + # The set of served CRD versions may change between steps (e.g. + # when installV1CRD flips), so wait until the operator has + # re-registered the CRD before applying resources against it. + wait_for_crd_established() + check_crd_versions(definition) + if definition["materialize"]["apiVersion"] == "materialize.cloud/v1alpha1": + definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4()) run(definition, expect_fail) mod_dict = {mod.__class__: mod.value for mod in mods} for subclass in all_subclasses(Modification): @@ -3169,6 +3992,24 @@ def helm_install_operator( ) +def helm_install_cert_manager(): + spawn.runv( + [ + "helm", + "install", + "cert-manager", + "oci://quay.io/jetstack/charts/cert-manager", + "--version", + "v1.19.2", + "--namespace", + "cert-manager", + "--create-namespace", + "--set", + "crds.enabled=true", + ] + ) + + def init(definition: dict[str, Any]) -> None: # `--wait=true` blocks until the namespace is fully terminated. 
If the # timeout is hit and we proceed anyway, the next `kubectl apply` races the @@ -3199,6 +4040,42 @@ def init(definition: dict[str, Any]) -> None: ) wait_for_crd_established() + check_crd_versions(definition) + + +def check_crd_versions(definition: dict[str, Any]) -> None: + """Check that the v1 CRD version and the conversion webhook are installed + if and only if the operator was asked to install them.""" + + def check() -> None: + crd = json.loads( + spawn.capture( + ["kubectl", "get", "crd", MATERIALIZE_CRD, "-o", "json"], + stderr=subprocess.DEVNULL, + ) + ) + versions = sorted(version["name"] for version in crd["spec"]["versions"]) + conversion_strategy = (crd["spec"].get("conversion") or {}).get("strategy") + if operator_serves_v1(definition): + assert versions == [ + "v1", + "v1alpha1", + ], f"expected v1 and v1alpha1 to be served, got {versions}" + assert ( + conversion_strategy == "Webhook" + ), f"expected Webhook conversion, got {conversion_strategy}" + else: + assert versions == [ + "v1alpha1" + ], f"expected only v1alpha1 to be served, got {versions}" + # The API server defaults spec.conversion to {"strategy": "None"} + # (the string "None") when no conversion is configured. 
+ assert conversion_strategy in ( + None, + "None", + ), f"expected no conversion, got {conversion_strategy}" + + retry(check, 240) def wait_for_crd_established(): @@ -3239,82 +4116,32 @@ def wait_for_crd_established(): def run(definition: dict[str, Any], expect_fail: bool) -> None: - defs = [ - definition["namespace"], - definition["secret"], - definition["materialize"], - ] - if "materialize2" in definition: - defs.append(definition["materialize2"]) - if "system_params_configmap" in definition: - defs.append(definition["system_params_configmap"]) - try: - spawn.runv( - ["kubectl", "apply", "-f", "-"], - stdin=yaml.dump_all(defs).encode(), - ) - except subprocess.CalledProcessError as e: - print(f"Failed to apply: {e.stdout}\nSTDERR:{e.stderr}") - raise + apply_materialize(definition) if definition["materialize"]["spec"].get("rolloutStrategy") == "ManuallyPromote": - # First wait for it to become ready to promote, but not yet promoted - for _ in range(900): - time.sleep(1) - if is_ready_to_manually_promote(): - break - else: - spawn.runv( - [ - "kubectl", - "get", - "materializes", - "-n", - "materialize-environment", - "-o", - "yaml", - ], - ) - raise RuntimeError("Never became ready for manual promotion") - - # Wait to see that it doesn't promote - time.sleep(30) - if not is_ready_to_manually_promote(): + wait_for_ready_to_promote() + + # Manually promote it by reading the v1alpha1 resource to get the + # requestRollout UUID, then patching forcePromote to match it. + # Alternatively, forcePromote can be set to the v1 + # requestedRolloutHash (tested in workflow_manually_promote). 
+ mz = get_materialize_v1alpha1() + request_rollout = mz["spec"]["requestRollout"] + assert request_rollout is not None + mz_name = mz["metadata"]["name"] + try: spawn.runv( [ "kubectl", - "get", - "materializes", - "-n", - "materialize-environment", - "-o", - "yaml", - ], - ) - raise RuntimeError( - "Stopped being ready for manual promotion before promoting" - ) - - # Manually promote it - mz = json.loads( - spawn.capture( - [ - "kubectl", - "get", - "materializes", + "patch", + "materializes.v1alpha1.materialize.cloud", + mz_name, "-n", "materialize-environment", - "-o", - "json", + "--type=merge", + "-p", + json.dumps({"spec": {"forcePromote": request_rollout}}), ], - stderr=subprocess.DEVNULL, - ) - )["items"][0] - definition["materialize"]["spec"]["forcePromote"] = mz["spec"]["requestRollout"] - try: - spawn.runv( - ["kubectl", "apply", "-f", "-"], - stdin=yaml.dump(definition["materialize"]).encode(), ) except subprocess.CalledProcessError as e: print(f"Failed to apply: {e.stdout}\nSTDERR:{e.stderr}") @@ -3324,21 +4151,8 @@ def run(definition: dict[str, Any], expect_fail: bool) -> None: def is_ready_to_manually_promote(): - data = json.loads( - spawn.capture( - [ - "kubectl", - "get", - "materializes", - "-n", - "materialize-environment", - "-o", - "json", - ], - stderr=subprocess.DEVNULL, - ) - ) - conditions = data["items"][0].get("status", {}).get("conditions") + mz = get_materialize_at_stored_version() + conditions = mz.get("status", {}).get("conditions") return ( conditions is not None and len(conditions) @@ -3349,24 +4163,12 @@ def is_ready_to_manually_promote(): def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None: + # Read at the stored version explicitly to avoid going through the + # conversion webhook, which may not be ready yet during initial deployment. 
for i in range(900): time.sleep(1) try: - data = json.loads( - spawn.capture( - [ - "kubectl", - "get", - "materializes", - "-n", - "materialize-environment", - "-o", - "json", - ], - stderr=subprocess.DEVNULL, - ) - ) - status = data["items"][0].get("status") + status = get_materialize_status_at_stored_version() if not status: continue if expect_fail: @@ -3377,10 +4179,10 @@ def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None: or status["conditions"][0]["status"] != "True" ): continue - if ( - status["lastCompletedRolloutRequest"] - == data["items"][0]["spec"]["requestRollout"] + if status.get("lastCompletedRolloutHash") or status.get( + "lastCompletedRolloutRequest" ): + # TODO should I check somehow that this is the latest to handle upgrades? break except subprocess.CalledProcessError: pass @@ -3389,7 +4191,7 @@ def post_run_check(definition: dict[str, Any], expect_fail: bool) -> None: [ "kubectl", "get", - "materializes", + "materializes.v1alpha1.materialize.cloud", "-n", "materialize-environment", "-o",