From d126a6ea451b7aaceddcc46d2833a076206a0424 Mon Sep 17 00:00:00 2001 From: Jason Lynch Date: Thu, 2 Jul 2026 15:46:49 -0400 Subject: [PATCH] fix: needs_recreate behavior Resources can sometimes fail partway through the creation process, leaving files or other traces behind on the host. An example of this scenario is when we're able to create a Swarm service, but it fails to start. Rather than rollback the create with a delete, which could also be broken or dangerous, we record the resource into the state with the `needs_recreate` flag set to `true`. This way, if the user deletes the database or makes another update that would delete the resource, we know to run that resource's `Delete` method to ensure that it's removed from the host. Conversely, we know that we should run this resource's `Create` method again rather than its `Update` method if the user re-attempts the operation. In this case, the intent was to call `Create` in whichever phase it would normally be recreated if it didn't exist. However, since our planning process only looks at differences between states, it sees resources that need to be recreated in the initial state, even if there aren't any diffs. This lead to problems when the resource's normal phase was later in the update plan, such as when you fix a misconfigured supporting service resource. This commit modifies an existing process that handled pending deletions so that it removes resources that are pending recreation from the start state. This enables us to plan their creation at the appropriate phase. I've included a golden test that demonstrates this behavior. I've also modified some existing PostgREST E2Es to test this behavior because because that's where we first noticed this issue. PLAT-668 --- e2e/postgrest_test.go | 26 +++++++++++ ...pdate_after_failed_service_deployment.json | 43 +++++++++++++++++++ .../database/operations/update_database.go | 5 ++- .../operations/update_database_test.go | 42 ++++++++++++++++++ server/internal/resource/state.go | 13 ++++-- 5 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 server/internal/database/operations/golden_test/TestUpdateDatabase/update_after_failed_service_deployment.json diff --git a/e2e/postgrest_test.go b/e2e/postgrest_test.go index 4d634aa2..39717ff7 100644 --- a/e2e/postgrest_test.go +++ b/e2e/postgrest_test.go @@ -203,6 +203,18 @@ func TestPostgRESTPreflight_MissingSchema(t *testing.T) { }) require.Error(t, err, "expected update task to fail due to missing schema") assert.Contains(t, err.Error(), "nonexistent_schema", "error should mention the missing schema") + + // It should succeed when we remove the nonexistent schema. + err = db.Update(ctx, UpdateOptions{ + Spec: postgrestBaseSpec( + "test_postgrest_preflight_schema", + []string{host1}, + []*controlplane.ServiceSpec{ + postgrestSpec(host1, 0, nil), + }, + ), + }) + require.NoError(t, err) } // TestPostgRESTPreflight_MissingAnonRole verifies the preflight check rejects @@ -236,6 +248,20 @@ func TestPostgRESTPreflight_MissingAnonRole(t *testing.T) { }) require.Error(t, err, "expected update task to fail due to missing anon role") assert.Contains(t, err.Error(), "nonexistent_role", "error should mention the missing role") + + // It should succeed when we switch to an existing role. + err = db.Update(ctx, UpdateOptions{ + Spec: postgrestBaseSpec( + "test_postgrest_preflight_role", + []string{host1}, + []*controlplane.ServiceSpec{ + postgrestSpec(host1, 0, map[string]any{ + "db_anon_role": "anon", + }), + }, + ), + }) + require.NoError(t, err) } // TestPostgRESTHealthCheck verifies the service responds to HTTP requests once running. diff --git a/server/internal/database/operations/golden_test/TestUpdateDatabase/update_after_failed_service_deployment.json b/server/internal/database/operations/golden_test/TestUpdateDatabase/update_after_failed_service_deployment.json new file mode 100644 index 00000000..18994d9e --- /dev/null +++ b/server/internal/database/operations/golden_test/TestUpdateDatabase/update_after_failed_service_deployment.json @@ -0,0 +1,43 @@ +[ + [ + [ + { + "type": "create", + "resource_id": "swarm.network::database-id", + "reason": "does_not_exist", + "diff": null + } + ], + [ + { + "type": "create", + "resource_id": "swarm.service_instance_spec::database-id-test-svc-host-1-id", + "reason": "does_not_exist", + "diff": null + } + ], + [ + { + "type": "create", + "resource_id": "swarm.service_instance::database-id-test-svc-host-1-id", + "reason": "does_not_exist", + "diff": null + } + ], + [ + { + "type": "create", + "resource_id": "monitor.service_instance::database-id-test-svc-host-1-id", + "reason": "does_not_exist", + "diff": null + } + ], + [ + { + "type": "delete", + "resource_id": "swarm.network::database-id-should-delete", + "diff": null + } + ] + ] +] \ No newline at end of file diff --git a/server/internal/database/operations/update_database.go b/server/internal/database/operations/update_database.go index 69b019cb..3ec57341 100644 --- a/server/internal/database/operations/update_database.go +++ b/server/internal/database/operations/update_database.go @@ -40,6 +40,9 @@ func UpdateDatabase( nodes []*NodeResources, services []*ServiceResources, ) ([]resource.Plan, error) { + // avoid modifying the caller's copy of the start state + start = start.Clone() + update, err := updateFunc(options) if err != nil { return nil, err @@ -81,7 +84,7 @@ func UpdateDatabase( } // Mark resources not in the end state with PendingDeletion = true so that // we skip updating them. - start.MarkPendingDeletion(end) + start.PreparePendingDeletesAndRecreates(end) // The states produced by the *Nodes functions are just diffs. Here's where // we create a sequence of incremental updates by iteratively applying those diff --git a/server/internal/database/operations/update_database_test.go b/server/internal/database/operations/update_database_test.go index fbcd178c..7cf301a1 100644 --- a/server/internal/database/operations/update_database_test.go +++ b/server/internal/database/operations/update_database_test.go @@ -216,6 +216,35 @@ func TestUpdateDatabase(t *testing.T) { ), ) + failedService := makeServiceResources(t, "database-id", "test-svc", "host-1-id", nil) + // This first resource should have a planned delete since it won't be in the + // end state. + failedService.Resources[0].Identifier.ID += "-should-delete" + failedService.Resources[0].NeedsRecreate = true + // This resource should have a planned create since it is in the end state. + failedService.Resources[1].NeedsRecreate = true + failedServiceState := makeState(t, + []resource.Resource{ + n1Instance1.Instance, + makeMonitorResource(n1Instance1), + &database.NodeResource{ + Name: "n1", + PrimaryInstanceID: n1Instance1.InstanceID(), + InstanceIDs: []string{n1Instance1.InstanceID()}, + }, + &database.PostgresDatabaseResource{ + NodeName: "n1", + DatabaseName: "test", + }, + }, + slices.Concat( + n1Instance1.InstanceDependencies, + // Only the first two resources got deployed, and both are marked + // with needs_recreate = true. + failedService.Resources[:2], + ), + ) + for _, tc := range []struct { name string options operations.UpdateDatabaseOptions @@ -629,6 +658,19 @@ func TestUpdateDatabase(t *testing.T) { }, services: []*operations.ServiceResources{svcRes}, }, + { + name: "update after failed service deployment", + options: operations.UpdateDatabaseOptions{}, + start: failedServiceState, + nodes: []*operations.NodeResources{ + { + NodeName: "n1", + InstanceResources: []*database.InstanceResources{n1Instance1}, + DatabaseName: "test", + }, + }, + services: []*operations.ServiceResources{svcRes}, + }, } { t.Run(tc.name, func(t *testing.T) { plans, err := operations.UpdateDatabase( diff --git a/server/internal/resource/state.go b/server/internal/resource/state.go index ce4520a8..a12b4b15 100644 --- a/server/internal/resource/state.go +++ b/server/internal/resource/state.go @@ -139,9 +139,11 @@ func (s *State) Merge(other *State) { } } -// MarkPendingDeletion takes an end state and marks all current resources that -// aren't in the end state with PendingDeletion = true. -func (s *State) MarkPendingDeletion(end *State) { +// PreparePendingDeletesAndRecreates takes an end state and marks all current +// resources that aren't in the end state with PendingDeletion = true. Resources +// that need to be recreated and are in the end state are removed from this +// state so that we can plan their creation in the appropriate phase. +func (s *State) PreparePendingDeletesAndRecreates(end *State) { for t, byType := range s.Resources { endByType, ok := end.Resources[t] if !ok { @@ -153,6 +155,11 @@ func (s *State) MarkPendingDeletion(end *State) { for id, resource := range byType { if _, ok := endByType[id]; !ok { resource.PendingDeletion = true + } else if resource.NeedsRecreate { + // This resource will be recreated since it's in the end state. + // Removing the old copy of it from the start state avoids an + // accidental create before it would normally occur. + delete(byType, id) } } }