Skip to content

Commit 17e34e1

Browse files
committed
Merge FA patch
1 parent bddf9dc commit 17e34e1

2 files changed

Lines changed: 72 additions & 29 deletions

File tree

pkg/remoteconfig/experiment.go

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"encoding/json"
1111
"fmt"
1212

13+
"dario.cat/mergo"
1314
apiequality "k8s.io/apimachinery/pkg/api/equality"
1415
apierrors "k8s.io/apimachinery/pkg/api/errors"
1516
kubeclient "sigs.k8s.io/controller-runtime/pkg/client"
@@ -108,38 +109,48 @@ func (r *RemoteConfigUpdater) handleStartExperiment(ctx context.Context, dda *v2
108109
return fmt.Errorf("cannot start experiment: no currentRevision set (DDA not yet reconciled)")
109110
}
110111

111-
// Reject if any experiment lifecycle is active — FA must wait for the
112-
// current experiment to be fully resolved (promoted, rolled back, timed out,
113-
// or cleared) before starting a new one. This covers:
114-
// - running: experiment in progress
115-
// - rollback: stop received, restore pending
116-
// - timeout: auto-rollback pending or just completed
117-
// - aborted: awaiting FA acknowledgment
118-
// - promoted: being cleared by reconciler
119-
//
120-
// The only exception is retrying the same experiment after a partial start
121-
// failure (status updated but spec patch failed): ExpectedSpecHash is still
122-
// set and the experiment ID matches.
112+
// Guard against starting an experiment while one is actively in progress.
113+
// Terminal phases (aborted, timeout, promoted) are safe to overwrite —
114+
// these experiments are done and a new start should clear them.
115+
// Active phases (running, rollback) must be resolved first.
123116
if dda.Status.Experiment != nil {
124117
exp := dda.Status.Experiment
125-
// Allow retry of the same experiment if spec wasn't applied yet
126-
if exp.Phase == v2alpha1.ExperimentPhaseRunning &&
127-
exp.ExpectedSpecHash != "" &&
128-
signal.ExperimentID == exp.ID {
129-
r.logger.Info("Retrying startExperiment (prior attempt may have failed during spec patch)",
130-
"id", signal.ExperimentID)
131-
} else {
118+
switch exp.Phase {
119+
case v2alpha1.ExperimentPhaseAborted, v2alpha1.ExperimentPhaseTimeout, v2alpha1.ExperimentPhasePromoted:
120+
// Terminal — safe to start a new experiment, will overwrite
121+
r.logger.Info("Starting new experiment, replacing terminal experiment",
122+
"oldID", exp.ID, "oldPhase", exp.Phase, "newID", signal.ExperimentID)
123+
124+
case v2alpha1.ExperimentPhaseRunning:
125+
// Allow retry of the same experiment if spec wasn't applied yet
126+
if exp.ExpectedSpecHash != "" && signal.ExperimentID == exp.ID {
127+
r.logger.Info("Retrying startExperiment (prior attempt may have failed during spec patch)",
128+
"id", signal.ExperimentID)
129+
} else {
130+
return fmt.Errorf("cannot start experiment %s: experiment %s is active (phase=%s)",
131+
signal.ExperimentID, exp.ID, exp.Phase)
132+
}
133+
134+
case v2alpha1.ExperimentPhaseRollback:
135+
// Rollback in progress — must complete before starting new experiment
132136
return fmt.Errorf("cannot start experiment %s: experiment %s is active (phase=%s)",
133137
signal.ExperimentID, exp.ID, exp.Phase)
134138
}
135139
}
136140

137-
// Compute hash of the FA payload AFTER applying defaults, so it matches
138-
// the canonical form the reconciler uses (defaults.DefaultDatadogAgentSpec
139-
// runs before HandleExperimentLifecycle).
140-
configCopy := signal.Config.DeepCopy()
141-
defaults.DefaultDatadogAgentSpec(configCopy)
142-
specHash, err := experiment.ComputeSpecHash(configCopy)
141+
// Merge the FA patch into the current spec. FA sends only the fields it
142+
// wants to change; everything else is preserved from the current DDA spec.
143+
mergedSpec := dda.Spec.DeepCopy()
144+
if err := mergo.Merge(mergedSpec, signal.Config, mergo.WithOverride); err != nil {
145+
return fmt.Errorf("failed to merge experiment config into current spec: %w", err)
146+
}
147+
148+
// Compute hash of the MERGED+DEFAULTED spec, so it matches the canonical
149+
// form the reconciler uses (defaults.DefaultDatadogAgentSpec runs before
150+
// HandleExperimentLifecycle).
151+
defaultedSpec := mergedSpec.DeepCopy()
152+
defaults.DefaultDatadogAgentSpec(defaultedSpec)
153+
specHash, err := experiment.ComputeSpecHash(defaultedSpec)
143154
if err != nil {
144155
return fmt.Errorf("failed to compute spec hash for experiment config: %w", err)
145156
}
@@ -160,10 +171,10 @@ func (r *RemoteConfigUpdater) handleStartExperiment(ctx context.Context, dda *v2
160171
return fmt.Errorf("failed to re-fetch DDA after status update: %w", err)
161172
}
162173

163-
// Step 3: Patch DDA spec with experiment config
164-
refreshed.Spec = *signal.Config
174+
// Step 3: Apply merged spec to DDA
175+
refreshed.Spec = *mergedSpec
165176
if err := r.kubeClient.Update(ctx, refreshed); err != nil {
166-
return fmt.Errorf("failed to patch DDA spec for startExperiment: %w", err)
177+
return fmt.Errorf("failed to update DDA spec for startExperiment: %w", err)
167178
}
168179

169180
r.logger.Info("Started experiment", "id", signal.ExperimentID)

pkg/remoteconfig/experiment_test.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ func newTestDDA() *v2alpha1.DatadogAgent {
4949
APM: &v2alpha1.APMFeatureConfig{
5050
Enabled: apiutils.NewBoolPointer(false),
5151
},
52+
NPM: &v2alpha1.NPMFeatureConfig{
53+
Enabled: apiutils.NewBoolPointer(true),
54+
},
5255
},
5356
},
5457
}
@@ -222,9 +225,11 @@ func TestHandleStartExperiment_Success(t *testing.T) {
222225
assert.NotEmpty(t, updated.Status.Experiment.ExpectedSpecHash,
223226
"ExpectedSpecHash should be set for first-reconcile validation")
224227

225-
// Verify spec was patched
228+
// Verify spec was merged (not replaced)
226229
assert.True(t, apiutils.BoolValue(updated.Spec.Features.APM.Enabled),
227230
"APM should be enabled after startExperiment")
231+
assert.True(t, apiutils.BoolValue(updated.Spec.Features.NPM.Enabled),
232+
"NPM should be preserved from original spec (merge, not replace)")
228233
}
229234

230235
func TestHandleStartExperiment_MissingConfig(t *testing.T) {
@@ -398,6 +403,33 @@ func TestHandleStartExperiment_DuringRollback(t *testing.T) {
398403
assert.Equal(t, "exp-001", updated.Status.Experiment.ID)
399404
}
400405

406+
func TestHandleStartExperiment_AfterAborted(t *testing.T) {
407+
// A prior experiment was aborted (external edit). A new start should
408+
// clear the aborted state and start fresh — not get stuck forever.
409+
dda := newTestDDAWithExperiment(v2alpha1.ExperimentPhaseAborted)
410+
r := newUpdater(dda)
411+
412+
signal := &ExperimentSignal{
413+
Action: ExperimentActionStart,
414+
ExperimentID: "exp-002",
415+
Config: &v2alpha1.DatadogAgentSpec{
416+
Features: &v2alpha1.DatadogFeatures{
417+
APM: &v2alpha1.APMFeatureConfig{
418+
Enabled: apiutils.NewBoolPointer(true),
419+
},
420+
},
421+
},
422+
}
423+
424+
err := r.handleExperimentSignal(context.TODO(), signal)
425+
require.NoError(t, err, "should allow new experiment after aborted")
426+
427+
updated := getDDA(t, r)
428+
require.NotNil(t, updated.Status.Experiment)
429+
assert.Equal(t, v2alpha1.ExperimentPhaseRunning, updated.Status.Experiment.Phase)
430+
assert.Equal(t, "exp-002", updated.Status.Experiment.ID)
431+
}
432+
401433
// ============================================================================
402434
// handleStopExperiment tests
403435
// ============================================================================

0 commit comments

Comments
 (0)