@@ -10,6 +10,7 @@ import (
1010 "encoding/json"
1111 "fmt"
1212
13+ "dario.cat/mergo"
1314 apiequality "k8s.io/apimachinery/pkg/api/equality"
1415 apierrors "k8s.io/apimachinery/pkg/api/errors"
1516 kubeclient "sigs.k8s.io/controller-runtime/pkg/client"
@@ -108,38 +109,48 @@ func (r *RemoteConfigUpdater) handleStartExperiment(ctx context.Context, dda *v2
108109 return fmt .Errorf ("cannot start experiment: no currentRevision set (DDA not yet reconciled)" )
109110 }
110111
111- // Reject if any experiment lifecycle is active — FA must wait for the
112- // current experiment to be fully resolved (promoted, rolled back, timed out,
113- // or cleared) before starting a new one. This covers:
114- // - running: experiment in progress
115- // - rollback: stop received, restore pending
116- // - timeout: auto-rollback pending or just completed
117- // - aborted: awaiting FA acknowledgment
118- // - promoted: being cleared by reconciler
119- //
120- // The only exception is retrying the same experiment after a partial start
121- // failure (status updated but spec patch failed): ExpectedSpecHash is still
122- // set and the experiment ID matches.
112+ // Guard against starting an experiment while one is actively in progress.
113+ // Terminal phases (aborted, timeout, promoted) are safe to overwrite —
114+ // these experiments are done and a new start should clear them.
115+ // Active phases (running, rollback) must be resolved first.
123116 if dda .Status .Experiment != nil {
124117 exp := dda .Status .Experiment
125- // Allow retry of the same experiment if spec wasn't applied yet
126- if exp .Phase == v2alpha1 .ExperimentPhaseRunning &&
127- exp .ExpectedSpecHash != "" &&
128- signal .ExperimentID == exp .ID {
129- r .logger .Info ("Retrying startExperiment (prior attempt may have failed during spec patch)" ,
130- "id" , signal .ExperimentID )
131- } else {
118+ switch exp .Phase {
119+ case v2alpha1 .ExperimentPhaseAborted , v2alpha1 .ExperimentPhaseTimeout , v2alpha1 .ExperimentPhasePromoted :
120+ // Terminal — safe to start a new experiment, will overwrite
121+ r .logger .Info ("Starting new experiment, replacing terminal experiment" ,
122+ "oldID" , exp .ID , "oldPhase" , exp .Phase , "newID" , signal .ExperimentID )
123+
124+ case v2alpha1 .ExperimentPhaseRunning :
125+ // Allow retry of the same experiment if spec wasn't applied yet
126+ if exp .ExpectedSpecHash != "" && signal .ExperimentID == exp .ID {
127+ r .logger .Info ("Retrying startExperiment (prior attempt may have failed during spec patch)" ,
128+ "id" , signal .ExperimentID )
129+ } else {
130+ return fmt .Errorf ("cannot start experiment %s: experiment %s is active (phase=%s)" ,
131+ signal .ExperimentID , exp .ID , exp .Phase )
132+ }
133+
134+ case v2alpha1 .ExperimentPhaseRollback :
135+ // Rollback in progress — must complete before starting new experiment
132136 return fmt .Errorf ("cannot start experiment %s: experiment %s is active (phase=%s)" ,
133137 signal .ExperimentID , exp .ID , exp .Phase )
134138 }
135139 }
136140
137- // Compute hash of the FA payload AFTER applying defaults, so it matches
138- // the canonical form the reconciler uses (defaults.DefaultDatadogAgentSpec
139- // runs before HandleExperimentLifecycle).
140- configCopy := signal .Config .DeepCopy ()
141- defaults .DefaultDatadogAgentSpec (configCopy )
142- specHash , err := experiment .ComputeSpecHash (configCopy )
141+ // Merge the FA patch into the current spec. FA sends only the fields it
142+ // wants to change; everything else is preserved from the current DDA spec.
143+ mergedSpec := dda .Spec .DeepCopy ()
144+ if err := mergo .Merge (mergedSpec , signal .Config , mergo .WithOverride ); err != nil {
145+ return fmt .Errorf ("failed to merge experiment config into current spec: %w" , err )
146+ }
147+
148+ // Compute hash of the MERGED+DEFAULTED spec, so it matches the canonical
149+ // form the reconciler uses (defaults.DefaultDatadogAgentSpec runs before
150+ // HandleExperimentLifecycle).
151+ defaultedSpec := mergedSpec .DeepCopy ()
152+ defaults .DefaultDatadogAgentSpec (defaultedSpec )
153+ specHash , err := experiment .ComputeSpecHash (defaultedSpec )
143154 if err != nil {
144155 return fmt .Errorf ("failed to compute spec hash for experiment config: %w" , err )
145156 }
@@ -160,10 +171,10 @@ func (r *RemoteConfigUpdater) handleStartExperiment(ctx context.Context, dda *v2
160171 return fmt .Errorf ("failed to re-fetch DDA after status update: %w" , err )
161172 }
162173
163- // Step 3: Patch DDA spec with experiment config
164- refreshed .Spec = * signal . Config
174+ // Step 3: Apply merged spec to DDA
175+ refreshed .Spec = * mergedSpec
165176 if err := r .kubeClient .Update (ctx , refreshed ); err != nil {
166- return fmt .Errorf ("failed to patch DDA spec for startExperiment: %w" , err )
177+ return fmt .Errorf ("failed to update DDA spec for startExperiment: %w" , err )
167178 }
168179
169180 r .logger .Info ("Started experiment" , "id" , signal .ExperimentID )
0 commit comments