@@ -22,9 +22,14 @@ const (
2222 pollInterval = 30 * time .Second
2323
2424 // localConfigVersionStart is the starting version for local config updates.
25- // We use a high number to avoid conflicts with remote config versions which
26- // start at 0. This ensures local and remote configs use separate version spaces.
27- localConfigVersionStart = 1_000_000
25+ // Local config uses high positive versions (1_000_000+) to avoid conflicts with
26+ // remote config versions (0, 1, 2, ...). At typical change rates (<100/day),
27+ // collision would require decades of continuous operation.
28+ localConfigVersionStart int32 = 1_000_000
29+
30+ // maxReloadRetries limits consecutive reloads when config keeps changing.
31+ // This prevents infinite loops if the file is constantly being modified.
32+ maxReloadRetries = 3
2833)
2934
3035// LocalConfigWatcher watches a local configuration file for changes and updates
@@ -48,11 +53,15 @@ type LocalConfigWatcher struct {
4853}
4954
5055// NewLocalConfigWatcher creates a new LocalConfigWatcher.
56+ // Panics if orchestrator is nil (programming error, not recoverable).
5157func NewLocalConfigWatcher (
5258 orchestrator * Orchestrator ,
5359 configPath string ,
5460 log * zerolog.Logger ,
5561) * LocalConfigWatcher {
62+ if orchestrator == nil {
63+ panic ("orchestrator cannot be nil" )
64+ }
5665 return & LocalConfigWatcher {
5766 orchestrator : orchestrator ,
5867 configPath : configPath ,
@@ -63,41 +72,65 @@ func NewLocalConfigWatcher(
6372}
6473
6574// Run starts the config watcher. It watches for file changes and listens
66- // for manual reload signals on reloadC. Blocks until ctx is cancelled.
67- func (w * LocalConfigWatcher ) Run (ctx context.Context , reloadC <- chan struct {}) error {
75+ // for manual reload signals on reloadC.
76+ //
77+ // Returns a channel that is closed when the watcher is ready to receive signals.
78+ // Callers should wait on this channel before starting the signal handler to avoid
79+ // race conditions where signals arrive before the watcher is listening.
80+ func (w * LocalConfigWatcher ) Run (ctx context.Context , reloadC <- chan struct {}) <- chan struct {} {
81+ readyC := make (chan struct {})
82+
6883 fileWatcher , err := watcher .NewFile ()
6984 if err != nil {
70- w .log .Warn ().Err (err ).Msg ("Failed to create file watcher, hot reload disabled" )
71- return w .runWithoutFileWatcher (ctx , reloadC )
85+ w .log .Warn ().Err (err ).Msg ("Failed to create file watcher, falling back to SIGHUP only" )
86+ go func () {
87+ w .log .Info ().Str ("config" , w .configPath ).Msg ("Configuration reload available via SIGHUP signal" )
88+ close (readyC )
89+ w .runWithoutFileWatcher (ctx , reloadC )
90+ }()
91+ return readyC
7292 }
7393
7494 if err := fileWatcher .Add (w .configPath ); err != nil {
75- w .log .Warn ().Err (err ).Str ("config" , w .configPath ).Msg ("Failed to watch config file, hot reload disabled" )
76- return w .runWithoutFileWatcher (ctx , reloadC )
95+ w .log .Warn ().Err (err ).Str ("config" , w .configPath ).Msg ("Failed to watch config file, falling back to SIGHUP only" )
96+ go func () {
97+ w .log .Info ().Str ("config" , w .configPath ).Msg ("Configuration reload available via SIGHUP signal" )
98+ close (readyC )
99+ w .runWithoutFileWatcher (ctx , reloadC )
100+ }()
101+ return readyC
77102 }
78103
79104 w .log .Info ().Str ("config" , w .configPath ).Msg ("Started watching configuration file for changes" )
80105
81106 go fileWatcher .Start (w )
82107
83- return w .runLoop (ctx , reloadC , fileWatcher )
108+ // Initialize lastModTime before signaling ready to avoid race with early SIGHUP
109+ w .initLastModTime ()
110+
111+ go func () {
112+ close (readyC )
113+ w .runLoop (ctx , reloadC , fileWatcher )
114+ }()
115+
116+ return readyC
84117}
85118
86119// runWithoutFileWatcher runs the watcher loop without file watching.
87120// Only manual SIGHUP reloads will work.
88- func (w * LocalConfigWatcher ) runWithoutFileWatcher (ctx context.Context , reloadC <- chan struct {}) error {
121+ func (w * LocalConfigWatcher ) runWithoutFileWatcher (ctx context.Context , reloadC <- chan struct {}) {
89122 for {
90123 select {
91124 case <- ctx .Done ():
92- return ctx . Err ()
125+ return
93126 case <- reloadC :
94127 w .doReload ()
95128 }
96129 }
97130}
98131
99132// runLoop is the main event loop that handles file changes and reload signals.
100- func (w * LocalConfigWatcher ) runLoop (ctx context.Context , reloadC <- chan struct {}, fileWatcher * watcher.File ) error {
133+ func (w * LocalConfigWatcher ) runLoop (ctx context.Context , reloadC <- chan struct {}, fileWatcher * watcher.File ) {
101134 // Use a stopped timer initially; we'll reset it when file changes occur
102135 debounceTimer := time .NewTimer (0 )
103136 if ! debounceTimer .Stop () {
@@ -108,9 +141,6 @@ func (w *LocalConfigWatcher) runLoop(ctx context.Context, reloadC <-chan struct{
108141 // Poll timer as fallback for when fsnotify misses changes
109142 pollTicker := time .NewTicker (pollInterval )
110143
111- // Initialize lastModTime
112- w .initLastModTime ()
113-
114144 defer func () {
115145 debounceTimer .Stop ()
116146 pollTicker .Stop ()
@@ -120,19 +150,17 @@ func (w *LocalConfigWatcher) runLoop(ctx context.Context, reloadC <-chan struct{
120150 for {
121151 select {
122152 case <- ctx .Done ():
123- return ctx . Err ()
153+ return
124154
125155 case <- reloadC :
126156 w .log .Info ().Msg ("Received reload signal" )
127157 w .doReload ()
128158
129159 case <- w .reloadChan :
130- // Stop existing timer and drain if necessary
160+ // Stop existing timer and drain if necessary.
161+ // If Stop() returns false, timer already expired and channel has value.
131162 if ! debounceTimer .Stop () && debounceActive {
132- select {
133- case <- debounceTimer .C :
134- default :
135- }
163+ <- debounceTimer .C
136164 }
137165 debounceTimer .Reset (debounceInterval )
138166 debounceActive = true
@@ -181,6 +209,16 @@ func (w *LocalConfigWatcher) checkFileChanged() bool {
181209 return false
182210}
183211
212+ // getModTime returns the modification time of the config file.
213+ // Returns zero time if file cannot be stat'd.
214+ func (w * LocalConfigWatcher ) getModTime () time.Time {
215+ info , err := os .Stat (w .configPath )
216+ if err != nil {
217+ return time.Time {}
218+ }
219+ return info .ModTime ()
220+ }
221+
184222// WatcherItemDidChange implements watcher.Notification interface.
185223// Called when the config file is modified.
186224func (w * LocalConfigWatcher ) WatcherItemDidChange (filepath string ) {
@@ -208,51 +246,72 @@ func (w *LocalConfigWatcher) WatcherDidError(err error) {
208246}
209247
210248// doReload performs the actual configuration reload.
211- // It is protected by a mutex to prevent concurrent reloads from racing on version numbers.
249+ // Uses TryLock to skip if another reload is already in progress.
250+ // If the config file changes during reload, it will retry up to maxReloadRetries times.
212251func (w * LocalConfigWatcher ) doReload () {
213- w .mu .Lock ()
214- defer w .mu .Unlock ()
215-
216- cfg , err := ReadLocalConfig (w .configPath )
217- if err != nil {
218- w .log .Error ().Err (err ).Str ("config" , w .configPath ).Msg ("Failed to read config file, keeping current configuration" )
252+ if ! w .mu .TryLock () {
253+ w .log .Info ().Msg ("Reload already in progress, skipping" )
219254 return
220255 }
256+ defer w .mu .Unlock ()
221257
222- // Convert and validate in single pass to avoid double serialization
223- configJSON , err := ConvertAndValidateLocalConfig (cfg )
224- if err != nil {
225- w .log .Error ().Err (err ).Msg ("Invalid configuration, keeping current configuration" )
226- return
227- }
258+ for i := range maxReloadRetries {
259+ startModTime := w .getModTime ()
228260
229- nextVersion := w .version + 1
261+ cfg , err := ReadLocalConfig (w .configPath )
262+ if err != nil {
263+ w .log .Error ().Err (err ).Str ("config" , w .configPath ).
264+ Msg ("Failed to read config file, keeping current configuration" )
265+ return
266+ }
230267
231- // Call UpdateConfig synchronously. If it hangs, the watcher blocks - same behavior
232- // as remote config updates. No timeout wrapper to avoid goroutine leaks and mutex issues.
233- resp := w .orchestrator .UpdateConfig (nextVersion , configJSON )
268+ configJSON , err := ConvertAndValidateLocalConfig (cfg )
269+ if err != nil {
270+ w .log .Error ().Err (err ).Msg ("Invalid configuration, keeping current configuration" )
271+ return
272+ }
234273
235- if resp .Err != nil {
236- w .log .Error ().Err (resp .Err ).Int32 ("version" , nextVersion ).Msg ("Orchestrator rejected configuration update" )
237- return
238- }
274+ nextVersion := w .version + 1
275+ resp := w .orchestrator .UpdateConfig (nextVersion , configJSON )
276+
277+ if resp .Err != nil {
278+ w .log .Error ().Err (resp .Err ).Int32 ("version" , nextVersion ).
279+ Msg ("Orchestrator rejected configuration update" )
280+ return
281+ }
239282
240- // Only increment version after successful apply
241- w .version = nextVersion
283+ w .version = resp .LastAppliedVersion
242284
243- // Update lastModTime to prevent poll from triggering duplicate reload
244- if info , err := os .Stat (w .configPath ); err == nil {
245- w .lastModTime = info .ModTime ()
285+ // Get mtime once to avoid TOCTOU race
286+ currentModTime := w .getModTime ()
287+ w .lastModTime = currentModTime
288+
289+ w .log .Info ().Int32 ("version" , resp .LastAppliedVersion ).
290+ Msg ("Configuration reloaded successfully" )
291+
292+ // Check if file changed during reload (using same mtime value)
293+ if ! currentModTime .After (startModTime ) {
294+ return // No changes during reload, done
295+ }
296+
297+ if i < maxReloadRetries - 1 {
298+ w .log .Debug ().Msg ("Config file changed during reload, reloading again" )
299+ }
246300 }
247301
248- w .log .Info ().
249- Int32 ("version" , nextVersion ).
250- Int32 ("applied_version" , resp .LastAppliedVersion ).
251- Msg ("Configuration reloaded successfully" )
302+ w .log .Warn ().Int ("retries" , maxReloadRetries ).
303+ Msg ("Config file keeps changing, giving up after max retries" )
252304}
253305
254306// ReloadConfig triggers a manual configuration reload.
255307// This is useful for programmatic reloads without SIGHUP.
256308func (w * LocalConfigWatcher ) ReloadConfig () {
257309 w .doReload ()
258310}
311+
312+ // Version returns the current config version (thread-safe).
313+ func (w * LocalConfigWatcher ) Version () int32 {
314+ w .mu .Lock ()
315+ defer w .mu .Unlock ()
316+ return w .version
317+ }
0 commit comments