@@ -21,6 +21,7 @@ type lifecycleManager struct {
2121 provider * fractionProvider // Provider for fraction operations
2222 flags * StateManager // Storage state flags
2323 registry * fractionRegistry // Fraction state registry
24+ tasks * TaskManager // Background offloading tasks
2425
2526 sealingWg sync.WaitGroup
2627}
@@ -36,16 +37,21 @@ func newLifecycleManager(
3637 provider : provider ,
3738 flags : flags ,
3839 registry : registry ,
40+ tasks : NewTaskManager (),
3941 }
4042}
4143
4244// Maintain performs periodic lifecycle management tasks.
4345// It is a CORE method of lifecycleManager
4446// Coordinates rotation, offloading, cleanup based on configuration.
4547func (lc * lifecycleManager ) Maintain (ctx context.Context , config * Config , wg * sync.WaitGroup ) {
46- lc .Rotate (config .FracSize , wg )
48+ maxTotalSize := config .TotalSize + config .OffloadingQueueSize
49+ lc .ManageRotation (config .FracSize , maxTotalSize , wg )
4750 if config .OffloadingEnabled {
48- lc .OffloadLocal (ctx , config .TotalSize , wg )
51+ lc .OffloadLocal (ctx , config .TotalSize , config .OffloadingRetryDelay , wg )
52+ if config .OffloadingQueueSize > 0 {
53+ lc .RemoveOverflowed (config .OffloadingQueueSize , wg )
54+ }
4955 lc .CleanRemote (config .OffloadingRetention , wg )
5056 } else {
5157 lc .CleanLocal (config .TotalSize , wg )
@@ -85,57 +91,95 @@ func (lc *lifecycleManager) Seal(active *activeProxy) error {
8591 return nil
8692}
8793
88- // RotateIfNeeded checks if active fraction needs rotation based on size limit
89- // Creates new active fraction and starts sealing the previous one.
90- func (lc * lifecycleManager ) Rotate (sizeLimit uint64 , wg * sync.WaitGroup ) {
91- if lc .registry .Active ().instance .Info ().DocsOnDisk > sizeLimit {
92- active := lc .rotate ()
93-
94- wg .Add (1 )
95- lc .sealingWg .Add (1 )
96- go func () {
97- defer wg .Done ()
98- defer lc .sealingWg .Done ()
99- if err := lc .Seal (active ); err != nil {
100- logger .Fatal ("sealing error" , zap .Error (err ))
101- }
102- }()
94+ // ManageRotation checks if rotation is needed and manages the entire process
95+ // including suspension checks, fraction rotation, and asynchronous sealing.
96+ func (lc * lifecycleManager ) ManageRotation (maxActiveSize , maxTotalSize uint64 , wg * sync.WaitGroup ) {
97+ if maxTotalSize > 0 && lc .registry .SuspendIfOverCapacity (maxTotalSize ) {
98+ return
10399 }
104- }
105100
106- func (lc * lifecycleManager ) rotate () * activeProxy {
107- active , err := lc .registry .Rotate (newActiveProxy (lc .provider .CreateActive ()))
101+ activeToSeal , err := lc .registry .RotateIfNeeded (maxActiveSize , func () * activeProxy {
102+ return newActiveProxy (lc .provider .CreateActive ())
103+ })
108104 if err != nil {
109105 logger .Fatal ("active fraction rotation error" , zap .Error (err ))
110106 }
111- return active
107+ if activeToSeal == nil {
108+ return
109+ }
110+
111+ wg .Add (1 )
112+ lc .sealingWg .Add (1 )
113+ go func () {
114+ defer wg .Done ()
115+ defer lc .sealingWg .Done ()
116+ if err := lc .Seal (activeToSeal ); err != nil {
117+ logger .Fatal ("sealing error" , zap .Error (err ))
118+ }
119+ }()
112120}
113121
114122// OffloadLocal starts offloading of local fractions to remote storage
115123// Selects fractions based on disk space usage and retention policy.
116- func (lc * lifecycleManager ) OffloadLocal (ctx context.Context , sizeLimit uint64 , wg * sync.WaitGroup ) {
124+ func (lc * lifecycleManager ) OffloadLocal (ctx context.Context , sizeLimit uint64 , retryDelay time. Duration , wg * sync.WaitGroup ) {
117125 toOffload , err := lc .registry .EvictLocal (true , sizeLimit )
118126 if err != nil {
119127 logger .Fatal ("error releasing old fractions:" , zap .Error (err ))
120128 }
121129 for _ , sealed := range toOffload {
122130 wg .Add (1 )
123- go func () {
131+ lc . tasks . Run ( sealed . instance . BaseFileName , ctx , func (ctx context. Context ) {
124132 defer wg .Done ()
125133
126- remote , _ := lc .TryOffload (ctx , sealed .instance )
134+ remote := lc .OffloadWithRetry (ctx , sealed .instance , retryDelay )
135+
127136 lc .registry .PromoteToRemote (sealed , remote )
128137
129138 if remote == nil {
130139 sealed .proxy .Redirect (emptyFraction {})
140+ lc .infoCache .Remove (sealed .instance .Info ().Name ())
131141 } else {
132142 sealed .proxy .Redirect (remote )
133143 }
134144
135145 // Free up local resources
136146 sealed .instance .Suicide ()
137147 maintenanceTruncateTotal .Add (1 )
138- }()
148+ })
149+ }
150+ }
151+
152+ // OffloadWithRetry attempts to offload a fraction with retries until success or cancellation.
153+ // Returns the remote fraction instance and a boolean indicating whether offloading was not canceled.
154+ func (lc * lifecycleManager ) OffloadWithRetry (ctx context.Context , sealed * frac.Sealed , retryDelay time.Duration ) * frac.Remote {
155+ start := time .Now ()
156+ for i := 0 ; ; i ++ {
157+ remote , err := lc .TryOffload (ctx , sealed )
158+ if err == nil {
159+ return remote
160+ }
161+
162+ logger .Warn (
163+ "fail to offload fraction" ,
164+ zap .String ("name" , sealed .BaseFileName ),
165+ zap .Duration ("offloading_time" , time .Since (start )),
166+ zap .Int ("attempts" , i ),
167+ zap .Error (err ),
168+ )
169+
170+ select {
171+ case <- ctx .Done ():
172+ logger .Info (
173+ "fraction offloading was stopped" ,
174+ zap .String ("name" , sealed .BaseFileName ),
175+ zap .Duration ("offloading_time" , time .Since (start )),
176+ zap .Int ("attempts" , i ),
177+ zap .Error (ctx .Err ()),
178+ )
179+ return nil
180+ case <- time .After (retryDelay ):
181+ // Wait before next retry attempt
182+ }
139183 }
140184}
141185
@@ -201,6 +245,21 @@ func (lc *lifecycleManager) CleanLocal(sizeLimit uint64, wg *sync.WaitGroup) {
201245 }()
202246}
203247
248+ // RemoveOverflowed removes fractions from offloading queue that exceed size limit
249+ // Stops ongoing offloading tasks and cleans up both local and remote resources.
250+ func (lc * lifecycleManager ) RemoveOverflowed (sizeLimit uint64 , wg * sync.WaitGroup ) {
251+ evicted := lc .registry .EvictOverflowed (sizeLimit )
252+ for _ , item := range evicted {
253+ wg .Add (1 )
254+ go func () {
255+ defer wg .Done ()
256+ // Cancel the offloading task - this operation may take significant time
257+ // hence executed in a separate goroutine to avoid blocking
258+ lc .tasks .Cancel (item .instance .BaseFileName )
259+ }()
260+ }
261+ }
262+
204263// UpdateOldestMetric updates the prometheus metric with oldest fraction timestamp
205264func (lc * lifecycleManager ) UpdateOldestMetric () {
206265 oldestFracTime .WithLabelValues ("remote" ).Set ((time .Duration (lc .registry .OldestTotal ()) * time .Millisecond ).Seconds ())
0 commit comments