Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
4ee369f
feat: add scheduled snapshots with retention cleanup
sjmiller609 Mar 9, 2026
c16caac
fix: aggregate scheduled snapshot runner errors
sjmiller609 Mar 9, 2026
c13fcdc
perf: scan schedule index and reduce lock contention
sjmiller609 Mar 9, 2026
51c97f3
fix: validate scheduled snapshot name prefix length
sjmiller609 Mar 9, 2026
12c7c55
refactor: move scheduled snapshot domain logic to lib package
sjmiller609 Mar 9, 2026
b8cca6c
fix: ignore missing snapshots during retention cleanup
sjmiller609 Mar 9, 2026
d172b0a
chore: remove dead wrapper and align retention schema
sjmiller609 Mar 9, 2026
e8a4a3d
fix: tighten schedule run due-check under write lock
sjmiller609 Mar 9, 2026
666e14c
feat: auto-select scheduled snapshot kind and retain post-delete sche…
sjmiller609 Mar 9, 2026
84bd548
fix: return 404 when scheduling a missing instance
sjmiller609 Mar 9, 2026
0430e6c
fix: retire converged deleted-instance schedules
sjmiller609 Mar 9, 2026
77e55a1
chore: remove test-only snapshot schedule aliases
sjmiller609 Mar 9, 2026
6aa101f
fix: preserve schedule runtime history on updates
sjmiller609 Mar 9, 2026
c346a1f
fix: tighten schedule retention request handling
sjmiller609 Mar 9, 2026
84ef7c8
fix: honor cancellation in schedule runner loop
sjmiller609 Mar 9, 2026
1108cd8
fix: preserve explicit zero retention settings
sjmiller609 Mar 9, 2026
b9013e7
Merge origin/main into codex/snapshot-schedule-retention
sjmiller609 Mar 10, 2026
6db9b1c
fix: evaluate schedule timing per-instance
sjmiller609 Mar 10, 2026
c94c292
fix: make schedule interval step arithmetic explicit
sjmiller609 Mar 10, 2026
1104580
Merge origin/main into codex/snapshot-schedule-retention
sjmiller609 Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions cmd/api/api/snapshots.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package api
import (
"context"
"errors"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances"
Expand Down Expand Up @@ -193,6 +194,124 @@ func (s *ApiService) ForkSnapshot(ctx context.Context, request oapi.ForkSnapshot
return oapi.ForkSnapshot201JSONResponse(instanceToOAPI(*result)), nil
}

// GetInstanceSnapshotSchedule gets a snapshot schedule for the resolved instance.
func (s *ApiService) GetInstanceSnapshotSchedule(ctx context.Context, request oapi.GetInstanceSnapshotScheduleRequestObject) (oapi.GetInstanceSnapshotScheduleResponseObject, error) {
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
}

scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
if !ok {
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
}

schedule, err := scheduleManager.GetSnapshotSchedule(ctx, inst.Id)
if err != nil {
log := logger.FromContext(ctx)
switch {
case errors.Is(err, instances.ErrSnapshotScheduleNotFound):
return oapi.GetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "snapshot schedule not found"}, nil
case errors.Is(err, instances.ErrNotFound):
return oapi.GetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
default:
log.ErrorContext(ctx, "failed to get snapshot schedule", "error", err)
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to get snapshot schedule"}, nil
}
}

return oapi.GetInstanceSnapshotSchedule200JSONResponse(snapshotScheduleToOAPI(*schedule)), nil
}

// SetInstanceSnapshotSchedule creates or updates a snapshot schedule for the resolved instance.
func (s *ApiService) SetInstanceSnapshotSchedule(ctx context.Context, request oapi.SetInstanceSnapshotScheduleRequestObject) (oapi.SetInstanceSnapshotScheduleResponseObject, error) {
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
}
if request.Body == nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "request body is required"}, nil
}

scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
if !ok {
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
}

interval, err := time.ParseDuration(request.Body.Interval)
if err != nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "interval must be a valid duration"}, nil
}
if request.Body.Retention.MaxCount == nil && request.Body.Retention.MaxAge == nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "retention must include max_count or max_age"}, nil
}

retention := instances.SnapshotScheduleRetention{}
if request.Body.Retention.MaxCount != nil {
retention.MaxCount = *request.Body.Retention.MaxCount
}
if request.Body.Retention.MaxAge != nil {
maxAge, parseErr := time.ParseDuration(*request.Body.Retention.MaxAge)
if parseErr != nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "retention.max_age must be a valid duration"}, nil
}
retention.MaxAge = maxAge
}
req := instances.SetSnapshotScheduleRequest{
Interval: interval,
Metadata: toMapTags(request.Body.Metadata),
Retention: retention,
}
if request.Body.NamePrefix != nil {
req.NamePrefix = *request.Body.NamePrefix
}

schedule, err := scheduleManager.SetSnapshotSchedule(ctx, inst.Id, req)
if err != nil {
log := logger.FromContext(ctx)
switch {
case errors.Is(err, instances.ErrInvalidRequest):
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: err.Error()}, nil
case errors.Is(err, instances.ErrNotFound):
return oapi.SetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
default:
log.ErrorContext(ctx, "failed to set snapshot schedule", "error", err)
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to set snapshot schedule"}, nil
}
}

return oapi.SetInstanceSnapshotSchedule200JSONResponse(snapshotScheduleToOAPI(*schedule)), nil
}

// DeleteInstanceSnapshotSchedule deletes a snapshot schedule for the resolved instance.
func (s *ApiService) DeleteInstanceSnapshotSchedule(ctx context.Context, request oapi.DeleteInstanceSnapshotScheduleRequestObject) (oapi.DeleteInstanceSnapshotScheduleResponseObject, error) {
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
}

scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
if !ok {
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
}

err := scheduleManager.DeleteSnapshotSchedule(ctx, inst.Id)
if err != nil {
log := logger.FromContext(ctx)
switch {
case errors.Is(err, instances.ErrSnapshotScheduleNotFound):
return oapi.DeleteInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "snapshot schedule not found"}, nil
case errors.Is(err, instances.ErrNotFound):
return oapi.DeleteInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
default:
log.ErrorContext(ctx, "failed to delete snapshot schedule", "error", err)
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to delete snapshot schedule"}, nil
}
}

return oapi.DeleteInstanceSnapshotSchedule204Response{}, nil
}

func snapshotToOAPI(snapshot instances.Snapshot) oapi.Snapshot {
kind := oapi.SnapshotKind(snapshot.Kind)
sourceHypervisor := oapi.SnapshotSourceHypervisor(snapshot.SourceHypervisor)
Expand All @@ -212,3 +331,36 @@ func snapshotToOAPI(snapshot instances.Snapshot) oapi.Snapshot {
}
return out
}

func snapshotScheduleToOAPI(schedule instances.SnapshotSchedule) oapi.SnapshotSchedule {
retention := oapi.SnapshotScheduleRetention{
MaxCount: lo.ToPtr(schedule.Retention.MaxCount),
}
if schedule.Retention.MaxAge > 0 {
maxAge := schedule.Retention.MaxAge.String()
retention.MaxAge = &maxAge
}

out := oapi.SnapshotSchedule{
InstanceId: schedule.InstanceID,
Interval: schedule.Interval.String(),
Metadata: toOAPITags(schedule.Metadata),
Retention: retention,
NextRunAt: schedule.NextRunAt,
CreatedAt: schedule.CreatedAt,
UpdatedAt: schedule.UpdatedAt,
}
if schedule.NamePrefix != "" {
out.NamePrefix = lo.ToPtr(schedule.NamePrefix)
}
if schedule.LastRunAt != nil {
out.LastRunAt = schedule.LastRunAt
}
if schedule.LastSnapshotID != nil {
out.LastSnapshotId = schedule.LastSnapshotID
}
if schedule.LastError != nil {
out.LastError = schedule.LastError
}
return out
}
32 changes: 32 additions & 0 deletions cmd/api/api/snapshots_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package api

import (
"testing"
"time"

"github.com/kernel/hypeman/lib/instances"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
t.Parallel()

schedule := instances.SnapshotSchedule{
InstanceID: "inst-1",
Interval: time.Hour,
Retention: instances.SnapshotScheduleRetention{
MaxCount: 0,
MaxAge: 24 * time.Hour,
},
NextRunAt: time.Now().UTC().Add(time.Hour),
CreatedAt: time.Now().UTC(),
UpdatedAt: time.Now().UTC(),
}

out := snapshotScheduleToOAPI(schedule)
require.NotNil(t, out.Retention.MaxCount)
assert.Equal(t, 0, *out.Retention.MaxCount)
require.NotNil(t, out.Retention.MaxAge)
assert.Equal(t, "24h0m0s", *out.Retention.MaxAge)
}
23 changes: 23 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,29 @@ func run() error {
}
})

// Snapshot schedule scheduler
if scheduleManager, ok := app.InstanceManager.(instances.SnapshotScheduleManager); ok {
const snapshotSchedulePollInterval = time.Minute
grp.Go(func() error {
ticker := time.NewTicker(snapshotSchedulePollInterval)
defer ticker.Stop()

logger.Info("snapshot schedule scheduler started", "interval", snapshotSchedulePollInterval)
for {
select {
case <-gctx.Done():
return nil
case <-ticker.C:
if err := scheduleManager.RunSnapshotSchedules(gctx); err != nil {
logger.Error("snapshot schedule run completed with errors", "error", err)
}
}
}
})
} else {
logger.Warn("snapshot schedule manager unavailable; scheduled snapshots disabled")
}

err = grp.Wait()
slog.Info("all goroutines finished")
return err
Expand Down
19 changes: 19 additions & 0 deletions lib/instances/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,25 @@ Any State → Stopped
- Don't prefault pages (lazy loading)
- Parallel with TAP device setup

## Scheduled Snapshot Behavior

- Schedules are configured per instance and persisted in the server data store (outside snapshot payloads).
- A background scheduler evaluates due schedules every minute.
- Each due run chooses snapshot behavior from current source state:
- `Running`/`Standby` sources use `Standby` snapshots.
- `Stopped` sources use `Stopped` snapshots.
- `Standby` runs from `Running` sources perform a brief pause/resume cycle during capture.
- Scheduled snapshot `name_prefix` is optional and capped at 47 chars so generated names stay within the 63-char snapshot name limit.
- Schedule runs advance to the next future interval (no backfill flood after downtime).
- Each schedule stores operational status:
- `next_run_at`
- `last_run_at`
- `last_snapshot_id`
- `last_error`
- Retention cleanup runs after successful scheduled snapshot creation and only affects scheduled snapshots for that instance.
- If an instance is deleted, its schedule is retained so retention can continue cleaning existing scheduled snapshots.
- Once the deleted instance has no scheduled snapshots left, the scheduler removes that schedule automatically.

## Reference Handling

Instances use OCI image references directly:
Expand Down
3 changes: 3 additions & 0 deletions lib/instances/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ var (

// ErrSnapshotNotFound is returned when a snapshot is not found.
ErrSnapshotNotFound = errors.New("snapshot not found")

// ErrSnapshotScheduleNotFound is returned when a snapshot schedule is not found.
ErrSnapshotScheduleNotFound = errors.New("snapshot schedule not found")
)
Loading