Skip to content

Commit 7632137

Browse files
authored
Update commitments with support for non-standard units (#624)
1 parent c8d2bac commit 7632137

16 files changed

Lines changed: 578 additions & 42 deletions

File tree

Tiltfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,13 @@ if 'nova' in ACTIVE_DEPLOYMENTS:
189189
trigger_mode=TRIGGER_MODE_MANUAL,
190190
auto_init=False,
191191
)
192+
local_resource(
193+
'Commitments E2E Tests',
194+
'/bin/sh -c "kubectl exec deploy/cortex-nova-scheduling-controller-manager -- /manager e2e-commitments"',
195+
labels=['Cortex-Nova'],
196+
trigger_mode=TRIGGER_MODE_MANUAL,
197+
auto_init=False,
198+
)
192199

193200
if 'manila' in ACTIVE_DEPLOYMENTS:
194201
print("Activating Cortex Manila bundle")

cmd/main.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ func main() {
110110
manilaChecksConfig := conf.GetConfigOrDie[manila.ChecksConfig]()
111111
manila.RunChecks(ctx, client, manilaChecksConfig)
112112
return
113+
case "e2e-commitments":
114+
commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]()
115+
commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
116+
return
113117
}
114118
}
115119

@@ -665,7 +669,9 @@ func main() {
665669

666670
if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") {
667671
setupLog.Info("starting commitments syncer")
668-
syncer := commitments.NewSyncer(multiclusterClient)
672+
syncerMonitor := commitments.NewSyncerMonitor()
673+
must.Succeed(metrics.Registry.Register(syncerMonitor))
674+
syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor)
669675
syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]()
670676
syncerDefaults := commitments.DefaultSyncerConfig()
671677
if syncerConfig.SyncInterval == 0 {

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ require (
7272
github.com/poy/onpar v0.3.5 // indirect
7373
github.com/prometheus/common v0.67.5 // indirect
7474
github.com/prometheus/procfs v0.17.0 // indirect
75-
github.com/sapcc/go-api-declarations v1.20.2
75+
github.com/sapcc/go-api-declarations v1.21.0
7676
github.com/sirupsen/logrus v1.9.3 // indirect
7777
github.com/spf13/cobra v1.10.1 // indirect
7878
github.com/spf13/pflag v1.0.10 // indirect
@@ -98,7 +98,7 @@ require (
9898
golang.org/x/oauth2 v0.34.0 // indirect
9999
golang.org/x/sync v0.19.0 // indirect
100100
golang.org/x/sys v0.42.0 // indirect
101-
golang.org/x/term v0.41.0 // indirect
101+
golang.org/x/term v0.41.0
102102
golang.org/x/text v0.33.0 // indirect
103103
golang.org/x/time v0.14.0 // indirect
104104
gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect
@@ -108,7 +108,7 @@ require (
108108
google.golang.org/protobuf v1.36.11 // indirect
109109
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
110110
gopkg.in/inf.v0 v0.9.1 // indirect
111-
gopkg.in/yaml.v3 v3.0.1 // indirect
111+
gopkg.in/yaml.v3 v3.0.1
112112
gotest.tools v2.2.0+incompatible // indirect
113113
k8s.io/apiextensions-apiserver v0.35.0 // indirect
114114
k8s.io/apiserver v0.35.0 // indirect

go.sum

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,8 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO
176176
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
177177
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
178178
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
179-
github.com/sapcc/go-api-declarations v1.20.2 h1:GWqv8VgsF4k9id6N051AVTaEpcjT02APsOuz2yCvTPQ=
180-
github.com/sapcc/go-api-declarations v1.20.2/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc=
179+
github.com/sapcc/go-api-declarations v1.21.0 h1:Ag6GXgJLTFdBDKmrJU4QFllQbgGSenSGeHpLuvuxeDk=
180+
github.com/sapcc/go-api-declarations v1.21.0/go.mod h1:eiRrXXUeQS5C/1kKn8/KMjk0Y0goUzgDQswj30rH0Zc=
181181
github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e h1:4wgkrfAlnL6ffM7HTNoHn1HrBBurCRR71WNOszdiDNQ=
182182
github.com/sapcc/go-bits v0.0.0-20260312170110-034b497ebb7e/go.mod h1:NZjMiGVm04U25vwR6ZWvMw0XOOnvS1jkmXpjiepOeUw=
183183
github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
@@ -251,12 +251,8 @@ golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
251251
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
252252
golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
253253
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
254-
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
255-
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
256254
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
257255
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
258-
golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
259-
golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
260256
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
261257
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
262258
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=

helm/bundles/cortex-nova/alerts/nova.alerts.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,25 @@ groups:
276276
configuration. It is recommended to investigate the
277277
pipeline status and logs for more details.
278278
279-
# Committed Resource (Limes Integration) Alerts
279+
# Committed Resource Info API Alerts
280+
- alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh
281+
expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
282+
for: 5m
283+
labels:
284+
context: committed-resource-api
285+
dashboard: cortex/cortex
286+
service: cortex
287+
severity: warning
288+
support_group: workload-management
289+
annotations:
290+
summary: "Committed Resource info API HTTP 500 errors too high"
291+
description: >
292+
The committed resource info API (Limes LIQUID integration) is responding
293+
with HTTP 5xx errors. This indicates internal problems building service info,
294+
such as invalid flavor group data. Limes will not be able to discover available
295+
resources until the issue is resolved.
296+
297+
# Committed Resource Change API Alerts
280298
- alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
281299
expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
282300
for: 5m

internal/scheduling/reservations/commitments/api.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ type HTTPAPI struct {
2828
monitor ChangeCommitmentsAPIMonitor
2929
usageMonitor ReportUsageAPIMonitor
3030
capacityMonitor ReportCapacityAPIMonitor
31+
infoMonitor InfoAPIMonitor
3132
// Mutex to serialize change-commitments requests
3233
changeMutex sync.Mutex
3334
}
@@ -44,13 +45,15 @@ func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaC
4445
monitor: NewChangeCommitmentsAPIMonitor(),
4546
usageMonitor: NewReportUsageAPIMonitor(),
4647
capacityMonitor: NewReportCapacityAPIMonitor(),
48+
infoMonitor: NewInfoAPIMonitor(),
4749
}
4850
}
4951

5052
func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log logr.Logger) {
5153
registry.MustRegister(&api.monitor)
5254
registry.MustRegister(&api.usageMonitor)
5355
registry.MustRegister(&api.capacityMonitor)
56+
registry.MustRegister(&api.infoMonitor)
5457
mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
5558
mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity)
5659
mux.HandleFunc("/v1/commitments/info", api.HandleInfo)

internal/scheduling/reservations/commitments/api_info.go

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,29 @@ package commitments
66
import (
77
"context"
88
"encoding/json"
9+
"errors"
910
"fmt"
1011
"net/http"
12+
"strconv"
1113
"strings"
14+
"time"
1215

1316
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
1417
"github.com/go-logr/logr"
1518
"github.com/google/uuid"
1619
liquid "github.com/sapcc/go-api-declarations/liquid"
1720
)
1821

22+
// errInternalServiceInfo indicates an internal error while building service info (e.g., invalid unit configuration)
23+
var errInternalServiceInfo = errors.New("internal error building service info")
24+
1925
// handles GET /v1/info requests from Limes:
2026
// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
2127
// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
2228
func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
29+
startTime := time.Now()
30+
statusCode := http.StatusOK
31+
2332
// Extract or generate request ID for tracing
2433
requestID := r.Header.Get("X-Request-ID")
2534
if requestID == "" {
@@ -33,7 +42,9 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
3342

3443
// Only accept GET method
3544
if r.Method != http.MethodGet {
36-
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
45+
statusCode = http.StatusMethodNotAllowed
46+
http.Error(w, "Method not allowed", statusCode)
47+
api.recordInfoMetrics(statusCode, startTime)
3748
return
3849
}
3950

@@ -42,20 +53,35 @@ func (api *HTTPAPI) HandleInfo(w http.ResponseWriter, r *http.Request) {
4253
// Build info response
4354
info, err := api.buildServiceInfo(ctx, logger)
4455
if err != nil {
45-
// Use Info level for expected conditions like knowledge not being ready yet
46-
logger.Info("service info not available yet", "error", err.Error())
47-
http.Error(w, "Service temporarily unavailable: "+err.Error(),
48-
http.StatusServiceUnavailable)
56+
if errors.Is(err, errInternalServiceInfo) {
57+
logger.Error(err, "internal error building service info")
58+
statusCode = http.StatusInternalServerError
59+
http.Error(w, "Internal server error: "+err.Error(), statusCode)
60+
} else {
61+
// Use Info level for expected conditions like knowledge not being ready yet
62+
logger.Info("service info not available yet", "error", err.Error())
63+
statusCode = http.StatusServiceUnavailable
64+
http.Error(w, "Service temporarily unavailable: "+err.Error(), statusCode)
65+
}
66+
api.recordInfoMetrics(statusCode, startTime)
4967
return
5068
}
5169

5270
// Return response
5371
w.Header().Set("Content-Type", "application/json")
54-
w.WriteHeader(http.StatusOK)
72+
w.WriteHeader(statusCode)
5573
if err := json.NewEncoder(w).Encode(info); err != nil {
5674
logger.Error(err, "failed to encode service info")
57-
return
5875
}
76+
api.recordInfoMetrics(statusCode, startTime)
77+
}
78+
79+
// recordInfoMetrics records Prometheus metrics for an info API request.
80+
func (api *HTTPAPI) recordInfoMetrics(statusCode int, startTime time.Time) {
81+
duration := time.Since(startTime).Seconds()
82+
statusCodeStr := strconv.Itoa(statusCode)
83+
api.infoMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
84+
api.infoMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
5985
}
6086

6187
// resourceAttributes holds the custom attributes for a resource in the info API response.
@@ -108,9 +134,22 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l
108134
attrsJSON = nil
109135
}
110136

137+
// Build unit from smallest flavor memory (e.g., "131072 MiB" for 128 GiB)
138+
// Validate memory is positive to avoid panic in MultiplyBy (which panics on factor=0)
139+
if groupData.SmallestFlavor.MemoryMB == 0 {
140+
return liquid.ServiceInfo{}, fmt.Errorf("%w: flavor group %q has invalid smallest flavor with memoryMB=0",
141+
errInternalServiceInfo, groupName)
142+
}
143+
unit, err := liquid.UnitMebibytes.MultiplyBy(groupData.SmallestFlavor.MemoryMB)
144+
if err != nil {
145+
// Note: This error only occurs on uint64 overflow, which is unrealistic for memory values
146+
return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w",
147+
errInternalServiceInfo, groupName, err)
148+
}
149+
111150
resources[resourceName] = liquid.ResourceInfo{
112151
DisplayName: displayName,
113-
Unit: liquid.UnitNone, // Countable: multiples of smallest flavor instances
152+
Unit: unit, // Non-standard unit: multiples of smallest flavor RAM
114153
Topology: liquid.AZAwareTopology, // Commitments are per-AZ
115154
NeedsResourceDemand: false, // Capacity planning out of scope for now
116155
HasCapacity: handlesCommitments, // We report capacity via /v1/report-capacity only for groups that accept commitments
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright SAP SE
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package commitments
5+
6+
import (
7+
"github.com/prometheus/client_golang/prometheus"
8+
)
9+
10+
// InfoAPIMonitor provides metrics for the CR info API.
11+
type InfoAPIMonitor struct {
12+
requestCounter *prometheus.CounterVec
13+
requestDuration *prometheus.HistogramVec
14+
}
15+
16+
// NewInfoAPIMonitor creates a new monitor with Prometheus metrics.
17+
// Metrics are pre-initialized with zero values for common HTTP status codes
18+
// to ensure they appear in Prometheus before the first request.
19+
func NewInfoAPIMonitor() InfoAPIMonitor {
20+
m := InfoAPIMonitor{
21+
requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
22+
Name: "cortex_committed_resource_info_api_requests_total",
23+
Help: "Total number of committed resource info API requests by HTTP status code",
24+
}, []string{"status_code"}),
25+
requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
26+
Name: "cortex_committed_resource_info_api_request_duration_seconds",
27+
Help: "Duration of committed resource info API requests in seconds by HTTP status code",
28+
Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
29+
}, []string{"status_code"}),
30+
}
31+
32+
// Pre-initialize metrics with zero values for common HTTP status codes.
33+
// This ensures metrics exist in Prometheus before the first request,
34+
// preventing "metric missing" warnings in alerting rules.
35+
for _, statusCode := range []string{"200", "405", "500", "503"} {
36+
m.requestCounter.WithLabelValues(statusCode)
37+
m.requestDuration.WithLabelValues(statusCode)
38+
}
39+
40+
return m
41+
}
42+
43+
// Describe implements prometheus.Collector.
44+
func (m *InfoAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
45+
m.requestCounter.Describe(ch)
46+
m.requestDuration.Describe(ch)
47+
}
48+
49+
// Collect implements prometheus.Collector.
50+
func (m *InfoAPIMonitor) Collect(ch chan<- prometheus.Metric) {
51+
m.requestCounter.Collect(ch)
52+
m.requestDuration.Collect(ch)
53+
}

internal/scheduling/reservations/commitments/api_info_test.go

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ func TestHandleInfo_KnowledgeNotReady(t *testing.T) {
2828
WithScheme(scheme).
2929
Build()
3030

31-
api := &HTTPAPI{
32-
client: k8sClient,
33-
}
31+
api := NewAPI(k8sClient)
3432

3533
req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody)
3634
w := httptest.NewRecorder()
@@ -62,9 +60,7 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) {
6260
WithScheme(scheme).
6361
Build()
6462

65-
api := &HTTPAPI{
66-
client: k8sClient,
67-
}
63+
api := NewAPI(k8sClient)
6864

6965
// Use POST instead of GET
7066
req := httptest.NewRequest(http.MethodPost, "/v1/info", http.NoBody)
@@ -80,6 +76,67 @@ func TestHandleInfo_MethodNotAllowed(t *testing.T) {
8076
}
8177
}
8278

79+
func TestHandleInfo_InvalidFlavorMemory(t *testing.T) {
80+
// Test that a 500 Internal Server Error is returned when a flavor group has invalid data.
81+
//
82+
// A flavor with memoryMB=0 is invalid and should trigger an HTTP 500 error.
83+
// Such data could occur from a bug in the flavor groups extractor.
84+
scheme := runtime.NewScheme()
85+
if err := v1alpha1.AddToScheme(scheme); err != nil {
86+
t.Fatalf("failed to add scheme: %v", err)
87+
}
88+
89+
// Create flavor group with memoryMB=0 (invalid data that could come from a buggy extractor)
90+
features := []map[string]interface{}{
91+
{
92+
"name": "invalid_group",
93+
"flavors": []map[string]interface{}{
94+
{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50},
95+
},
96+
"largestFlavor": map[string]interface{}{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50},
97+
"smallestFlavor": map[string]interface{}{"name": "zero_memory_flavor", "vcpus": 4, "memoryMB": 0, "diskGB": 50},
98+
"ramCoreRatio": 4096,
99+
},
100+
}
101+
102+
raw, err := v1alpha1.BoxFeatureList(features)
103+
if err != nil {
104+
t.Fatalf("failed to box features: %v", err)
105+
}
106+
107+
knowledge := &v1alpha1.Knowledge{
108+
ObjectMeta: v1.ObjectMeta{Name: "flavor-groups"},
109+
Spec: v1alpha1.KnowledgeSpec{
110+
SchedulingDomain: v1alpha1.SchedulingDomainNova,
111+
Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"},
112+
},
113+
Status: v1alpha1.KnowledgeStatus{
114+
Conditions: []v1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: "True"}},
115+
Raw: raw,
116+
LastContentChange: v1.Now(),
117+
},
118+
}
119+
120+
k8sClient := fake.NewClientBuilder().
121+
WithScheme(scheme).
122+
WithObjects(knowledge).
123+
Build()
124+
125+
api := NewAPI(k8sClient)
126+
127+
req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody)
128+
w := httptest.NewRecorder()
129+
api.HandleInfo(w, req)
130+
131+
resp := w.Result()
132+
defer resp.Body.Close()
133+
134+
// Should return 500 Internal Server Error when unit creation fails
135+
if resp.StatusCode != http.StatusInternalServerError {
136+
t.Errorf("expected status code %d (Internal Server Error), got %d", http.StatusInternalServerError, resp.StatusCode)
137+
}
138+
}
139+
83140
func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) {
84141
// Test that HasCapacity == HandlesCommitments for all resources
85142
// Both should be true only for groups with fixed RAM/core ratio
@@ -138,7 +195,7 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) {
138195
WithObjects(knowledge).
139196
Build()
140197

141-
api := &HTTPAPI{client: k8sClient}
198+
api := NewAPI(k8sClient)
142199

143200
req := httptest.NewRequest(http.MethodGet, "/v1/info", http.NoBody)
144201
w := httptest.NewRecorder()

0 commit comments

Comments
 (0)