diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go index 638df91da..8200ead36 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm.go @@ -9,9 +9,12 @@ import ( "strconv" "strings" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" "github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins" "github.com/cobaltcore-dev/cortex/pkg/conf" @@ -29,14 +32,17 @@ func getBuildingBlock(hostName string) string { return "unknown" } +// hostReservationResources holds aggregated CPU and memory reservation quantities for a single hypervisor. +type hostReservationResources struct { + cpu resource.Quantity + memory resource.Quantity +} + type KVMResourceCapacityKPI struct { // Common base for all KPIs that provides standard functionality. plugins.BaseKPI[struct{}] // No options passed through yaml config - utilizedCapacityPerHost *prometheus.Desc - paygCapacityPerHost *prometheus.Desc - failoverCapacityPerHost *prometheus.Desc - reservedCapacityPerHost *prometheus.Desc totalCapacityPerHost *prometheus.Desc + capacityPerHost *prometheus.Desc } func (KVMResourceCapacityKPI) GetName() string { @@ -47,60 +53,9 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf if err := k.BaseKPI.Init(db, client, opts); err != nil { return err } - k.utilizedCapacityPerHost = prometheus.NewDesc( - "cortex_kvm_host_capacity_utilized", - "Utilized resources on the KVM hosts (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "building_block", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "maintenance", - }, - nil, - ) - k.paygCapacityPerHost = prometheus.NewDesc( - "cortex_kvm_host_capacity_payg", - "PAYG resources available on the KVM hosts (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "building_block", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "maintenance", - }, - nil, - ) - k.reservedCapacityPerHost = prometheus.NewDesc( - "cortex_kvm_host_capacity_reserved", - "Reserved resources on the KVM hosts (individually by host).", - []string{ - "compute_host", - "resource", - "availability_zone", - "building_block", - "cpu_architecture", - "workload_type", - "enabled", - "decommissioned", - "external_customer", - "maintenance", - }, - nil, - ) - k.failoverCapacityPerHost = prometheus.NewDesc( - "cortex_kvm_host_capacity_failover", - "Failover resources on the KVM hosts (individually by host).", + k.totalCapacityPerHost = prometheus.NewDesc( + "cortex_kvm_host_capacity_total", + "Total resource capacity on the KVM hosts (individually by host).", []string{ "compute_host", "resource", @@ -115,12 +70,13 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf }, nil, ) - k.totalCapacityPerHost = prometheus.NewDesc( - "cortex_kvm_host_capacity_total", - "Total resources on the KVM hosts (individually by host).", + k.capacityPerHost = prometheus.NewDesc( + "cortex_kvm_host_capacity_usage", + "Resource capacity usage on the KVM hosts (individually by host).", []string{ "compute_host", "resource", + "type", "availability_zone", "building_block", "cpu_architecture", @@ -136,23 +92,96 @@ func (k *KVMResourceCapacityKPI) Init(db *db.DB, client client.Client, opts conf } func (k *KVMResourceCapacityKPI) Describe(ch chan<- *prometheus.Desc) { - ch <- k.utilizedCapacityPerHost - ch <- k.paygCapacityPerHost - ch <- k.reservedCapacityPerHost - ch <- k.failoverCapacityPerHost ch <- k.totalCapacityPerHost + ch <- k.capacityPerHost +} + +// aggregateReservationsByHost groups Ready reservations by host, returning per-host +// failover totals and committed-resource "not yet in use" totals. +func aggregateReservationsByHost(reservations []v1alpha1.Reservation) ( + failoverByHost map[string]hostReservationResources, + committedNotInUseByHost map[string]hostReservationResources, +) { + + failoverByHost = make(map[string]hostReservationResources) + committedNotInUseByHost = make(map[string]hostReservationResources) + + for _, reservation := range reservations { + if reservation.Spec.SchedulingDomain != v1alpha1.SchedulingDomainNova { + continue + } + + readyCondition := meta.FindStatusCondition(reservation.Status.Conditions, v1alpha1.ReservationConditionReady) + if readyCondition == nil || readyCondition.Status != metav1.ConditionTrue { + continue + } + + host := reservation.Status.Host + if host == "" { + continue + } + + switch reservation.Spec.Type { + case v1alpha1.ReservationTypeFailover: + entry := failoverByHost[host] + cpuQty := reservation.Spec.Resources[hv1.ResourceCPU] + entry.cpu.Add(cpuQty) + memQty := reservation.Spec.Resources[hv1.ResourceMemory] + entry.memory.Add(memQty) + failoverByHost[host] = entry + + case v1alpha1.ReservationTypeCommittedResource: + // Total reserved resources for this reservation. + cpuTotal := reservation.Spec.Resources[hv1.ResourceCPU] + memTotal := reservation.Spec.Resources[hv1.ResourceMemory] + + // Sum allocated resources across all workloads. + var cpuAllocated, memAllocated resource.Quantity + if reservation.Spec.CommittedResourceReservation != nil { + for _, alloc := range reservation.Spec.CommittedResourceReservation.Allocations { + cpuAllocated.Add(alloc.Resources[hv1.ResourceCPU]) + memAllocated.Add(alloc.Resources[hv1.ResourceMemory]) + } + } + + // Not yet in use = total - allocated, clamped to zero. + cpuNotInUse := cpuTotal.DeepCopy() + cpuNotInUse.Sub(cpuAllocated) + if cpuNotInUse.Cmp(resource.MustParse("0")) < 0 { + cpuNotInUse = resource.MustParse("0") + } + + memNotInUse := memTotal.DeepCopy() + memNotInUse.Sub(memAllocated) + if memNotInUse.Cmp(resource.MustParse("0")) < 0 { + memNotInUse = resource.MustParse("0") + } + + entry := committedNotInUseByHost[host] + entry.cpu.Add(cpuNotInUse) + entry.memory.Add(memNotInUse) + committedNotInUseByHost[host] = entry + } + } + + return failoverByHost, committedNotInUseByHost } func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { - // The hypervisor resource auto-discovers its current utilization. - // We can use the hypervisor status to calculate the total capacity - // and then subtract the actual resource allocation from virtual machines. hvs := &hv1.HypervisorList{} if err := k.Client.List(context.Background(), hvs); err != nil { slog.Error("failed to list hypervisors", "error", err) return } + reservations := &v1alpha1.ReservationList{} + if err := k.Client.List(context.Background(), reservations); err != nil { + slog.Error("failed to list reservations", "error", err) + return + } + + failoverByHost, committedNotInUseByHost := aggregateReservationsByHost(reservations.Items) + for _, hypervisor := range hvs.Items { if hypervisor.Status.EffectiveCapacity == nil { slog.Warn("hypervisor with nil effective capacity, skipping", "host", hypervisor.Name) @@ -182,27 +211,28 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { ramUsed = resource.MustParse("0") } - exportCapacityMetricKVM(ch, k.totalCapacityPerHost, "cpu", cpuTotal.AsApproximateFloat64(), hypervisor) - exportCapacityMetricKVM(ch, k.totalCapacityPerHost, "ram", ramTotal.AsApproximateFloat64(), hypervisor) + // Get reservation data for this hypervisor (zero-value if absent). + failoverRes := failoverByHost[hypervisor.Name] + committedRes := committedNotInUseByHost[hypervisor.Name] + + cpuReserved := committedRes.cpu + ramReserved := committedRes.memory + cpuFailover := failoverRes.cpu + ramFailover := failoverRes.memory - exportCapacityMetricKVM(ch, k.utilizedCapacityPerHost, "cpu", cpuUsed.AsApproximateFloat64(), hypervisor) - exportCapacityMetricKVM(ch, k.utilizedCapacityPerHost, "ram", ramUsed.AsApproximateFloat64(), hypervisor) + labels := hostLabelsFromHypervisor(hypervisor) - // WARNING: Using dummy data for now. - // TODO Replace with actual data from reservations capacity CRDs - cpuReserved := resource.MustParse("100") - ramReserved := resource.MustParse("1Gi") + k.emitTotal(ch, "cpu", cpuTotal.AsApproximateFloat64(), labels) + k.emitTotal(ch, "ram", ramTotal.AsApproximateFloat64(), labels) - exportCapacityMetricKVM(ch, k.reservedCapacityPerHost, "cpu", cpuReserved.AsApproximateFloat64(), hypervisor) - exportCapacityMetricKVM(ch, k.reservedCapacityPerHost, "ram", ramReserved.AsApproximateFloat64(), hypervisor) + k.emitUsage(ch, "cpu", cpuUsed.AsApproximateFloat64(), "utilized", labels) + k.emitUsage(ch, "ram", ramUsed.AsApproximateFloat64(), "utilized", labels) - // WARNING: Using dummy data for now. - // TODO Replace with actual data from failover capacity CRDs - cpuFailover := resource.MustParse("100") - ramFailover := resource.MustParse("1Gi") + k.emitUsage(ch, "cpu", cpuReserved.AsApproximateFloat64(), "reserved", labels) + k.emitUsage(ch, "ram", ramReserved.AsApproximateFloat64(), "reserved", labels) - exportCapacityMetricKVM(ch, k.failoverCapacityPerHost, "cpu", cpuFailover.AsApproximateFloat64(), hypervisor) - exportCapacityMetricKVM(ch, k.failoverCapacityPerHost, "ram", ramFailover.AsApproximateFloat64(), hypervisor) + k.emitUsage(ch, "cpu", cpuFailover.AsApproximateFloat64(), "failover", labels) + k.emitUsage(ch, "ram", ramFailover.AsApproximateFloat64(), "failover", labels) // Calculate PAYG capacity paygCPU := cpuTotal.DeepCopy() @@ -215,21 +245,27 @@ func (k *KVMResourceCapacityKPI) Collect(ch chan<- prometheus.Metric) { paygRAM.Sub(ramReserved) paygRAM.Sub(ramFailover) - exportCapacityMetricKVM(ch, k.paygCapacityPerHost, "cpu", paygCPU.AsApproximateFloat64(), hypervisor) - exportCapacityMetricKVM(ch, k.paygCapacityPerHost, "ram", paygRAM.AsApproximateFloat64(), hypervisor) + k.emitUsage(ch, "cpu", paygCPU.AsApproximateFloat64(), "payg", labels) + k.emitUsage(ch, "ram", paygRAM.AsApproximateFloat64(), "payg", labels) } } -func exportCapacityMetricKVM(ch chan<- prometheus.Metric, metric *prometheus.Desc, resource string, value float64, hypervisor hv1.Hypervisor) { - bb := getBuildingBlock(hypervisor.Name) - - availabilityZone := hypervisor.Labels["topology.kubernetes.io/zone"] +// kvmHostLabels holds precomputed label values derived from a hypervisor. +type kvmHostLabels struct { + computeHost string + availabilityZone string + buildingBlock string + cpuArchitecture string + workloadType string + enabled string + decommissioned string + externalCustomer string + maintenance string +} - enabled := true +func hostLabelsFromHypervisor(hypervisor hv1.Hypervisor) kvmHostLabels { decommissioned := false externalCustomer := false - maintenance := false - workloadType := "general-purpose" cpuArchitecture := "cascade-lake" @@ -246,19 +282,52 @@ func exportCapacityMetricKVM(ch chan<- prometheus.Metric, metric *prometheus.Des } } + return kvmHostLabels{ + computeHost: hypervisor.Name, + availabilityZone: hypervisor.Labels["topology.kubernetes.io/zone"], + buildingBlock: getBuildingBlock(hypervisor.Name), + cpuArchitecture: cpuArchitecture, + workloadType: workloadType, + enabled: strconv.FormatBool(true), + decommissioned: strconv.FormatBool(decommissioned), + externalCustomer: strconv.FormatBool(externalCustomer), + maintenance: strconv.FormatBool(false), + } +} + +func (k *KVMResourceCapacityKPI) emitTotal(ch chan<- prometheus.Metric, resourceName string, value float64, l kvmHostLabels) { + ch <- prometheus.MustNewConstMetric( + k.totalCapacityPerHost, + prometheus.GaugeValue, + value, + l.computeHost, + resourceName, + l.availabilityZone, + l.buildingBlock, + l.cpuArchitecture, + l.workloadType, + l.enabled, + l.decommissioned, + l.externalCustomer, + l.maintenance, + ) +} + +func (k *KVMResourceCapacityKPI) emitUsage(ch chan<- prometheus.Metric, resourceName string, value float64, capacityType string, l kvmHostLabels) { ch <- prometheus.MustNewConstMetric( - metric, + k.capacityPerHost, prometheus.GaugeValue, value, - hypervisor.Name, - resource, - availabilityZone, - bb, - cpuArchitecture, - workloadType, - strconv.FormatBool(enabled), - strconv.FormatBool(decommissioned), - strconv.FormatBool(externalCustomer), - strconv.FormatBool(maintenance), + l.computeHost, + resourceName, + capacityType, + l.availabilityZone, + l.buildingBlock, + l.cpuArchitecture, + l.workloadType, + l.enabled, + l.decommissioned, + l.externalCustomer, + l.maintenance, ) } diff --git a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go index bb2e5f91a..3834c4d36 100644 --- a/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go +++ b/internal/knowledge/kpis/plugins/compute/resource_capacity_kvm_test.go @@ -6,6 +6,7 @@ package compute import ( "testing" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/pkg/conf" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/prometheus/client_golang/prometheus" @@ -23,9 +24,10 @@ func TestKVMResourceCapacityKPI_Init(t *testing.T) { } } -type metricLabels struct { +type kvmMetricLabels struct { ComputeHost string Resource string + Type string AvailabilityZone string BuildingBlock string CPUArchitecture string @@ -36,16 +38,45 @@ type metricLabels struct { Maintenance string } -type expectedMetric struct { - Labels metricLabels +type kvmExpectedMetric struct { + Name string // metric family name (e.g. "cortex_kvm_host_capacity_total") + Labels kvmMetricLabels Value float64 } +func defaultHostLabels(host, az, bb string) kvmMetricLabels { + return kvmMetricLabels{ + ComputeHost: host, + AvailabilityZone: az, + BuildingBlock: bb, + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + } +} + +func totalMetric(host, res, az, bb string, value float64) kvmExpectedMetric { + l := defaultHostLabels(host, az, bb) + l.Resource = res + return kvmExpectedMetric{Name: "cortex_kvm_host_capacity_total", Labels: l, Value: value} +} + +func usageMetric(host, res, capacityType, az, bb string, value float64) kvmExpectedMetric { + l := defaultHostLabels(host, az, bb) + l.Resource = res + l.Type = capacityType + return kvmExpectedMetric{Name: "cortex_kvm_host_capacity_usage", Labels: l, Value: value} +} + func TestKVMResourceCapacityKPI_Collect(t *testing.T) { tests := []struct { name string hypervisors []hv1.Hypervisor - expectedMetrics map[string][]expectedMetric // metric_name -> []expectedMetric + reservations []v1alpha1.Reservation + expectedMetrics []kvmExpectedMetric }{ { name: "single hypervisor with nil effective capacity", @@ -58,7 +89,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, Status: hv1.HypervisorStatus{ - EffectiveCapacity: nil, // Simulate nil effective capacity + EffectiveCapacity: nil, Allocation: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("64"), hv1.ResourceMemory: resource.MustParse("256Gi"), @@ -67,8 +98,7 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, }, - // No metrics should be emitted for this hypervisor since effective capacity is nil - expectedMetrics: map[string][]expectedMetric{}, + expectedMetrics: []kvmExpectedMetric{}, }, { name: "single hypervisor with zero total capacity", @@ -82,8 +112,8 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, Status: hv1.HypervisorStatus{ EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceCPU: resource.MustParse("0"), // Simulate zero CPU capacity - hv1.ResourceMemory: resource.MustParse("0"), // Simulate zero RAM capacity + hv1.ResourceCPU: resource.MustParse("0"), + hv1.ResourceMemory: resource.MustParse("0"), }, Allocation: map[hv1.ResourceName]resource.Quantity{ hv1.ResourceCPU: resource.MustParse("0"), @@ -93,11 +123,10 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, }, - // No metrics should be emitted for this hypervisor since total capacity is zero - expectedMetrics: map[string][]expectedMetric{}, + expectedMetrics: []kvmExpectedMetric{}, }, { - name: "single hypervisor with default traits", + name: "single hypervisor with default traits, no reservations", hypervisors: []hv1.Hypervisor{ { ObjectMeta: v1.ObjectMeta{ @@ -119,71 +148,17 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, }, - expectedMetrics: map[string][]expectedMetric{ - "cortex_kvm_host_capacity_total": { - { - Labels: metricLabels{ - ComputeHost: "node001-bb088", - Resource: "cpu", - AvailabilityZone: "qa-1a", - BuildingBlock: "bb088", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 128, - }, - { - Labels: metricLabels{ - ComputeHost: "node001-bb088", - Resource: "ram", - AvailabilityZone: "qa-1a", - BuildingBlock: "bb088", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 549755813888, // 512Gi in bytes - }, - }, - "cortex_kvm_host_capacity_utilized": { - { - Labels: metricLabels{ - ComputeHost: "node001-bb088", - Resource: "cpu", - AvailabilityZone: "qa-1a", - BuildingBlock: "bb088", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 64, - }, - { - Labels: metricLabels{ - ComputeHost: "node001-bb088", - Resource: "ram", - AvailabilityZone: "qa-1a", - BuildingBlock: "bb088", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 274877906944, // 256Gi in bytes - }, - }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi }, }, { @@ -212,38 +187,174 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, }, - expectedMetrics: map[string][]expectedMetric{ - "cortex_kvm_host_capacity_total": { - { - Labels: metricLabels{ - ComputeHost: "node002-bb089", - Resource: "cpu", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 256, - }, - { - Labels: metricLabels{ - ComputeHost: "node002-bb089", - Resource: "ram", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb089", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "hana", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 1099511627776, // 1Ti in bytes + expectedMetrics: []kvmExpectedMetric{ + { + Name: "cortex_kvm_host_capacity_total", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "cpu", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 256, + }, + { + Name: "cortex_kvm_host_capacity_total", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "ram", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 1099511627776, // 1Ti + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "cpu", + Type: "utilized", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 128, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "ram", + Type: "utilized", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 549755813888, // 512Gi + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "cpu", + Type: "reserved", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "ram", + Type: "reserved", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "cpu", + Type: "failover", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "ram", + Type: "failover", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "cpu", + Type: "payg", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 128, // 256-128-0-0 + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node002-bb089", + Resource: "ram", + Type: "payg", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb089", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "hana", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", }, + Value: 549755813888, // 1Ti-512Gi }, }, }, @@ -273,23 +384,174 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, }, - expectedMetrics: map[string][]expectedMetric{ - "cortex_kvm_host_capacity_total": { - { - Labels: metricLabels{ - ComputeHost: "node003-bb090", - Resource: "cpu", - AvailabilityZone: "qa-1c", - BuildingBlock: "bb090", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "true", - ExternalCustomer: "true", - Maintenance: "false", - }, - Value: 64, + expectedMetrics: []kvmExpectedMetric{ + { + Name: "cortex_kvm_host_capacity_total", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "cpu", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 64, + }, + { + Name: "cortex_kvm_host_capacity_total", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "ram", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 274877906944, // 256Gi + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "cpu", + Type: "utilized", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 32, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "ram", + Type: "utilized", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 137438953472, // 128Gi + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "cpu", + Type: "reserved", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "ram", + Type: "reserved", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "cpu", + Type: "failover", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "ram", + Type: "failover", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "cpu", + Type: "payg", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", + }, + Value: 32, // 64-32-0-0 + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node003-bb090", + Resource: "ram", + Type: "payg", + AvailabilityZone: "qa-1c", + BuildingBlock: "bb090", + CPUArchitecture: "cascade-lake", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "true", + ExternalCustomer: "true", + Maintenance: "false", }, + Value: 137438953472, // 256Gi-128Gi }, }, }, @@ -335,38 +597,184 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { }, }, }, - expectedMetrics: map[string][]expectedMetric{ - "cortex_kvm_host_capacity_total": { - { - Labels: metricLabels{ - ComputeHost: "node010-bb100", - Resource: "cpu", - AvailabilityZone: "qa-1a", - BuildingBlock: "bb100", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 100, - }, - { - Labels: metricLabels{ - ComputeHost: "node020-bb200", - Resource: "cpu", - AvailabilityZone: "qa-1b", - BuildingBlock: "bb200", - CPUArchitecture: "sapphire-rapids", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 200, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node010-bb100", "cpu", "qa-1a", "bb100", 100), + totalMetric("node010-bb100", "ram", "qa-1a", "bb100", 214748364800), // 200Gi + usageMetric("node010-bb100", "cpu", "utilized", "qa-1a", "bb100", 50), + usageMetric("node010-bb100", "ram", "utilized", "qa-1a", "bb100", 107374182400), // 100Gi + usageMetric("node010-bb100", "cpu", "reserved", "qa-1a", "bb100", 0), + usageMetric("node010-bb100", "ram", "reserved", "qa-1a", "bb100", 0), + usageMetric("node010-bb100", "cpu", "failover", "qa-1a", "bb100", 0), + usageMetric("node010-bb100", "ram", "failover", "qa-1a", "bb100", 0), + usageMetric("node010-bb100", "cpu", "payg", "qa-1a", "bb100", 50), // 100-50-0-0 + usageMetric("node010-bb100", "ram", "payg", "qa-1a", "bb100", 107374182400), // 200Gi-100Gi + { + Name: "cortex_kvm_host_capacity_total", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "cpu", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", }, + Value: 200, + }, + { + Name: "cortex_kvm_host_capacity_total", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "ram", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 429496729600, // 400Gi + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "cpu", + Type: "utilized", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 150, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "ram", + Type: "utilized", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 322122547200, // 300Gi + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "cpu", + Type: "reserved", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "ram", + Type: "reserved", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "cpu", + Type: "failover", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "ram", + Type: "failover", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 0, + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "cpu", + Type: "payg", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 50, // 200-150-0-0 + }, + { + Name: "cortex_kvm_host_capacity_usage", + Labels: kvmMetricLabels{ + ComputeHost: "node020-bb200", + Resource: "ram", + Type: "payg", + AvailabilityZone: "qa-1b", + BuildingBlock: "bb200", + CPUArchitecture: "sapphire-rapids", + WorkloadType: "general-purpose", + Enabled: "true", + Decommissioned: "false", + ExternalCustomer: "false", + Maintenance: "false", + }, + Value: 107374182400, // 400Gi-300Gi }, }, }, @@ -385,78 +793,287 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { hv1.ResourceCPU: resource.MustParse("96"), hv1.ResourceMemory: resource.MustParse("384Gi"), }, - // No Allocation field - simulating missing data Allocation: nil, Traits: []string{}, }, }, }, - expectedMetrics: map[string][]expectedMetric{ - "cortex_kvm_host_capacity_total": { - { - Labels: metricLabels{ - ComputeHost: "node004-bb091", - Resource: "cpu", - AvailabilityZone: "qa-1d", - BuildingBlock: "bb091", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 96, - }, - { - Labels: metricLabels{ - ComputeHost: "node004-bb091", - Resource: "ram", - AvailabilityZone: "qa-1d", - BuildingBlock: "bb091", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 412316860416, // 384Gi in bytes - }, - }, - "cortex_kvm_host_capacity_utilized": { - { - Labels: metricLabels{ - ComputeHost: "node004-bb091", - Resource: "cpu", - AvailabilityZone: "qa-1d", - BuildingBlock: "bb091", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, // Should be 0 when allocation is missing - }, - { - Labels: metricLabels{ - ComputeHost: "node004-bb091", - Resource: "ram", - AvailabilityZone: "qa-1d", - BuildingBlock: "bb091", - CPUArchitecture: "cascade-lake", - WorkloadType: "general-purpose", - Enabled: "true", - Decommissioned: "false", - ExternalCustomer: "false", - Maintenance: "false", - }, - Value: 0, // Should be 0 when allocation is missing + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node004-bb091", "cpu", "qa-1d", "bb091", 96), + totalMetric("node004-bb091", "ram", "qa-1d", "bb091", 412316860416), // 384Gi + usageMetric("node004-bb091", "cpu", "utilized", "qa-1d", "bb091", 0), + usageMetric("node004-bb091", "ram", "utilized", "qa-1d", "bb091", 0), + usageMetric("node004-bb091", "cpu", "reserved", "qa-1d", "bb091", 0), + usageMetric("node004-bb091", "ram", "reserved", "qa-1d", "bb091", 0), + usageMetric("node004-bb091", "cpu", "failover", "qa-1d", "bb091", 0), + usageMetric("node004-bb091", "ram", "failover", "qa-1d", "bb091", 0), + usageMetric("node004-bb091", "cpu", "payg", "qa-1d", "bb091", 96), // 96-0-0-0 + usageMetric("node004-bb091", "ram", "payg", "qa-1d", "bb091", 412316860416), // 384Gi-0 + }, + }, + { + name: "failover reservation on a hypervisor", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, }, }, }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "failover-1", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("16"), + hv1.ResourceMemory: resource.MustParse("64Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 16), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 68719476736), // 64Gi + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 48), // 128-64-0-16 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 206158430208), // 512Gi-256Gi-0-64Gi = 192Gi + }, + }, + { + name: "committed resource reservation with partial allocation", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "committed-1", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("32"), + hv1.ResourceMemory: resource.MustParse("128Gi"), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + Allocations: map[string]v1alpha1.CommittedResourceAllocation{ + "vm-uuid-1": { + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("8"), + hv1.ResourceMemory: resource.MustParse("32Gi"), + }, + }, + }, + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + // reserved = 32-8=24 CPU, 128Gi-32Gi=96Gi RAM (not in use) + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 24), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 103079215104), // 96Gi + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 40), // 128-64-24-0 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 171798691840), // 512Gi-256Gi-96Gi-0 = 160Gi + }, + }, + { + name: "non-ready reservation should be ignored", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "failover-not-ready", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("16"), + hv1.ResourceMemory: resource.MustParse("64Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionFalse}, + }, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), + // Non-ready reservation ignored, so failover = 0 + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 64), // 128-64-0-0 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 274877906944), // 512Gi-256Gi + }, + }, + { + name: "multiple failover reservations on same host are summed", + hypervisors: []hv1.Hypervisor{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "node001-bb088", + Labels: map[string]string{ + "topology.kubernetes.io/zone": "qa-1a", + }, + }, + Status: hv1.HypervisorStatus{ + EffectiveCapacity: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("128"), + hv1.ResourceMemory: resource.MustParse("512Gi"), + }, + Allocation: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("64"), + hv1.ResourceMemory: resource.MustParse("256Gi"), + }, + Traits: []string{}, + }, + }, + }, + reservations: []v1alpha1.Reservation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "failover-1", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("8"), + hv1.ResourceMemory: resource.MustParse("32Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + { + ObjectMeta: v1.ObjectMeta{ + Name: "failover-2", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("12"), + hv1.ResourceMemory: resource.MustParse("48Gi"), + }, + FailoverReservation: &v1alpha1.FailoverReservationSpec{}, + }, + Status: v1alpha1.ReservationStatus{ + Host: "node001-bb088", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedMetrics: []kvmExpectedMetric{ + totalMetric("node001-bb088", "cpu", "qa-1a", "bb088", 128), + totalMetric("node001-bb088", "ram", "qa-1a", "bb088", 549755813888), // 512Gi + usageMetric("node001-bb088", "cpu", "utilized", "qa-1a", "bb088", 64), + usageMetric("node001-bb088", "ram", "utilized", "qa-1a", "bb088", 274877906944), // 256Gi + usageMetric("node001-bb088", "cpu", "reserved", "qa-1a", "bb088", 0), + usageMetric("node001-bb088", "ram", "reserved", "qa-1a", "bb088", 0), + // failover = 8+12=20 CPU, 32Gi+48Gi=80Gi RAM + usageMetric("node001-bb088", "cpu", "failover", "qa-1a", "bb088", 20), + usageMetric("node001-bb088", "ram", "failover", "qa-1a", "bb088", 85899345920), // 80Gi + usageMetric("node001-bb088", "cpu", "payg", "qa-1a", "bb088", 44), // 128-64-0-20 + usageMetric("node001-bb088", "ram", "payg", "qa-1a", "bb088", 188978561024), // 512Gi-256Gi-0-80Gi = 176Gi + }, }, } @@ -466,10 +1083,16 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { if err := hv1.AddToScheme(scheme); err != nil { t.Fatalf("failed to add hypervisor scheme: %v", err) } + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add v1alpha1 scheme: %v", err) + } - objects := make([]runtime.Object, len(tt.hypervisors)) + objects := make([]runtime.Object, 0, len(tt.hypervisors)+len(tt.reservations)) for i := range tt.hypervisors { - objects[i] = &tt.hypervisors[i] + objects = append(objects, &tt.hypervisors[i]) + } + for i := range tt.reservations { + objects = append(objects, &tt.reservations[i]) } client := fake.NewClientBuilder(). @@ -486,25 +1109,24 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { kpi.Collect(ch) close(ch) - actualMetrics := make(map[string][]expectedMetric) + var actualMetrics []kvmExpectedMetric for metric := range ch { var m prometheusgo.Metric if err := metric.Write(&m); err != nil { t.Fatalf("failed to write metric: %v", err) } - // Extract metric name from description - desc := metric.Desc().String() - metricName := getMetricName(desc) + metricName := getMetricName(metric.Desc().String()) - // Extract labels - labels := metricLabels{} + labels := kvmMetricLabels{} for _, label := range m.Label { switch label.GetName() { case "compute_host": labels.ComputeHost = label.GetValue() case "resource": labels.Resource = label.GetValue() + case "type": + labels.Type = label.GetValue() case "availability_zone": labels.AvailabilityZone = label.GetValue() case "building_block": @@ -524,36 +1146,175 @@ func TestKVMResourceCapacityKPI_Collect(t *testing.T) { } } - actualMetrics[metricName] = append(actualMetrics[metricName], expectedMetric{ + actualMetrics = append(actualMetrics, kvmExpectedMetric{ + Name: metricName, Labels: labels, Value: m.GetGauge().GetValue(), }) } - // Verify expected metrics - for metricName, expectedList := range tt.expectedMetrics { - actualList, ok := actualMetrics[metricName] + // Verify exact equality: same number of metrics and each expected metric is present. + if len(actualMetrics) != len(tt.expectedMetrics) { + t.Errorf("metric count mismatch: expected %d, got %d\nactual: %+v", + len(tt.expectedMetrics), len(actualMetrics), actualMetrics) + } + for _, expected := range tt.expectedMetrics { + found := false + for _, actual := range actualMetrics { + nameMatch := expected.Name == "" || actual.Name == expected.Name + if nameMatch && actual.Labels == expected.Labels { + found = true + if actual.Value != expected.Value { + t.Errorf("metric %s with labels %+v: expected value %f, got %f", + expected.Name, expected.Labels, expected.Value, actual.Value) + } + break + } + } + if !found { + t.Errorf("metric %s with labels %+v not found in actual metrics", + expected.Name, expected.Labels) + } + } + for _, actual := range actualMetrics { + found := false + for _, expected := range tt.expectedMetrics { + nameMatch := expected.Name == "" || actual.Name == expected.Name + if nameMatch && actual.Labels == expected.Labels { + found = true + break + } + } + if !found { + t.Errorf("unexpected metric %s with labels %+v (value %f) in actual metrics", + actual.Name, actual.Labels, actual.Value) + } + } + }) + } +} + +func TestAggregateReservationsByHost(t *testing.T) { + tests := []struct { + name string + reservations []v1alpha1.Reservation + expectedFailover map[string]hostReservationResources + expectedCommittedNotInUse map[string]hostReservationResources + }{ + { + name: "empty reservations", + reservations: nil, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{}, + }, + { + name: "reservation with no ready condition is skipped", + reservations: []v1alpha1.Reservation{ + { + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("10"), + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "host-1", + // No conditions + }, + }, + }, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{}, + }, + { + name: "reservation with empty host is skipped", + reservations: []v1alpha1.Reservation{ + { + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeFailover, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("10"), + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: "", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{}, + }, + { + name: "committed resource with nil spec does not panic", + reservations: []v1alpha1.Reservation{ + { + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceCPU: resource.MustParse("16"), + hv1.ResourceMemory: resource.MustParse("64Gi"), + }, + CommittedResourceReservation: nil, + }, + Status: v1alpha1.ReservationStatus{ + Host: "host-1", + Conditions: []v1.Condition{ + {Type: v1alpha1.ReservationConditionReady, Status: v1.ConditionTrue}, + }, + }, + }, + }, + expectedFailover: map[string]hostReservationResources{}, + expectedCommittedNotInUse: map[string]hostReservationResources{ + "host-1": { + cpu: resource.MustParse("16"), + memory: resource.MustParse("64Gi"), + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + failover, committed := aggregateReservationsByHost(tt.reservations) + + if len(failover) != len(tt.expectedFailover) { + t.Errorf("failover map length: expected %d, got %d", len(tt.expectedFailover), len(failover)) + } + for host, expected := range tt.expectedFailover { + actual, ok := failover[host] if !ok { - t.Errorf("metric %q not found in actual metrics", metricName) + t.Errorf("failover: host %q not found", host) continue } + if actual.cpu.Cmp(expected.cpu) != 0 { + t.Errorf("failover[%s].cpu: expected %s, got %s", host, expected.cpu.String(), actual.cpu.String()) + } + if actual.memory.Cmp(expected.memory) != 0 { + t.Errorf("failover[%s].memory: expected %s, got %s", host, expected.memory.String(), actual.memory.String()) + } + } - for _, expected := range expectedList { - found := false - for _, actual := range actualList { - if actual.Labels == expected.Labels { - found = true - if actual.Value != expected.Value { - t.Errorf("metric %q with labels %+v: expected value %f, got %f", - metricName, expected.Labels, expected.Value, actual.Value) - } - break - } - } - if !found { - t.Errorf("metric %q with labels %+v not found in actual metrics", - metricName, expected.Labels) - } + if len(committed) != len(tt.expectedCommittedNotInUse) { + t.Errorf("committed map length: expected %d, got %d", len(tt.expectedCommittedNotInUse), len(committed)) + } + for host, expected := range tt.expectedCommittedNotInUse { + actual, ok := committed[host] + if !ok { + t.Errorf("committed: host %q not found", host) + continue + } + if actual.cpu.Cmp(expected.cpu) != 0 { + t.Errorf("committed[%s].cpu: expected %s, got %s", host, expected.cpu.String(), actual.cpu.String()) + } + if actual.memory.Cmp(expected.memory) != 0 { + t.Errorf("committed[%s].memory: expected %s, got %s", host, expected.memory.String(), actual.memory.String()) } } })