Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/guides/multicluster/readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Cortex Multi-Cluster Testing

> [!NOTE]
> If you want to skip the reading part, there's `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you.
> If you want to skip the reading part, there are `run.sh` and `cleanup.sh` scripts in this directory that will set up and tear down the multi-cluster environment for you. If you want to test the multi-cluster setup, you can run the `schedule.sh` script, which will create a scheduling request and show you how it gets processed across the clusters.

Cortex provides support for multi-cluster deployments, where a "home" cluster hosts the cortex pods and one or more "remote" clusters are used to persist CRDs. A typical use case for this would be to offload the etcd storage for Cortex CRDs to a remote cluster, reducing the resource usage on the home cluster. Similarly, another use case is to have multiple remote clusters that maintain all the compute workloads and expose resources that Cortex needs to access, such as the `Hypervisor` resource.

Expand Down
8 changes: 6 additions & 2 deletions docs/guides/multicluster/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,20 @@ global:
gvks:
- kvm.cloud.sap/v1/Hypervisor
- kvm.cloud.sap/v1/HypervisorList
- cortex.cloud/v1alpha1/History
- cortex.cloud/v1alpha1/HistoryList
labels:
az: cortex-remote-az-a
availabilityZone: cortex-remote-az-a
caCert: |
$(cat /tmp/root-ca-remote-az-a.pem | sed 's/^/ /')
- host: https://host.docker.internal:8445
gvks:
- kvm.cloud.sap/v1/Hypervisor
- kvm.cloud.sap/v1/HypervisorList
- cortex.cloud/v1alpha1/History
- cortex.cloud/v1alpha1/HistoryList
labels:
az: cortex-remote-az-b
availabilityZone: cortex-remote-az-b
caCert: |
$(cat /tmp/root-ca-remote-az-b.pem | sed 's/^/ /')
EOF
Expand Down
196 changes: 196 additions & 0 deletions docs/guides/multicluster/schedule.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#!/bin/bash

# Abort immediately if any command fails.
set -e

# External scheduler API endpoint exposed by the cortex home cluster.
API_URL="http://localhost:8001/scheduler/nova/external"
# Fixed instance UUID so the resulting History CRD has a predictable name.
INSTANCE_UUID="cortex-test-instance-001"

echo "Applying test pipeline to home cluster"
kubectl --context kind-cortex-home apply -f docs/guides/multicluster/test-pipeline.yaml

echo ""
echo "Sending scheduling request for instance $INSTANCE_UUID"
echo "The test pipeline will schedule the instance on one of the hosts in cortex-remote-az-b."
echo "Hosts: hypervisor-1-az-a, hypervisor-2-az-a, hypervisor-1-az-b, hypervisor-2-az-b"
echo ""

# POST a Nova-style RequestSpec to the scheduler. `-w "\n%{http_code}"`
# appends the HTTP status code on its own line so it can be split from the
# response body below. The heredoc delimiter is unquoted on purpose so that
# $INSTANCE_UUID is expanded inside the JSON payload.
RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$API_URL" \
-H "Content-Type: application/json" \
-d @- <<EOF
{
"spec": {
"nova_object.name": "RequestSpec",
"nova_object.namespace": "nova",
"nova_object.version": "1.14",
"nova_object.changes": [],
"nova_object.data": {
"project_id": "test-project",
"user_id": "test-user",
"instance_uuid": "$INSTANCE_UUID",
"availability_zone": "cortex-remote-az-b",
"num_instances": 1,
"is_bfv": false,
"scheduler_hints": {},
"ignore_hosts": null,
"force_hosts": null,
"force_nodes": null,
"image": {
"nova_object.name": "ImageMeta",
"nova_object.namespace": "nova",
"nova_object.version": "1.8",
"nova_object.changes": [],
"nova_object.data": {
"id": "00000000-0000-0000-0000-000000000001",
"name": "test-image",
"status": "active",
"checksum": "0000000000000000",
"owner": "test-project",
"size": 1024,
"container_format": "bare",
"disk_format": "raw",
"created_at": "2025-01-01T00:00:00Z",
"updated_at": "2025-01-01T00:00:00Z",
"min_ram": 0,
"min_disk": 0,
"properties": {
"nova_object.name": "ImageMetaProps",
"nova_object.namespace": "nova",
"nova_object.version": "1.36",
"nova_object.changes": [],
"nova_object.data": {}
}
}
},
"flavor": {
"nova_object.name": "Flavor",
"nova_object.namespace": "nova",
"nova_object.version": "1.2",
"nova_object.changes": [],
"nova_object.data": {
"id": 1,
"name": "m1.small",
"memory_mb": 2048,
"vcpus": 1,
"root_gb": 20,
"ephemeral_gb": 0,
"flavorid": "1",
"swap": 0,
"rxtx_factor": 1.0,
"vcpu_weight": 0,
"disabled": false,
"is_public": true,
"extra_specs": {
"capabilities:hypervisor_type": "qemu"
},
"description": null,
"created_at": "2025-01-01T00:00:00Z",
"updated_at": null
}
},
"request_level_params": {
"nova_object.name": "RequestLevelParams",
"nova_object.namespace": "nova",
"nova_object.version": "1.1",
"nova_object.changes": [],
"nova_object.data": {
"root_required": [],
"root_forbidden": [],
"same_subtree": []
}
},
"network_metadata": {
"nova_object.name": "NetworkMetadata",
"nova_object.namespace": "nova",
"nova_object.version": "1.0",
"nova_object.changes": [],
"nova_object.data": {
"physnets": [],
"tunneled": false
}
},
"limits": {
"nova_object.name": "SchedulerLimits",
"nova_object.namespace": "nova",
"nova_object.version": "1.0",
"nova_object.changes": [],
"nova_object.data": {}
},
"requested_networks": {
"objects": null
},
"security_groups": {
"objects": null
}
}
},
"context": {
"user": "test-user",
"project_id": "test-project",
"system_scope": null,
"project": "test-project",
"domain": null,
"user_domain": "Default",
"project_domain": "Default",
"is_admin": false,
"read_only": false,
"show_deleted": false,
"request_id": "req-test-001",
"global_request_id": null,
"resource_uuid": null,
"roles": [],
"user_identity": "test-user test-project - Default -",
"is_admin_project": false,
"read_deleted": "no",
"remote_address": "127.0.0.1",
"timestamp": "2025-01-01T00:00:00.000000",
"quota_class": null,
"user_name": "test-user",
"project_name": "test-project"
},
"hosts": [
{"host": "hypervisor-1-az-a", "hypervisor_hostname": "hypervisor-1-az-a"},
{"host": "hypervisor-2-az-a", "hypervisor_hostname": "hypervisor-2-az-a"},
{"host": "hypervisor-1-az-b", "hypervisor_hostname": "hypervisor-1-az-b"},
{"host": "hypervisor-2-az-b", "hypervisor_hostname": "hypervisor-2-az-b"}
],
"weights": {
"hypervisor-1-az-a": 1.0,
"hypervisor-2-az-a": 2.0,
"hypervisor-1-az-b": 3.0,
"hypervisor-2-az-b": 4.0
},
"pipeline": "multicluster-test"
}
EOF
)

# Split the curl output: the last line is the status code added by -w,
# everything before it is the response body.
HTTP_CODE=$(echo "$RESPONSE" | tail -1)
BODY=$(echo "$RESPONSE" | sed '$d')

echo "Response from scheduler:"
echo "HTTP $HTTP_CODE"
# Pretty-print the body if it is valid JSON; otherwise print it raw.
echo "$BODY" | python3 -m json.tool 2>/dev/null || echo "$BODY"

Comment thread
coderabbitai[bot] marked this conversation as resolved.
# Brief pause so the scheduler has a moment to persist the History CRDs
# before we query the clusters for them.
sleep 1
echo ""
echo "--- Check History CRDs in cortex-home ---"
kubectl --context kind-cortex-home get histories
kubectl --context kind-cortex-home get events --field-selector reason=SchedulingSucceeded
echo ""
echo "--- Check History CRDs in cortex-remote-az-a ---"
kubectl --context kind-cortex-remote-az-a get histories
kubectl --context kind-cortex-remote-az-a get events --field-selector reason=SchedulingSucceeded

echo ""
echo "--- Check History CRDs in cortex-remote-az-b ---"
kubectl --context kind-cortex-remote-az-b get histories
kubectl --context kind-cortex-remote-az-b get events --field-selector reason=SchedulingSucceeded

echo "---"
echo "Press enter to describe the History CRD in cortex-remote-az-b and see the details of the scheduling result"
# Only block on the prompt when stdin is a TTY; in CI or piped runs an
# unconditional `read -r` would stall the script indefinitely.
if [[ -t 0 ]]; then
  read -r
fi
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Avoid blocking on prompt in non-interactive runs.

Line 191 always waits for input. In CI or piped runs, this can stall the script indefinitely.

Proposed fix
 echo "---"
 echo "Press enter to describe the History CRD in cortex-remote-az-b and see the details of the scheduling result"
-read -r
+if [[ -t 0 ]]; then
+  read -r
+fi
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
echo "Press enter to describe the History CRD in cortex-remote-az-b and see the details of the scheduling result"
read -r
echo "Press enter to describe the History CRD in cortex-remote-az-b and see the details of the scheduling result"
if [[ -t 0 ]]; then
read -r
fi
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@docs/guides/multicluster/schedule.sh` around lines 190 - 191, The script
currently always blocks on the interactive prompt (echo followed by read -r),
which stalls CI/piped runs; wrap the prompt/read so it only executes when stdin
is a TTY (e.g., check [ -t 0 ] or test for /dev/tty) and skip it in
non-interactive runs, leaving the echo message or a non-blocking log in place
when skipping; update the block around the existing echo and read -r to perform
that TTY check before calling read so CI pipelines won’t hang.


echo "--- Describe History CRD in cortex-remote-az-b ---"
kubectl --context kind-cortex-remote-az-b describe history nova-cortex-test-instance-001
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated


12 changes: 12 additions & 0 deletions docs/guides/multicluster/test-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Minimal Pipeline resource used by the multicluster guide's schedule.sh
# script to exercise scheduling across clusters.
apiVersion: cortex.cloud/v1alpha1
kind: Pipeline
metadata:
  name: multicluster-test
spec:
  schedulingDomain: nova
  description: Minimal test pipeline for the multicluster guide.
  type: filter-weigher
  # Persist a History CRD per scheduling decision so the guide can inspect it.
  createHistory: true
  filters:
    - name: filter_correct_az
  weighers: []
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline(
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
c.Initializer = c
c.SchedulingDomain = v1alpha1.SchedulingDomainCinder
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-cinder-scheduler")}
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-cinder-scheduler")}
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ func (c *FilterWeigherPipelineController) handleMachine() handler.EventHandler {
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
c.Initializer = c
c.SchedulingDomain = v1alpha1.SchedulingDomainMachines
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-machines-scheduler")}
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-machines-scheduler")}
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func (c *FilterWeigherPipelineController) InitPipeline(
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
c.Initializer = c
c.SchedulingDomain = v1alpha1.SchedulingDomainManila
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-manila-scheduler")}
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-manila-scheduler")}
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ func (c *FilterWeigherPipelineController) InitPipeline(
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
c.Initializer = c
c.SchedulingDomain = v1alpha1.SchedulingDomainNova
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-nova-scheduler")}
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-nova-scheduler")}
c.gatherer = &candidateGatherer{Client: mcl}
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
return err
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ func (c *FilterWeigherPipelineController) handlePod() handler.EventHandler {
func (c *FilterWeigherPipelineController) SetupWithManager(mgr manager.Manager, mcl *multicluster.Client) error {
c.Initializer = c
c.SchedulingDomain = v1alpha1.SchedulingDomainPods
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mgr.GetEventRecorder("cortex-pods-scheduler")}
c.HistoryManager = lib.HistoryClient{Client: mcl, Recorder: mcl.GetEventRecorder("cortex-pods-scheduler")}
if err := mgr.Add(manager.RunnableFunc(c.InitAllPipelines)); err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion internal/scheduling/reservations/failover/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,7 @@ func (c *FailoverReservationController) patchReservationStatus(ctx context.Conte
// SetupWithManager sets up the watch-based reconciler with the Manager.
// This handles per-reservation reconciliation triggered by CRD changes.
func (c *FailoverReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error {
c.Recorder = mgr.GetEventRecorder("failover-reservation-controller")
c.Recorder = mcl.GetEventRecorder("failover-reservation-controller")

bldr := multicluster.BuildController(mcl, mgr)
bldr, err := bldr.WatchesMulticluster(
Expand Down
18 changes: 16 additions & 2 deletions pkg/multicluster/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/tools/events"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -74,8 +76,9 @@ func (f *fakeCache) getIndexFieldCalls() []indexFieldCall {
// fakeCluster implements cluster.Cluster interface for testing.
type fakeCluster struct {
cluster.Cluster
fakeClient client.Client
fakeCache *fakeCache
fakeClient client.Client
fakeCache *fakeCache
fakeRecorder events.EventRecorder
}

func (f *fakeCluster) GetClient() client.Client {
Expand All @@ -86,6 +89,17 @@ func (f *fakeCluster) GetCache() cache.Cache {
return f.fakeCache
}

func (f *fakeCluster) GetEventRecorder(_ string) events.EventRecorder {
if f.fakeRecorder != nil {
return f.fakeRecorder
}
return &fakeEventRecorder{}
}

func (f *fakeCluster) GetEventRecorderFor(_ string) record.EventRecorder {
return record.NewFakeRecorder(100)
}

func newFakeCluster(scheme *runtime.Scheme, objs ...client.Object) *fakeCluster {
return &fakeCluster{
fakeClient: fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build(),
Expand Down
Loading
Loading