From 28231d0b03aa421ccdcd257cf113774d3937c781 Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Thu, 21 May 2026 15:37:01 -0300 Subject: [PATCH 1/2] OCPBUGS-83399: support CCM AWS e2e tests on HyperShift hosted clusters The AWSServiceLBNetworkSecurityGroup e2e tests fail on HyperShift because the test binary cannot reach AWS APIs (VPC private endpoint DNS) and the cloud-config ConfigMap has a different name and location than on standalone clusters. AWS endpoint resolution: - Force public regional endpoints (BaseEndpoint) on ELBv2 and EC2 clients to bypass VPC private endpoint DNS that is unreachable from the CI management cluster. - Validate cfg.Region against the AWS region pattern and fall back to infrastructure/cluster status.platformStatus.aws.region when the SDK returns a non-region value (e.g. CI lease UUID from LEASED_RESOURCE). Cloud-config validation (HyperShift-aware): - Detect External topology via Infrastructure resource to choose the right cloud-config source. - On HyperShift: read ConfigMap aws-cloud-config from the HCP namespace on the management cluster (via HYPERSHIFT_MANAGEMENT_CLUSTER_KUBECONFIG and HYPERSHIFT_MANAGEMENT_CLUSTER_NAMESPACE env vars). - On standalone: read ConfigMap cloud-conf from openshift-cloud-controller-manager namespace (existing behavior). - Extract cloud-config helpers (GetCloudConfig, IsConfigPresentCloudConfig, IsNLBSecurityGroupModeManaged, GetKubeClient) into common/helper.go without Ginkgo control-flow calls so they are safe to use from both spec and non-spec contexts. Skip mechanism: - Add SKIP_MANAGEMENT_CLUSTER_TESTS=true env var to gracefully skip tests requiring management cluster access when the kubeconfig is not available. Documentation: - Replace stale docs/dev/ote-ccm-aws.md with e2e-ote-ccm-aws.md covering corrected paths, batch test execution, HyperShift setup (env vars, NLB ingress patching), and the skip flag. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/dev/e2e-ote-ccm-aws.md | 205 ++++++++++++++++++ docs/dev/ote-ccm-aws.md | 92 -------- .../ccm-aws-tests/e2e/aws/helper.go | 76 ++++++- .../ccm-aws-tests/e2e/aws/loadbalancer.go | 3 +- .../ccm-aws-tests/e2e/common/helper.go | 89 +++++++- openshift-tests/ccm-aws-tests/go.mod | 2 +- 6 files changed, 365 insertions(+), 102 deletions(-) create mode 100644 docs/dev/e2e-ote-ccm-aws.md delete mode 100644 docs/dev/ote-ccm-aws.md diff --git a/docs/dev/e2e-ote-ccm-aws.md b/docs/dev/e2e-ote-ccm-aws.md new file mode 100644 index 000000000..da6d65a69 --- /dev/null +++ b/docs/dev/e2e-ote-ccm-aws.md @@ -0,0 +1,205 @@ +# OpenShift Tests Extension (OTE) for AWS Cloud Controller Manager + +The OpenShift Tests Extension (OTE) binary `cloud-controller-manager-aws-tests-ext` +exposes the upstream e2e tests from `cloud-provider-aws` (implemented under `tests/e2e`) +to the OpenShift test framework. Tests are selected/curated by the filters defined in +`main.go`, so that only the intended subset is available in the OpenShift test pipeline. + +OpenShift-specific (downstream) tests live under `openshift-tests/ccm-aws-tests/e2e/aws` +and are added to the list of tests executed by `openshift-tests`. + +The OTE library uses dot imports to register both upstream and downstream Ginkgo specs into the suite. + +```mermaid +flowchart LR + A["Upstream Tests
kubernetes/cloud-provider-aws
tests/e2e"] --> C["OTE Binary
cloud-controller-manager-aws-tests-ext"] + B["Downstream Tests
openshift/cluster-cloud-controller-manager-operator
openshift-tests/ccm-aws-tests/e2e"] --> C + C --> D["openshift-tests
execution framework"] + + style A fill:#e1f5ff + style B fill:#fff4e1 + style C fill:#e8f5e9 + style D fill:#f3e5f5 +``` + +## Directory Structure + +```text +openshift-tests/ccm-aws-tests/ +├── e2e/ +│ ├── aws/ +│ │ ├── loadbalancer.go # OpenShift-specific load balancer tests +│ │ └── helper.go # AWS client helpers (ELBv2, EC2, LB lookup) +│ └── common/ +│ └── helper.go # Shared helpers (feature gate, topology detection) +├── main.go # Test binary entrypoint +├── go.mod +└── vendor/ +``` + +## Prerequisites + +- Go 1.24+ +- Access to an OpenShift cluster on AWS +- Valid `KUBECONFIG` pointing to the cluster +- AWS credentials configured (for tests that query AWS APIs): + `AWS_REGION`, `AWS_SHARED_CREDENTIALS_FILE`, or default SDK credential chain + +## Building + +### Building the test binary + +From the root of the project: + +```sh +make cloud-controller-manager-aws-tests-ext +``` + +The binary will be at `openshift-tests/bin/cloud-controller-manager-aws-tests-ext`. + +### Building the container image + +```sh +podman build --authfile $PULL_SECRET_FILE -f Dockerfile.openshift -t ccm-local:devel . +# OR +make image +``` + +The binary is embedded in the image at `/usr/bin/cloud-controller-manager-aws-tests-ext.gz`. + +## Using the test binary + +### List available tests + +```sh +./openshift-tests/bin/cloud-controller-manager-aws-tests-ext list tests | jq .[].name +``` + +### List test suites + +```sh +./openshift-tests/bin/cloud-controller-manager-aws-tests-ext list suites | jq .[].name +``` + +### Run a specific test (standalone cluster) + +```sh +export KUBECONFIG=/path/to/kubeconfig +export AWS_REGION=us-east-1 + +./openshift-tests/bin/cloud-controller-manager-aws-tests-ext run-test \ + "[cloud-provider-aws-e2e] loadbalancer CLB should be reachable with default configurations [Suite:openshift/conformance/parallel]" +``` + +### Run multiple tests by pattern + +The `run-test` command only supports running **one test at a time** (a known +OTE framework limitation). To batch-run tests matching a +pattern, use process substitution so that each invocation gets its own stdin: + +```sh +BIN=./openshift-tests/bin/cloud-controller-manager-aws-tests-ext + +# Run all AWSServiceLBNetworkSecurityGroup tests +while IFS= read -r t; do + echo "=== Running: $t"; $BIN run-test "$t" < /dev/null +done < <($BIN list tests | jq -r '.[].name' | grep "AWSServiceLBNetworkSecurityGroup") + +# Run all upstream loadbalancer tests +while IFS= read -r t; do + echo "=== Running: $t"; $BIN run-test "$t" < /dev/null +done < <($BIN list tests | jq -r '.[].name' | grep "\[cloud-provider-aws-e2e\] loadbalancer") + +# Run all tests (upstream + downstream) +while IFS= read -r t; do + echo "=== Running: $t"; $BIN run-test "$t" < /dev/null +done < <($BIN list tests | jq -r '.[].name') +``` + +> **Note:** The `< /dev/null` is required — `run-test` reads stdin for +> additional test names, and without it the first invocation would consume +> all remaining names from the loop's input. + +Results may be quite verbose, you can pipe the logs to file and query results later with a summary: + +```sh +run_test(){ + while IFS= read -r t; do + echo "=== Running: $t"; + $BIN run-test "$t" < /dev/null; +done < <($BIN list tests | jq -r '.[].name' | grep "AWSServiceLBNetworkSecurityGroup"); } + +run_test | tee -a e2e-ote.log + +grep -E "(name\"\:|\"result\")" e2e-ote.log +``` + +### Run a specific test (HyperShift hosted cluster) + +When running against a HyperShift hosted cluster, `KUBECONFIG` must point to the +**guest** (hosted) cluster. Additionally, the AWSServiceLBNetworkSecurityGroup +tests need access to the management cluster to validate the CCM cloud-config, +which lives in the hosted control plane namespace. + +Set these environment variables before running: + +```sh +# Guest cluster kubeconfig (where tests run) +export KUBECONFIG=/path/to/hosted-cluster/kubeconfig + +# Management cluster kubeconfig (for cloud-config validation) +export HYPERSHIFT_MANAGEMENT_CLUSTER_KUBECONFIG=/path/to/management-cluster/kubeconfig + +# HCP namespace on the management cluster (e.g., clusters-) +export HYPERSHIFT_MANAGEMENT_CLUSTER_NAMESPACE=clusters-my-hosted-cluster + +# AWS credentials for the account where the hosted cluster's resources live. +# These must have permissions for DescribeLoadBalancers (ELBv2) and +# DescribeSecurityGroups (EC2). In CI, this is the hypershift pool account. +export AWS_SHARED_CREDENTIALS_FILE=/path/to/aws/credentials +export AWS_REGION=us-east-1 +``` + +Then run the test: + +```sh +./openshift-tests/bin/cloud-controller-manager-aws-tests-ext run-test \ + "[cloud-provider-aws-e2e-openshift] loadbalancer NLB [OCPFeatureGate:AWSServiceLBNetworkSecurityGroup] should have NLBSecurityGroupMode with 'Managed value in cloud-config [Suite:openshift/conformance/parallel]" +``` + +The test automatically detects External topology (HyperShift) by querying the +cluster's `Infrastructure` resource and reads the cloud-config from ConfigMap +`aws-cloud-config` in the HCP namespace on the management cluster, instead of +`cloud-conf` in `openshift-cloud-controller-manager` on the guest cluster. + + +#### Skipping tests that require management cluster access + +If you are running against a HyperShift hosted cluster but do **not** have +access to the management cluster kubeconfig, you can skip those tests by setting: + +```sh +export SKIP_MANAGEMENT_CLUSTER_TESTS=true +``` + +This will skip any test that requires reading resources from the management +cluster (e.g., the cloud-config validation test). All other tests will run +normally. + +## CI Jobs + +### Self managed + +Any job using `openshift/conformance/parallel` suite on OpenShift self-managed on AWS must run tests provided by CCM-AWS OTE, unless explicitly skipped (See [OTE setup](https://github.com/openshift/cluster-cloud-controller-manager-operator/tree/main/openshift-tests/ccm-aws-tests) for more information). + +### HyperShift (Hosted Cluster) + +The periodic CI job that runs these tests on Hosted Cluster is: + +```text +periodic-ci-openshift-hypershift-release-5.0-e2e-aws-ovn-conformance-ccm-techpreview +``` + +The CI step (`hypershift-conformance`) automatically sets the management cluster +environment variables (`HYPERSHIFT_MANAGEMENT_CLUSTER_KUBECONFIG`, +`HYPERSHIFT_MANAGEMENT_CLUSTER_NAMESPACE`) before launching `openshift-tests`. diff --git a/docs/dev/ote-ccm-aws.md b/docs/dev/ote-ccm-aws.md deleted file mode 100644 index 8b89736b2..000000000 --- a/docs/dev/ote-ccm-aws.md +++ /dev/null @@ -1,92 +0,0 @@ -# OpenShift Tests Extension (OTE) for AWS Cloud Controller Manager - -The OpenShift Tests Extension (OTE) binary `aws-cloud-controller-manager-tests-ext` -exposes the upstream e2e tests from `cloud-provider-aws` (implemented under `tests/e2e`) -to the OpenShift test framework. Tests are selected/curated by the filters defined in -`main.go`, so that only the intended subset is available in the OpenShift test pipeline. - -OpenShift-specific (downstream) tests live under `cmd/cloud-controller-manager-aws-tests-ext/e2e` -and are added to the list of tests executed by `openshift-tests`. - -The OTE library uses dot imports to register both upstream and downstream Ginkgo specs into the suite. - -```mermaid -flowchart LR - A["Upstream Tests
kubernetes/cloud-provider-aws
tests/e2e"] --> C["OTE Binary
aws-cloud-controller-manager-tests-ext"] - B["Downstream Tests
openshift/cluster-cloud-controller-manager-operator
cmd/cloud-controller-manager-aws-tests-ext/e2e"] --> C - C --> D["openshift-tests
execution framework"] - - style A fill:#e1f5ff - style B fill:#fff4e1 - style C fill:#e8f5e9 - style D fill:#f3e5f5 -``` - -## Directory Structure - -``` -cmd/cloud-controller-manager-aws-tests-ext/ -├── e2e/ -│ ├── loadbalancer.go # OpenShift-specific load balancer tests -│ └── helper.go # Helper functions (AWS clients, feature gate checks, etc.) -├── main.go # Test binary entrypoint -└── README.md -``` - -## Prerequisites - -- Go 1.24+ -- Access to an OpenShift cluster on AWS -- Valid `KUBECONFIG` pointing to the cluster -- AWS credentials configured (for tests that query AWS APIs) - -## Building - -### Building the test binary - -To build the OTE binary from the root of the project: - -```sh -make cloud-controller-manager-aws-tests-ext -``` - -The binary will be created at the `bin/` directory: `./bin/cloud-controller-manager-aws-tests-ext`. - -### Building the container image - -To build the container image (regular CCCMO build): - -```sh -make image -``` - -The binary will be created in the container image path `/usr/bin/cloud-controller-manager-aws-tests-ext.gz`. - -## Using the test binary - -### List available tests - -```sh -./aws-cloud-controller-manager-tests-ext list --topology=HighAvailability --platform=aws | jq .[].name -``` - -Example output: -``` -"[cloud-provider-aws-e2e-openshift] loadbalancer NLB feature AWSServiceLBNetworkSecurityGroup should have NLBSecurityGroupMode = Managed in cloud-config [Suite:openshift/conformance/parallel]" -... -"[cloud-provider-aws-e2e] nodes should set zone-id topology label [Suite:openshift/conformance/parallel]" -"[cloud-provider-aws-e2e] nodes should label nodes with topology network info if instance is supported [Suite:openshift/conformance/parallel]" -``` - -### List test suites - -```sh -./bin/cloud-controller-manager-aws-tests-ext list suites | jq .[].name -``` - -### Run a specific test - -```sh -export KUBECONFIG=/path/to/kubeconfig -./bin/cloud-controller-manager-aws-tests-ext run-test "[cloud-provider-aws-e2e-openshift] loadbalancer NLB [OCPFeatureGate:AWSServiceLBNetworkSecurityGroup] should have security groups attached to default ingress controller NLB [Suite:openshift/conformance/parallel]" -``` diff --git a/openshift-tests/ccm-aws-tests/e2e/aws/helper.go b/openshift-tests/ccm-aws-tests/e2e/aws/helper.go index f08e7a7cb..c0fec51bc 100644 --- a/openshift-tests/ccm-aws-tests/e2e/aws/helper.go +++ b/openshift-tests/ccm-aws-tests/e2e/aws/helper.go @@ -3,6 +3,7 @@ package aws import ( "context" "fmt" + "regexp" "strings" "time" @@ -14,17 +15,71 @@ import ( elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" "github.com/openshift/cluster-cloud-controller-manager-operator/openshift-tests/ccm-aws-tests/e2e/common" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/kubernetes/test/e2e/framework" ) +// awsRegionPattern matches valid AWS region names across all partitions: +// standard (aws: us-east-1), China (aws-cn: cn-northwest-1), +// GovCloud (aws-us-gov: us-gov-west-1), European Sovereign Cloud (aws-eusc: eusc-de-east-1), +// and ISO/ISOB (aws-iso/iso-b: us-isob-east-1). +var awsRegionPattern = regexp.MustCompile(`^[a-z]{2,4}(?:-[a-z0-9]+)+-\d+$`) + // AWS helpers +// loadAWSConfig loads the default AWS SDK configuration. If the resolved region +// is not a valid AWS region (e.g. a CI lease UUID), it falls back to the region +// from the cluster's Infrastructure resource. +func loadAWSConfig(ctx context.Context) (aws.Config, error) { + cfg, err := config.LoadDefaultConfig(ctx) + if err != nil { + return aws.Config{}, fmt.Errorf("unable to load AWS config: %v", err) + } + + // This validation is required to prevent landing wrong hypershift region configurations + // set by environment variable. + if !awsRegionPattern.MatchString(cfg.Region) { + region, err := getRegionFromInfrastructure(ctx) + if err != nil { + return aws.Config{}, fmt.Errorf("AWS region %q is not valid and failed to get region from Infrastructure: %v", cfg.Region, err) + } + framework.Logf("AWS SDK region %q is not valid, using region from Infrastructure: %s", cfg.Region, region) + cfg.Region = region + } + + framework.Logf("AWS config loaded: region=%s", cfg.Region) + return cfg, nil +} + +// getRegionFromInfrastructure reads the AWS region from the cluster's +// Infrastructure resource (status.platformStatus.aws.region). +func getRegionFromInfrastructure(ctx context.Context) (string, error) { + oc, err := common.GetOcClient(ctx) + if err != nil { + return "", fmt.Errorf("failed to create config client: %v", err) + } + infra, err := oc.Infrastructures().Get(ctx, "cluster", metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("failed to get Infrastructure: %v", err) + } + if infra.Status.PlatformStatus == nil || infra.Status.PlatformStatus.AWS == nil { + return "", fmt.Errorf("Infrastructure platformStatus.aws is nil") + } + region := infra.Status.PlatformStatus.AWS.Region + if region == "" { + return "", fmt.Errorf("Infrastructure platformStatus.aws.region is empty") + } + return region, nil +} + // createAWSClientLoadBalancer creates an AWS ELBv2 client using default credentials configured in the environment. +// It forces the public regional endpoint to avoid VPC private endpoint DNS +// resolution issues when running from a management cluster (HyperShift). func createAWSClientLoadBalancer(ctx context.Context) (*elbv2.Client, error) { - cfg, err := config.LoadDefaultConfig(ctx) + cfg, err := loadAWSConfig(ctx) if err != nil { - return nil, fmt.Errorf("unable to load AWS config: %v", err) + return nil, err } customRetryer := retry.NewStandard(func(o *retry.StandardOptions) { @@ -34,6 +89,9 @@ func createAWSClientLoadBalancer(ctx context.Context) (*elbv2.Client, error) { return elbv2.NewFromConfig(cfg, func(o *elbv2.Options) { o.Retryer = customRetryer + if cfg.Region != "" { + o.BaseEndpoint = aws.String(fmt.Sprintf("https://elasticloadbalancing.%s.amazonaws.com", cfg.Region)) + } }), nil } @@ -100,13 +158,19 @@ func isFeatureEnabled(ctx context.Context, featureName string) (bool, error) { return common.IsFeatureEnabled(ctx, featureName) } -// getAWSClientEC2 creates an AWS EC2 client using default credentials configured in the environment. +// createAWSClientEC2 creates an AWS EC2 client using default credentials configured in the environment. +// It forces the public regional endpoint to avoid VPC private endpoint DNS +// resolution issues when running from a management cluster (HyperShift). func createAWSClientEC2(ctx context.Context) (*ec2.Client, error) { - cfg, err := config.LoadDefaultConfig(ctx) + cfg, err := loadAWSConfig(ctx) if err != nil { - return nil, fmt.Errorf("unable to load AWS config: %v", err) + return nil, err } - return ec2.NewFromConfig(cfg), nil + return ec2.NewFromConfig(cfg, func(o *ec2.Options) { + if cfg.Region != "" { + o.BaseEndpoint = aws.String(fmt.Sprintf("https://ec2.%s.amazonaws.com", cfg.Region)) + } + }), nil } // getAWSSecurityGroup retrieves a security group by ID using the AWS EC2 client. diff --git a/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go b/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go index 796e2c3fc..c0549414c 100644 --- a/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go +++ b/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go @@ -77,8 +77,9 @@ var _ = Describe(fmt.Sprintf("%s NLB [OCPFeatureGate:%s]", e2eTestPrefixLoadBala // - The test must skip if the feature gate is not enabled It("should have NLBSecurityGroupMode with 'Managed value in cloud-config", func(ctx context.Context) { isNLBFeatureEnabled(ctx) + common.SkipIfManagementClusterTestsDisabled() - By("getting cloud-config ConfigMap from openshift-cloud-controller-manager namespace") + By("getting cloud-config ConfigMap") cm, err := common.GetCloudConfig(ctx, cs) framework.ExpectNoError(err, "failed to get cloud-config ConfigMap") diff --git a/openshift-tests/ccm-aws-tests/e2e/common/helper.go b/openshift-tests/ccm-aws-tests/e2e/common/helper.go index 2f285ab4c..9b694ad1f 100644 --- a/openshift-tests/ccm-aws-tests/e2e/common/helper.go +++ b/openshift-tests/ccm-aws-tests/e2e/common/helper.go @@ -3,19 +3,31 @@ package common import ( "context" "fmt" + "os" "regexp" "strings" + "github.com/onsi/ginkgo/v2" + configv1 "github.com/openshift/api/config/v1" configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" "k8s.io/kubernetes/test/e2e/framework" ) const ( cloudConfigNamespace = "openshift-cloud-controller-manager" cloudConfigName = "cloud-conf" + + // HyperShift uses different ConfigMap naming for the CCM cloud config. + hcpCloudConfigName = "aws-cloud-config" + + // EnvSkipManagementClusterTests when set to "true" skips tests that + // require a kubeconfig for the management cluster (e.g. reading the + // CCM cloud-config from a HyperShift hosted control plane). + EnvSkipManagementClusterTests = "SKIP_MANAGEMENT_CLUSTER_TESTS" ) // GetOcClient returns an OpenShift config/v1 API client (FeatureGates, Infrastructures, etc.). @@ -100,12 +112,85 @@ func IsFeatureEnabled(ctx context.Context, featureName string) (bool, error) { return false, nil } -// GetCloudConfig retrieves the CCM cloud-config ConfigMap. +// SkipIfManagementClusterTestsDisabled skips the current test when +// SKIP_MANAGEMENT_CLUSTER_TESTS=true. Call this at the beginning of any +// test that requires access to the management cluster kubeconfig. +// This is useful to provide flexibility on Hypershift jobs that don't want to always +// runs that checks this flag, forcing to skip any matching. +func SkipIfManagementClusterTestsDisabled() { + if os.Getenv(EnvSkipManagementClusterTests) == "true" { + ginkgo.Skip("Skipping: test requires management cluster access and SKIP_MANAGEMENT_CLUSTER_TESTS=true") + } +} + +// IsExternalTopology checks if the cluster has an external control plane topology +// (e.g., HyperShift hosted clusters) by querying the Infrastructure resource. +func IsExternalTopology(ctx context.Context) (bool, error) { + oc, err := GetOcClient(ctx) + if err != nil { + return false, fmt.Errorf("failed to create config client: %v", err) + } + + infra, err := oc.Infrastructures().Get(ctx, "cluster", metav1.GetOptions{}) + if err != nil { + return false, fmt.Errorf("failed to get Infrastructure 'cluster': %v", err) + } + + isExternal := infra.Status.ControlPlaneTopology == configv1.ExternalTopologyMode + // framework.Logf("Cluster control plane topology: %s (external: %v)", infra.Status.ControlPlaneTopology, isExternal) + return isExternal, nil +} + +// getHCPCloudConfig retrieves the CCM cloud config from a HyperShift hosted +// control plane. It reads the management cluster kubeconfig and HCP namespace +// from environment variables set by the CI step, then fetches the +// aws-cloud-config ConfigMap from the HCP namespace. +func getHCPCloudConfig(ctx context.Context) (*v1.ConfigMap, error) { + mgmtKubeconfig := os.Getenv("HYPERSHIFT_MANAGEMENT_CLUSTER_KUBECONFIG") + if len(mgmtKubeconfig) == 0 { + return nil, fmt.Errorf("HYPERSHIFT_MANAGEMENT_CLUSTER_KUBECONFIG must be set for HyperShift topology") + } + + hcpNamespace := os.Getenv("HYPERSHIFT_MANAGEMENT_CLUSTER_NAMESPACE") + if len(hcpNamespace) == 0 { + return nil, fmt.Errorf("HYPERSHIFT_MANAGEMENT_CLUSTER_NAMESPACE must be set for HyperShift topology") + } + + framework.Logf("Using management cluster kubeconfig=%s, HCP namespace=%s", mgmtKubeconfig, hcpNamespace) + + restConfig, err := clientcmd.BuildConfigFromFlags("", mgmtKubeconfig) + if err != nil { + return nil, fmt.Errorf("failed to load management cluster kubeconfig") + } + + mgmtClient, err := clientset.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("failed to create management cluster client") + } + + cm, err := mgmtClient.CoreV1().ConfigMaps(hcpNamespace).Get(ctx, hcpCloudConfigName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to get HCP cloud-config ConfigMap %s/%s", hcpNamespace, hcpCloudConfigName) + } + + framework.Logf("Successfully retrieved HCP cloud-config from %s/%s", hcpNamespace, hcpCloudConfigName) + return cm, nil +} + +// GetCloudConfig retrieves the CCM cloud-config ConfigMap, choosing the right +// source based on cluster topology (HyperShift HCP vs standalone). // When cs is nil, a clientset is created from the current kubeconfig. // This function must not call Ginkgo control-flow helpers (Skip, Fail, etc.) // because it is also called from main.go outside a spec context. func GetCloudConfig(ctx context.Context, cs clientset.Interface) (*v1.ConfigMap, error) { - var err error + isExternal, err := IsExternalTopology(ctx) + if err != nil { + return nil, fmt.Errorf("failed to detect cluster topology: %w", err) + } + + if isExternal { + return getHCPCloudConfig(ctx) + } if cs == nil { cs, err = GetKubeClient(ctx) if err != nil { diff --git a/openshift-tests/ccm-aws-tests/go.mod b/openshift-tests/ccm-aws-tests/go.mod index c9a49c2f0..bc14fcc74 100644 --- a/openshift-tests/ccm-aws-tests/go.mod +++ b/openshift-tests/ccm-aws-tests/go.mod @@ -10,6 +10,7 @@ require ( github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 github.com/openshift-eng/openshift-tests-extension v0.0.0-20250916161632-d81c09058835 + github.com/openshift/api v0.0.0-20260429122012-1180c0f5c3e9 github.com/openshift/client-go v0.0.0-20260429123927-c81f86abfa6a github.com/sirupsen/logrus v1.9.4 github.com/spf13/cobra v1.10.2 @@ -68,7 +69,6 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/openshift/api v0.0.0-20260429122012-1180c0f5c3e9 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.23.2 // indirect From 53b70cda9c043bea14e4826978a5e34ab39e969f Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Mon, 25 May 2026 15:22:38 -0300 Subject: [PATCH 2/2] fix(e2e/ote): improve error handling, wrapping and context Address chaibot feeback by updating error wrapping, as well improve const documentations of decision taking of using regional endpoints preventing VPC provided endpoints which does not work outside VPC - where the client/ote runs. --- .../ccm-aws-tests/e2e/aws/helper.go | 22 +++++++++++-------- .../ccm-aws-tests/e2e/aws/loadbalancer.go | 2 +- .../ccm-aws-tests/e2e/common/helper.go | 6 ++--- openshift-tests/ccm-aws-tests/main.go | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/openshift-tests/ccm-aws-tests/e2e/aws/helper.go b/openshift-tests/ccm-aws-tests/e2e/aws/helper.go index c0fec51bc..fdb9ccf7a 100644 --- a/openshift-tests/ccm-aws-tests/e2e/aws/helper.go +++ b/openshift-tests/ccm-aws-tests/e2e/aws/helper.go @@ -34,7 +34,7 @@ var awsRegionPattern = regexp.MustCompile(`^[a-z]{2,4}(?:-[a-z0-9]+)+-\d+$`) func loadAWSConfig(ctx context.Context) (aws.Config, error) { cfg, err := config.LoadDefaultConfig(ctx) if err != nil { - return aws.Config{}, fmt.Errorf("unable to load AWS config: %v", err) + return aws.Config{}, fmt.Errorf("unable to load AWS config: %w", err) } // This validation is required to prevent landing wrong hypershift region configurations @@ -42,7 +42,7 @@ func loadAWSConfig(ctx context.Context) (aws.Config, error) { if !awsRegionPattern.MatchString(cfg.Region) { region, err := getRegionFromInfrastructure(ctx) if err != nil { - return aws.Config{}, fmt.Errorf("AWS region %q is not valid and failed to get region from Infrastructure: %v", cfg.Region, err) + return aws.Config{}, fmt.Errorf("AWS region %q is not valid and failed to get region from Infrastructure: %w", cfg.Region, err) } framework.Logf("AWS SDK region %q is not valid, using region from Infrastructure: %s", cfg.Region, region) cfg.Region = region @@ -57,11 +57,11 @@ func loadAWSConfig(ctx context.Context) (aws.Config, error) { func getRegionFromInfrastructure(ctx context.Context) (string, error) { oc, err := common.GetOcClient(ctx) if err != nil { - return "", fmt.Errorf("failed to create config client: %v", err) + return "", fmt.Errorf("failed to create config client: %w", err) } infra, err := oc.Infrastructures().Get(ctx, "cluster", metav1.GetOptions{}) if err != nil { - return "", fmt.Errorf("failed to get Infrastructure: %v", err) + return "", fmt.Errorf("failed to get Infrastructure: %w", err) } if infra.Status.PlatformStatus == nil || infra.Status.PlatformStatus.AWS == nil { return "", fmt.Errorf("Infrastructure platformStatus.aws is nil") @@ -89,6 +89,8 @@ func createAWSClientLoadBalancer(ctx context.Context) (*elbv2.Client, error) { return elbv2.NewFromConfig(cfg, func(o *elbv2.Options) { o.Retryer = customRetryer + // Use regional endpoints when region is defined preventing malformed or + // unreachable (from test binary, which usually runs outside cluster's VPC) endpoints. if cfg.Region != "" { o.BaseEndpoint = aws.String(fmt.Sprintf("https://elasticloadbalancing.%s.amazonaws.com", cfg.Region)) } @@ -105,7 +107,7 @@ func getAWSLoadBalancerFromDNSName(ctx context.Context, elbClient *elbv2.Client, for paginator.HasMorePages() { page, err := paginator.NextPage(ctx) if err != nil { - framework.Logf("transient error describing load balancers (will retry): %v", err) + framework.Logf("transient error describing load balancers (will retry): %w", err) return false, nil } @@ -122,7 +124,7 @@ func getAWSLoadBalancerFromDNSName(ctx context.Context, elbClient *elbv2.Client, }) if err != nil { - return nil, fmt.Errorf("failed to find load balancer with DNS name %s: %v", lbDNSName, err) + return nil, fmt.Errorf("failed to find load balancer with DNS name %s: %w", lbDNSName, err) } if foundLB == nil { @@ -139,7 +141,7 @@ func findAWSLoadBalancerByDNSName(ctx context.Context, elbClient *elbv2.Client, for paginator.HasMorePages() { page, err := paginator.NextPage(ctx) if err != nil { - return nil, fmt.Errorf("failed to describe load balancers: %v", err) + return nil, fmt.Errorf("failed to describe load balancers: %w", err) } framework.Logf("found %d load balancers in page", len(page.LoadBalancers)) @@ -167,6 +169,8 @@ func createAWSClientEC2(ctx context.Context) (*ec2.Client, error) { return nil, err } return ec2.NewFromConfig(cfg, func(o *ec2.Options) { + // Use regional endpoints when region is defined preventing malformed or + // unreachable (from test binary, which usually runs outside cluster's VPC) endpoints. if cfg.Region != "" { o.BaseEndpoint = aws.String(fmt.Sprintf("https://ec2.%s.amazonaws.com", cfg.Region)) } @@ -182,7 +186,7 @@ func getAWSSecurityGroup(ctx context.Context, ec2Client *ec2.Client, sgID string result, err := ec2Client.DescribeSecurityGroups(ctx, input) if err != nil { - return nil, fmt.Errorf("failed to describe security group %s: %v", sgID, err) + return nil, fmt.Errorf("failed to describe security group %s: %w", sgID, err) } if len(result.SecurityGroups) == 0 { @@ -220,7 +224,7 @@ func securityGroupExists(ctx context.Context, ec2Client *ec2.Client, sgID string framework.Logf("security group %s does not exist", sgID) return false, nil } - return false, fmt.Errorf("failed to check security group %s: %v", sgID, err) + return false, fmt.Errorf("failed to check security group %s: %w", sgID, err) } framework.Logf("security group %s exists", sgID) diff --git a/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go b/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go index c0549414c..07b1e199a 100644 --- a/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go +++ b/openshift-tests/ccm-aws-tests/e2e/aws/loadbalancer.go @@ -168,7 +168,7 @@ var _ = Describe(fmt.Sprintf("%s NLB [OCPFeatureGate:%s]", e2eTestPrefixLoadBala err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { s, err := cs.CoreV1().Services(ingressNamespace).Get(ctx, ingressServiceName, metav1.GetOptions{}) if err != nil { - framework.Logf("Failed to get service %s/%s: %v", ingressNamespace, ingressServiceName, err) + framework.Logf("Failed to get service %s/%s: %w", ingressNamespace, ingressServiceName, err) return false, nil } svc = s diff --git a/openshift-tests/ccm-aws-tests/e2e/common/helper.go b/openshift-tests/ccm-aws-tests/e2e/common/helper.go index 9b694ad1f..d2eb1bb45 100644 --- a/openshift-tests/ccm-aws-tests/e2e/common/helper.go +++ b/openshift-tests/ccm-aws-tests/e2e/common/helper.go @@ -128,16 +128,16 @@ func SkipIfManagementClusterTestsDisabled() { func IsExternalTopology(ctx context.Context) (bool, error) { oc, err := GetOcClient(ctx) if err != nil { - return false, fmt.Errorf("failed to create config client: %v", err) + return false, fmt.Errorf("failed to create config client: %w", err) } infra, err := oc.Infrastructures().Get(ctx, "cluster", metav1.GetOptions{}) if err != nil { - return false, fmt.Errorf("failed to get Infrastructure 'cluster': %v", err) + return false, fmt.Errorf("failed to get Infrastructure 'cluster': %w", err) } isExternal := infra.Status.ControlPlaneTopology == configv1.ExternalTopologyMode - // framework.Logf("Cluster control plane topology: %s (external: %v)", infra.Status.ControlPlaneTopology, isExternal) + framework.Logf("Cluster control plane topology: %s (external: %v)", infra.Status.ControlPlaneTopology, isExternal) return isExternal, nil } diff --git a/openshift-tests/ccm-aws-tests/main.go b/openshift-tests/ccm-aws-tests/main.go index e817f17e8..caf442aac 100644 --- a/openshift-tests/ccm-aws-tests/main.go +++ b/openshift-tests/ccm-aws-tests/main.go @@ -212,7 +212,7 @@ func initFrameworkForTests() error { func initFrameworkForTest() error { if ad := os.Getenv("ARTIFACT_DIR"); len(strings.TrimSpace(ad)) == 0 { if err := os.Setenv("ARTIFACT_DIR", filepath.Join(os.TempDir(), "artifacts")); err != nil { - return fmt.Errorf("unable to set ARTIFACT_DIR: %v", err) + return fmt.Errorf("unable to set ARTIFACT_DIR: %w", err) } }