Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions internal/app/machined/pkg/controllers/secrets/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ func (ctrl *APIController) reconcile(ctx context.Context, r controller.Runtime,
return err
}
} else {
if err := ctrl.generateWorker(ctx, r, logger, rootSpec, endpointsStr, certSANs); err != nil {
if err := ctrl.generateWorker(ctx, r, logger, rootSpec, endpointsStr, "", certSANs); err != nil {
return err
}
}
Expand Down Expand Up @@ -336,9 +336,11 @@ func (ctrl *APIController) generateControlPlane(ctx context.Context, r controlle
}

func (ctrl *APIController) generateWorker(ctx context.Context, r controller.Runtime, logger *zap.Logger,
rootSpec *secrets.OSRootSpec, endpointsStr []string, certSANs *secrets.CertSANSpec,
rootSpec *secrets.OSRootSpec, endpointsStr []string, endpointHost string, certSANs *secrets.CertSANSpec,
) error {
remoteGen, err := gen.NewRemoteGenerator(rootSpec.Token, endpointsStr, rootSpec.AcceptedCAs)
logger.Debug("Initializing CSR generator", zap.Strings("endpoints", endpointsStr), zap.String("host", endpointHost))

remoteGen, err := gen.NewRemoteGenerator(rootSpec.Token, endpointsStr, endpointHost, rootSpec.AcceptedCAs)
if err != nil {
return fmt.Errorf("failed creating trustd client: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/grpc/gen/remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type RemoteGenerator struct {
}

// NewRemoteGenerator initializes a RemoteGenerator with a preconfigured grpc.ClientConn.
func NewRemoteGenerator(token string, endpoints []string, acceptedCAs []*x509.PEMEncodedCertificate) (g *RemoteGenerator, err error) {
func NewRemoteGenerator(token string, endpoints []string, host string, acceptedCAs []*x509.PEMEncodedCertificate) (g *RemoteGenerator, err error) {
if len(endpoints) == 0 {
return nil, errors.New("at least one root of trust endpoint is required")
}
Expand All @@ -42,7 +42,7 @@ func NewRemoteGenerator(token string, endpoints []string, acceptedCAs []*x509.PE

remoteGeneratorPprof.Add(g, 1)

conn, err := basic.NewConnection(fmt.Sprintf("%s:///%s", resolver.RoundRobinResolverScheme, strings.Join(endpoints, ",")), basic.NewTokenCredentials(token), acceptedCAs)
conn, err := basic.NewConnection(fmt.Sprintf("%s:///%s", resolver.RoundRobinResolverScheme, strings.Join(endpoints, ",")), host, basic.NewTokenCredentials(token), acceptedCAs)
if err != nil {
return nil, err
}
Expand Down
36 changes: 35 additions & 1 deletion pkg/grpc/middleware/auth/basic/basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"bytes"
"crypto/tls"
stdx509 "crypto/x509"
"net"

"github.com/siderolabs/crypto/x509"
"github.com/siderolabs/gen/xslices"
Expand All @@ -16,6 +17,7 @@ import (

"github.com/siderolabs/talos/pkg/httpdefaults"
"github.com/siderolabs/talos/pkg/machinery/client/dialer"
"github.com/siderolabs/talos/pkg/machinery/labels"
)

// Credentials describes an authorization method.
Expand All @@ -27,7 +29,7 @@ type Credentials interface {

// NewConnection initializes a grpc.ClientConn configured for basic
// authentication.
func NewConnection(address string, creds credentials.PerRPCCredentials, acceptedCAs []*x509.PEMEncodedCertificate) (conn *grpc.ClientConn, err error) {
func NewConnection(address string, host string, creds credentials.PerRPCCredentials, acceptedCAs []*x509.PEMEncodedCertificate) (conn *grpc.ClientConn, err error) {
tlsConfig := &tls.Config{}

tlsConfig.RootCAs = stdx509.NewCertPool()
Expand All @@ -42,6 +44,7 @@ func NewConnection(address string, creds credentials.PerRPCCredentials, accepted
))

grpcOpts := []grpc.DialOption{
grpc.WithAuthority(ParseAuthority(host)),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this wouldn't work, as we have a list of endpoints, and pass a single authority with it.

Either the round-robin should return a proper authority, or something else should be going on here.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@smira is there any difference in the way Talos populates the value for the address parameter for the NewConnection method depending on the number of endpoints that would make this fail if there is more than one endpoint to balance the load with?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, reading through the code of the gRPC client, and looking at our code, we are setting the ServerName correctly in our resolver, so it should be propagated as intended:

Adding the WithAuthority here will override that - and this is to be done by the user. We need a new configuration (new document? field in existing document?) that will allow setting that field (?).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try to describe the setup I've successfully tested this with.
I have an existing "admin" cluster made up of Talos nodes.

Within that cluster I have deployed :

  • a Gateway API (Traefik) with a NodePort service listening on ports 32443 and 32501
  • the Kamaji controller
  • a Kamaji TenantControlPlane named tenant-one with talos-csr-signer as an extra container (as described here)
  • gateway and routes to redirect requests for host api.tenant-one.ingress.example.com on both ports to either the Kamaji control plane or CSR signer for that tenant

I also have :

  • a reserved public IP
  • a wildcard DNS entry for *.ingress.example.com pointing to that IP
  • a load balancer attached to that IP, listening on ports 443 and 50001 and forwarding requests to the admin cluster nodes on the nodeports mentioned previously

I'm now trying to add Talos nodes to that tenant-one Kamaji control-plane with the following cluster configuration :

cluster:
  clusterName: tenant-one
  controlPlane:
    endpoint: https://api.tenant-one.ingress.example.com:443

With Talos v1.10.9 (without the proposed changes), when the node is initializing it sends a request to the resolved IP for api.tenant-one.ingress.example.com on port 50001. As that request doesn't include the configured host name, the Gateway API controller doesn't present the right certificate and Talos can't validate the CA.

With a custom Talos image with the hard-coded string api.tenant-one.ingress.example.com as authority, the node successfully gets its certificate from the CSR signer.

Even if I haven't tested this setup with more than one IP resolving the host or with Talos control-plane nodes (and not Kamaji's CSR signer), I don't see why it would not work.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide a reproducer - minimal set of manifests that we can test this with? This might be easier than I initially thought.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's great that it works, but the problem with this change is that we have multiple addresses passed down to the gRPC client, and it iterates over them, while we set a single authority. I haven't looked into the code, but I think authority should come from the selected endpoint (not sure if it's possible with gRPC even), or this WithAuthority should be a special case when there is just a single endpoint to talk to.

As it reproduces (with pods in a K8S cluster) a regular setup with 3 control-plane nodes and a bunch of worker Nodes all running Talos images, that test actually also shows that it works even if we pass multiple addresses.

I had to run the test again as the pod logs had expired since my initial test, but here are the logs from one of the worker nodes :

[talos] 2026/01/14 15:43:04 Initializing CSR generator {"component": "controller-runtime", "controller": "secrets.APIController", "endpoints": ["10.244.1.217", "10.244.3.6", "10.244.3.7", "9XX.XXX.XXX.6"], "host": "talos-test.ingress.XXXX.net"}
[talos] 2026/01/14 15:43:04 sending CSR {"component": "controller-runtime", "controller": "secrets.APIController", "endpoints": ["10.244.1.217", "10.244.3.6", "10.244.3.7", "9XX.XXX.XXX.6"]}

You can clearly see the control-plane pod IPs and the external LB IP (resolved from the provided cluster endpoint in the Talos config) being passed as endpoints to the gRPC client.

I'm sorry but I don't see how this would be any different with VM or BM nodes.

Copy link
Copy Markdown
Contributor

@smasset-orange smasset-orange Jan 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only restriction for it to work might be that all control-plane nodes need to include the host used as gRPC authority in the machine certSANs

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only restriction for it to work might be that all control-plane nodes need to include the host used as gRPC authority in the machine certSANs

Yes, exactly, and this is not the case for "bare" Talos, this is only the case for your setup.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only restriction for it to work might be that all control-plane nodes need to include the host used as gRPC authority in the machine certSANs

Yes, exactly, and this is not the case for "bare" Talos, this is only the case for your setup.

I'm afraid this is not just my setup : it was built following Talos documentation guidelines :

When using a TCP loadbalancer, make sure the loadbalancer endpoint is included in the .machine.certSANs list in the machine configuration.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This talks about API access, that is apid, not trustd, and this has nothing to do with what I posted above.

Talos workers connect to trustds using direct connection by IP - that's the primary path.

grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)),
grpc.WithPerRPCCredentials(creds),
grpc.WithSharedWriteBuffer(true),
Expand All @@ -55,3 +58,34 @@ func NewConnection(address string, creds credentials.PerRPCCredentials, accepted

return conn, nil
}

// ParseAuthority derives a value usable as a gRPC authority from host.
//
// An optional ":port" suffix is dropped first. The remaining name is returned
// only when it is not an IP address and labels.ValidateDNS1123Subdomain
// accepts it; for an empty input or any other value the result is "".
func ParseAuthority(host string) string {
	if host == "" {
		return ""
	}

	// Fall back to the raw input when it does not have a host:port form.
	name := host
	if hostOnly, _, splitErr := net.SplitHostPort(host); splitErr == nil {
		name = hostOnly
	}

	switch {
	case net.ParseIP(name) != nil:
		// An IP address should not be used as an authority.
		return ""
	case labels.ValidateDNS1123Subdomain(name) != nil:
		return ""
	default:
		return name
	}
}
32 changes: 32 additions & 0 deletions pkg/grpc/middleware/auth/basic/basic_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package basic_test

import (
"testing"

"github.com/siderolabs/talos/pkg/grpc/middleware/auth/basic"
"github.com/stretchr/testify/assert"
)

func TestParseAuthority(t *testing.T) {
for _, tc := range []struct {
host string
want string
}{
{"", ""},
{"::1", ""},
{"[::1]", ""},
{"[::1]:443", ""},
{"127.0.0.1", ""},
{"127.0.0.1:443", ""},
{"[example.com]", ""},
{"example.com", "example.com"},
{"example.com:443", "example.com"},
{"[example.com]:443", "example.com"},
} {
assert.Equalf(t, tc.want, basic.ParseAuthority(tc.host), "ParseAuthority(%q)", tc.host)
}
}
14 changes: 0 additions & 14 deletions pkg/grpc/middleware/auth/basic/username_and_password_test.go

This file was deleted.

Loading