Skip to content

Commit aa2b3cd

Browse files
grpc: Add noncebalancer that tracks non-READY backends
1 parent 84b88da commit aa2b3cd

10 files changed

Lines changed: 564 additions & 6 deletions

File tree

cmd/boulder-wfe2/main.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/letsencrypt/boulder/goodkey/sagoodkey"
2020
bgrpc "github.com/letsencrypt/boulder/grpc"
2121
"github.com/letsencrypt/boulder/grpc/noncebalancer"
22+
"github.com/letsencrypt/boulder/grpc/noncebalancerv2"
2223
"github.com/letsencrypt/boulder/issuance"
2324
"github.com/letsencrypt/boulder/nonce"
2425
rapb "github.com/letsencrypt/boulder/ra/proto"
@@ -318,9 +319,11 @@ func main() {
318319
cmd.FailOnError(err, "Failed to load credentials and create gRPC connection to get nonce service")
319320
gnc := nonce.NewGetter(getNonceConn)
320321

321-
if c.WFE.RedeemNonceService.SRVResolver != noncebalancer.SRVResolverScheme {
322+
if c.WFE.RedeemNonceService.SRVResolver != noncebalancer.SRVResolverScheme &&
323+
c.WFE.RedeemNonceService.SRVResolver != noncebalancerv2.SRVResolverScheme {
322324
cmd.Fail(fmt.Sprintf(
323-
"'redeemNonceService.SRVResolver' must be set to %q", noncebalancer.SRVResolverScheme),
325+
"'redeemNonceService.SRVResolver' must be set to %q or %q",
326+
noncebalancer.SRVResolverScheme, noncebalancerv2.SRVResolverScheme),
324327
)
325328
}
326329
redeemNonceConn, err := bgrpc.ClientSetup(c.WFE.RedeemNonceService, tlsConfig, stats, clk)

cmd/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ type GRPCClientConfig struct {
306306
// implementation of the SRV resolver should be used. The default is 'srv'
307307
// For more details, see the documentation in:
308308
// grpc/internal/resolver/dns/dns_resolver.go.
309-
SRVResolver string `validate:"excluded_with=ServerAddress,isdefault|oneof=srv nonce-srv"`
309+
SRVResolver string `validate:"excluded_with=ServerAddress,isdefault|oneof=srv nonce-srv nonce-srv-v2"`
310310

311311
// ServerAddress is a single <hostname|IPv4|[IPv6]>:<port> or `:<port>` that
312312
// the gRPC client will, if necessary, resolve via DNS and then connect to.

grpc/internal/resolver/dns/dns_resolver.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
"github.com/letsencrypt/boulder/bdns"
4141
"github.com/letsencrypt/boulder/grpc/internal/backoff"
4242
"github.com/letsencrypt/boulder/grpc/noncebalancer"
43+
"github.com/letsencrypt/boulder/grpc/noncebalancerv2"
4344
)
4445

4546
var logger = grpclog.Component("srv")
@@ -54,6 +55,7 @@ var (
5455
func init() {
5556
resolver.Register(NewDefaultSRVBuilder())
5657
resolver.Register(NewNonceSRVBuilder())
58+
resolver.Register(NewNonceSRVBuilderV2())
5759
}
5860

5961
const defaultDNSSvrPort = "53"
@@ -96,6 +98,12 @@ func NewNonceSRVBuilder() resolver.Builder {
9698
return &srvBuilder{scheme: noncebalancer.SRVResolverScheme, balancer: noncebalancer.Name}
9799
}
98100

101+
// NewNonceSRVBuilderV2 creates a srvBuilder which is used to factory SRV DNS
102+
// resolvers with the v2 nonce balancer used by nonce-service clients.
103+
func NewNonceSRVBuilderV2() resolver.Builder {
104+
return &srvBuilder{scheme: noncebalancerv2.SRVResolverScheme, balancer: noncebalancerv2.Name}
105+
}
106+
99107
type srvBuilder struct {
100108
scheme string
101109
balancer string

grpc/noncebalancerv2/balancer.go

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
package noncebalancerv2
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
7+
"google.golang.org/grpc/balancer"
8+
"google.golang.org/grpc/balancer/base"
9+
"google.golang.org/grpc/connectivity"
10+
"google.golang.org/grpc/grpclog"
11+
"google.golang.org/grpc/resolver"
12+
)
13+
14+
var logger = grpclog.Component("noncebalancerv2")
15+
16+
// nonceBalancer implements balancer.Balancer. It is a near-exact copy of
17+
// grpc/balancer/base/balancer.go's baseBalancer with one difference:
18+
// regeneratePicker passes ALL resolver-tracked SubConns to the picker, not just
19+
// READY ones. This allows the picker to distinguish "backend is temporarily
20+
// reconnecting" (queue the RPC via ErrNoSubConnAvailable) from "prefix is
21+
// genuinely unknown" (fail with ErrNoBackendsMatchPrefix).
22+
type nonceBalancer struct {
23+
cc balancer.ClientConn
24+
25+
csEvltr *balancer.ConnectivityStateEvaluator
26+
state connectivity.State
27+
28+
subConns *resolver.AddressMapV2[balancer.SubConn]
29+
scStates map[balancer.SubConn]connectivity.State
30+
picker balancer.Picker
31+
config base.Config
32+
33+
resolverErr error // the last error reported by the resolver; cleared on successful resolution
34+
connErr error // the last connection error; cleared upon leaving TransientFailure
35+
}
36+
37+
func (b *nonceBalancer) ResolverError(err error) {
38+
b.resolverErr = err
39+
if b.subConns.Len() == 0 {
40+
b.state = connectivity.TransientFailure
41+
}
42+
43+
if b.state != connectivity.TransientFailure {
44+
// The picker will not change since the balancer does not currently
45+
// report an error.
46+
return
47+
}
48+
b.regeneratePicker()
49+
b.cc.UpdateState(balancer.State{
50+
ConnectivityState: b.state,
51+
Picker: b.picker,
52+
})
53+
}
54+
55+
func (b *nonceBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
56+
// TODO: handle s.ResolverState.ServiceConfig?
57+
if logger.V(2) {
58+
logger.Info("noncebalancer: got new ClientConn state: ", s)
59+
}
60+
// Successful resolution; clear resolver error and ensure we return nil.
61+
b.resolverErr = nil
62+
// addrsSet is the set converted from addrs, it's used for quick lookup of an address.
63+
addrsSet := resolver.NewAddressMapV2[any]()
64+
for _, a := range s.ResolverState.Addresses {
65+
addrsSet.Set(a, nil)
66+
if _, ok := b.subConns.Get(a); !ok {
67+
// a is a new address (not existing in b.subConns).
68+
var sc balancer.SubConn
69+
opts := balancer.NewSubConnOptions{
70+
HealthCheckEnabled: b.config.HealthCheck,
71+
StateListener: func(scs balancer.SubConnState) { b.updateSubConnState(sc, scs) },
72+
}
73+
sc, err := b.cc.NewSubConn([]resolver.Address{a}, opts)
74+
if err != nil {
75+
logger.Warningf("noncebalancer: failed to create new SubConn: %v", err)
76+
continue
77+
}
78+
b.subConns.Set(a, sc)
79+
b.scStates[sc] = connectivity.Idle
80+
b.csEvltr.RecordTransition(connectivity.Shutdown, connectivity.Idle)
81+
sc.Connect()
82+
}
83+
}
84+
for _, a := range b.subConns.Keys() {
85+
sc, _ := b.subConns.Get(a)
86+
// a was removed by resolver.
87+
if _, ok := addrsSet.Get(a); !ok {
88+
sc.Shutdown()
89+
b.subConns.Delete(a)
90+
// Keep the state of this sc in b.scStates until sc's state becomes Shutdown.
91+
// The entry will be deleted in updateSubConnState.
92+
}
93+
}
94+
// If resolver state contains no addresses, return an error so ClientConn
95+
// will trigger re-resolve. Also records this as a resolver error, so when
96+
// the overall state turns transient failure, the error message will have
97+
// the zero address information.
98+
if len(s.ResolverState.Addresses) == 0 {
99+
b.ResolverError(errors.New("produced zero addresses"))
100+
return balancer.ErrBadResolverState
101+
}
102+
103+
b.regeneratePicker()
104+
b.cc.UpdateState(balancer.State{ConnectivityState: b.state, Picker: b.picker})
105+
return nil
106+
}
107+
108+
// mergeErrors builds an error from the last connection error and the last
109+
// resolver error. Must only be called if b.state is TransientFailure.
110+
func (b *nonceBalancer) mergeErrors() error {
111+
// connErr must always be non-nil unless there are no SubConns, in which
112+
// case resolverErr must be non-nil.
113+
if b.connErr == nil {
114+
return fmt.Errorf("last resolver error: %v", b.resolverErr)
115+
}
116+
if b.resolverErr == nil {
117+
return fmt.Errorf("last connection error: %v", b.connErr)
118+
}
119+
return fmt.Errorf("last connection error: %v; last resolver error: %v", b.connErr, b.resolverErr)
120+
}
121+
122+
// regeneratePicker takes a snapshot of the balancer, and generates a picker
123+
// from it. The picker is
124+
// - errPicker if the balancer is in TransientFailure,
125+
// - a nonce picker with all READY SubConns and all known SubConns otherwise.
126+
//
127+
// This is the only method that differs from baseBalancer: it builds both a
128+
// READY set and a not-READY set from b.subConns. baseBalancer only builds the
129+
// READY set.
130+
func (b *nonceBalancer) regeneratePicker() {
131+
if b.state == connectivity.TransientFailure {
132+
b.picker = base.NewErrPicker(b.mergeErrors())
133+
return
134+
}
135+
readySCs := make(map[balancer.SubConn]resolver.Address)
136+
notReadySCs := make(map[balancer.SubConn]resolver.Address)
137+
138+
for _, addr := range b.subConns.Keys() {
139+
sc, _ := b.subConns.Get(addr)
140+
if st, ok := b.scStates[sc]; ok && st == connectivity.Ready {
141+
readySCs[sc] = addr
142+
} else {
143+
notReadySCs[sc] = addr
144+
}
145+
}
146+
b.picker = &picker{
147+
readyBackends: readySCs,
148+
notReadyBackends: notReadySCs,
149+
}
150+
}
151+
152+
// UpdateSubConnState is a nop because a StateListener is always set in NewSubConn.
153+
func (b *nonceBalancer) UpdateSubConnState(sc balancer.SubConn, state balancer.SubConnState) {
154+
logger.Errorf("noncebalancer: UpdateSubConnState(%v, %+v) called unexpectedly", sc, state)
155+
}
156+
157+
func (b *nonceBalancer) updateSubConnState(sc balancer.SubConn, state balancer.SubConnState) {
158+
s := state.ConnectivityState
159+
if logger.V(2) {
160+
logger.Infof("noncebalancer: handle SubConn state change: %p, %v", sc, s)
161+
}
162+
oldS, ok := b.scStates[sc]
163+
if !ok {
164+
if logger.V(2) {
165+
logger.Infof("noncebalancer: got state changes for an unknown SubConn: %p, %v", sc, s)
166+
}
167+
return
168+
}
169+
if oldS == connectivity.TransientFailure &&
170+
(s == connectivity.Connecting || s == connectivity.Idle) {
171+
// Once a subconn enters TRANSIENT_FAILURE, ignore subsequent IDLE or
172+
// CONNECTING transitions to prevent the aggregated state from being
173+
// always CONNECTING when many backends exist but are all down.
174+
if s == connectivity.Idle {
175+
sc.Connect()
176+
}
177+
return
178+
}
179+
b.scStates[sc] = s
180+
switch s {
181+
case connectivity.Idle:
182+
sc.Connect()
183+
case connectivity.Shutdown:
184+
// When an address was removed by resolver, b called Shutdown but kept
185+
// the sc's state in scStates. Remove state for this sc here.
186+
delete(b.scStates, sc)
187+
case connectivity.TransientFailure:
188+
// Save error to be reported via picker.
189+
b.connErr = state.ConnectionError
190+
}
191+
192+
b.state = b.csEvltr.RecordTransition(oldS, s)
193+
194+
// Regenerate picker when one of the following happens:
195+
// - this sc entered or left ready
196+
// - the aggregated state of balancer is TransientFailure
197+
// (may need to update error message)
198+
if (s == connectivity.Ready) != (oldS == connectivity.Ready) ||
199+
b.state == connectivity.TransientFailure {
200+
b.regeneratePicker()
201+
}
202+
b.cc.UpdateState(balancer.State{ConnectivityState: b.state, Picker: b.picker})
203+
}
204+
205+
// Close is a nop because base balancer doesn't have internal state to clean up,
206+
// and it doesn't need to call Shutdown for the SubConns.
207+
func (b *nonceBalancer) Close() {
208+
}
209+
210+
// ExitIdle is a nop because the base balancer attempts to stay connected to
211+
// all SubConns at all times.
212+
func (b *nonceBalancer) ExitIdle() {
213+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package noncebalancerv2
2+
3+
import (
4+
"google.golang.org/grpc/balancer"
5+
"google.golang.org/grpc/balancer/base"
6+
"google.golang.org/grpc/connectivity"
7+
"google.golang.org/grpc/resolver"
8+
)
9+
10+
const (
11+
// Name is the name used to register the nonce balancer with the gRPC
12+
// runtime.
13+
Name = "noncev2"
14+
15+
// SRVResolverScheme is the scheme used to invoke an instance of the SRV
16+
// resolver which will use the noncebalancer to pick backends. It would be
17+
// ideal to export this from the SRV resolver package but that package is
18+
// internal.
19+
SRVResolverScheme = "nonce-srv-v2"
20+
)
21+
22+
type builder struct {
23+
name string
24+
config base.Config
25+
}
26+
27+
// NewBalancerBuilder returns a nonce balancer builder configured by the
28+
// provided config.
29+
func NewBalancerBuilder(name string, config base.Config) balancer.Builder {
30+
return &builder{
31+
name: name,
32+
config: config,
33+
}
34+
}
35+
36+
func (bb *builder) Build(cc balancer.ClientConn, _ balancer.BuildOptions) balancer.Balancer {
37+
bal := &nonceBalancer{
38+
cc: cc,
39+
40+
subConns: resolver.NewAddressMapV2[balancer.SubConn](),
41+
scStates: make(map[balancer.SubConn]connectivity.State),
42+
csEvltr: &balancer.ConnectivityStateEvaluator{},
43+
config: bb.config,
44+
state: connectivity.Connecting,
45+
}
46+
// Initialize picker to a picker that always returns
47+
// ErrNoSubConnAvailable, because when state of a SubConn changes, we
48+
// may call UpdateState with this picker.
49+
bal.picker = base.NewErrPicker(balancer.ErrNoSubConnAvailable)
50+
return bal
51+
}
52+
53+
func (bb *builder) Name() string {
54+
return bb.name
55+
}
56+
57+
func init() {
58+
balancer.Register(NewBalancerBuilder(Name, base.Config{}))
59+
}

0 commit comments

Comments
 (0)