Skip to content

Commit bb64998

Browse files
authored
feat(bigtable): add load balancing penalty for channel (#14149)
1 parent 704a21f commit bb64998

File tree

2 files changed

+124
-1
lines changed

2 files changed

+124
-1
lines changed

bigtable/internal/transport/connpool.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ import (
3535
"go.opentelemetry.io/otel/attribute"
3636
"go.opentelemetry.io/otel/metric"
3737
gtransport "google.golang.org/api/transport/grpc"
38+
"google.golang.org/grpc/codes"
3839
"google.golang.org/grpc/credentials/alts"
3940
"google.golang.org/grpc/metadata"
4041
"google.golang.org/grpc/peer"
42+
"google.golang.org/grpc/status"
4143

4244
btopt "cloud.google.com/go/bigtable/internal/option"
4345

@@ -48,6 +50,8 @@ import (
4850
// We cap the max draining timeout to 30mins as there might be a long running stream (such as full table scan).
4951
var maxDrainingTimeout = 30 * time.Minute
5052

53+
const artificialLoadIfError = 10
54+
const artificialLoadPenalizedTimer = 5 * time.Second
5155
const requestParamsHeader = "x-goog-request-params"
5256

5357
// ipProtocol represents the type of IP protocol used.
@@ -250,6 +254,7 @@ type connEntry struct {
250254
streamingLoad atomic.Int32 // In-flight streaming requests
251255
errorCount atomic.Int64 // Errors since the last metric report
252256
drainingState atomic.Bool // True if the connection is being gracefully drained.
257+
penaltyExpiry atomic.Int64 // penaltyExpiry stores the UnixNano timestamp of when the penalty ends
253258

254259
}
255260

@@ -271,6 +276,25 @@ func (e *connEntry) createdAt() int64 {
271276
return e.conn.creationTime()
272277
}
273278

279+
// applyErrorPenalty checks if the error warrants a load balancing penalty,
280+
// and if so, sets an expiration time for the artificial load.
281+
func (e *connEntry) applyErrorPenalty(err error) {
282+
if err == nil {
283+
return
284+
}
285+
286+
code := status.Code(err)
287+
288+
// Penalize errors that typically indicate target-specific health or capacity issues.
289+
if code == codes.Unavailable ||
290+
code == codes.ResourceExhausted ||
291+
code == codes.Internal {
292+
// A simple Store is safe here; concurrent updates is fine here.
293+
newExpiry := time.Now().Add(artificialLoadPenalizedTimer).UnixNano()
294+
e.penaltyExpiry.Store(newExpiry)
295+
}
296+
}
297+
274298
// isDraining atomically checks if the connection is in the draining state.
275299
func (e *connEntry) isDraining() bool {
276300
return e.drainingState.Load()
@@ -313,7 +337,18 @@ func (p *BigtableChannelPool) waitForDrainAndClose(entry *connEntry) {
313337
func (e *connEntry) calculateConnLoad() int32 {
314338
unary := e.unaryLoad.Load()
315339
streaming := e.streamingLoad.Load()
316-
return unary + streaming
340+
load := unary + streaming
341+
342+
expiry := e.penaltyExpiry.Load()
343+
if expiry > 0 {
344+
if time.Now().UnixNano() < expiry {
345+
load += artificialLoadIfError // Apply the artificial penalty weight
346+
} else {
347+
// restore to zero
348+
e.penaltyExpiry.CompareAndSwap(expiry, 0)
349+
}
350+
}
351+
return load
317352
}
318353

319354
// BigtableChannelPool implements ConnPool and routes requests to the connection
@@ -673,6 +708,7 @@ func (p *BigtableChannelPool) Invoke(ctx context.Context, method string, args in
673708
err = entry.conn.Invoke(ctx, method, args, reply, opts...)
674709
if err != nil {
675710
entry.errorCount.Add(1)
711+
entry.applyErrorPenalty(err) // Apply penalty on error
676712
}
677713
return err
678714

@@ -823,6 +859,7 @@ func (s *refCountedStream) SendMsg(m interface{}) error {
823859
err := s.ClientStream.SendMsg(m)
824860
if err != nil {
825861
s.entry.errorCount.Add(1)
862+
s.entry.applyErrorPenalty(err)
826863
s.decrementLoad()
827864
}
828865
return err
@@ -835,6 +872,7 @@ func (s *refCountedStream) RecvMsg(m interface{}) error {
835872
// io.EOF is a normal stream termination, not an error to be counted.
836873
if !errors.Is(err, io.EOF) {
837874
s.entry.errorCount.Add(1)
875+
s.entry.applyErrorPenalty(err)
838876
}
839877
s.decrementLoad()
840878
}

bigtable/internal/transport/connpool_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,53 @@ func TestNewBigtableChannelPool(t *testing.T) {
689689
})
690690
}
691691

692+
func TestConnEntryCalculateConnLoadWithPenalty(t *testing.T) {
693+
entry := &connEntry{}
694+
695+
// 5 load
696+
entry.unaryLoad.Store(2)
697+
entry.streamingLoad.Store(3)
698+
699+
t.Run("NoPenalty", func(t *testing.T) {
700+
entry.penaltyExpiry.Store(0)
701+
if load := entry.calculateConnLoad(); load != 5 {
702+
t.Errorf("Expected load of 5, got %d", load)
703+
}
704+
})
705+
706+
t.Run("ActivePenalty", func(t *testing.T) {
707+
// Set expiration 5 seconds into the future
708+
future := time.Now().Add(5 * time.Second).UnixNano()
709+
entry.penaltyExpiry.Store(future)
710+
711+
// Expected: 5 (base) + 10 (penalty) = 15
712+
if load := entry.calculateConnLoad(); load != 15 {
713+
t.Errorf("Expected load of 15 due to active penalty, got %d", load)
714+
}
715+
716+
// Verify the timestamp wasn't accidentally cleared
717+
if entry.penaltyExpiry.Load() == 0 {
718+
t.Errorf("Active penalty timestamp was incorrectly cleared")
719+
}
720+
})
721+
722+
t.Run("ExpiredPenaltySelfCleans", func(t *testing.T) {
723+
// Set expiration 1 second in the past
724+
past := time.Now().Add(-1 * time.Second).UnixNano()
725+
entry.penaltyExpiry.Store(past)
726+
727+
// Expected: 5 (penalty is ignored because it expired)
728+
if load := entry.calculateConnLoad(); load != 5 {
729+
t.Errorf("Expected load of 5 due to expired penalty, got %d", load)
730+
}
731+
732+
// Verify the lazy-evaluation reset the timestamp to 0
733+
if expiry := entry.penaltyExpiry.Load(); expiry != 0 {
734+
t.Errorf("Expected expired penalty timestamp to be reset to 0, but it remained %d", expiry)
735+
}
736+
})
737+
}
738+
692739
func TestSelectLeastLoaded(t *testing.T) {
693740
pool := &BigtableChannelPool{}
694741
pool.conns.Store((&[]*connEntry{}))
@@ -983,6 +1030,44 @@ func TestPoolClose(t *testing.T) {
9831030
}
9841031
}
9851032

1033+
func TestConnEntryApplyErrorPenalty(t *testing.T) {
1034+
tests := []struct {
1035+
name string
1036+
err error
1037+
wantPenalty bool
1038+
}{
1039+
{"Nil error", nil, false},
1040+
{"Non-gRPC error", errors.New("standard network error"), false}, // Treated as codes.Unknown
1041+
{"OK", status.Error(codes.OK, "ok"), false},
1042+
{"NotFound", status.Error(codes.NotFound, "key not found"), false},
1043+
{"Canceled", status.Error(codes.Canceled, "client canceled"), false},
1044+
{"Unavailable", status.Error(codes.Unavailable, "server down"), true},
1045+
{"ResourceExhausted", status.Error(codes.ResourceExhausted, "too many requests"), true},
1046+
{"Internal", status.Error(codes.Internal, "proxy error"), true},
1047+
}
1048+
1049+
for _, tc := range tests {
1050+
t.Run(tc.name, func(t *testing.T) {
1051+
entry := &connEntry{}
1052+
1053+
// Apply the error
1054+
entry.applyErrorPenalty(tc.err)
1055+
1056+
expiry := entry.penaltyExpiry.Load()
1057+
1058+
if tc.wantPenalty {
1059+
if expiry == 0 {
1060+
t.Errorf("Expected a penalty to be applied for %v, but expiry was 0", tc.err)
1061+
}
1062+
} else {
1063+
if expiry != 0 {
1064+
t.Errorf("Expected NO penalty for %v, but got expiry timestamp %d", tc.err, expiry)
1065+
}
1066+
}
1067+
})
1068+
}
1069+
}
1070+
9861071
func TestGracefulDraining(t *testing.T) {
9871072
ctx := context.Background()
9881073
fake := &fakeService{}

0 commit comments

Comments
 (0)