Skip to content

Commit 41ed1d7

Browse files
authored
chore: debug missing digests chunking issues (#11504)
`Read` requests shouldn't fail, otherwise they can cause missing digest issues. There aren't many per day, and there are even less that are above the chunking threshold, so I don't expect many logs here. To help debug some of the issues we're seeing, this PR adds logs and metrics for Read requests that fail during the chunking path.
1 parent c1fe6a4 commit 41ed1d7

File tree

4 files changed

+63
-4
lines changed

4 files changed

+63
-4
lines changed

enterprise/server/byte_stream_server_proxy/byte_stream_server_proxy.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,10 @@ func (s *ByteStreamServerProxy) read(ctx context.Context, req *bspb.ReadRequest,
177177
// If chunking is enabled and the blob is large enough, try to read the chunks.
178178
// TODO(buildbuddy-internal#6426): Once most large blobs are chunked, consider
179179
// attempting the chunked read first.
180+
var chunkedErr error
180181
if s.shouldReadChunked(ctx, req, rn) {
181-
chunkMetrics, chunkedErr := s.readChunked(ctx, req, stream, rn)
182+
chunkMetrics, err := s.readChunked(ctx, req, stream, rn)
183+
chunkedErr = err
182184
if chunkedErr == nil {
183185
return readMetrics{
184186
cacheStatus: metrics.MissStatusLabel,
@@ -189,15 +191,18 @@ func (s *ByteStreamServerProxy) read(ctx context.Context, req *bspb.ReadRequest,
189191
chunksLocal: chunkMetrics.chunksLocal,
190192
chunksRemote: chunkMetrics.chunksRemote,
191193
}, nil
192-
} else if !status.IsNotFoundError(chunkedErr) {
193-
log.CtxWarningf(ctx, "Error reading chunked blob %s: %s", rn.DownloadString(), chunkedErr)
194194
}
195195
}
196196

197+
remoteErr := s.readRemoteWriteLocal(req, stream)
198+
if remoteErr != nil && chunkedErr != nil {
199+
// TODO(buildbuddy-internal#6426): Remove once root cause for missing digests is found.
200+
log.CtxWarningf(ctx, "Proxy chunked read failed for %s (remote fallback also failed): chunkedErr=%s remoteErr=%s", rn.DownloadString(), chunkedErr, remoteErr)
201+
}
197202
return readMetrics{
198203
cacheStatus: metrics.MissStatusLabel,
199204
compressor: rn.GetCompressor().String(),
200-
}, s.readRemoteWriteLocal(req, stream)
205+
}, remoteErr
201206
} else {
202207
return readMetrics{
203208
cacheStatus: metrics.HitStatusLabel,
@@ -229,6 +234,10 @@ func (s *ByteStreamServerProxy) readChunked(ctx context.Context, req *bspb.ReadR
229234
// Using the interface.Cache requires namespacing the AC keys by group ID.
230235
ctx, err := prefix.AttachUserPrefixToContext(ctx, s.authenticator)
231236
if err != nil {
237+
metrics.ByteStreamProxyChunkedReadFailures.With(prometheus.Labels{
238+
metrics.ChunkedFailureReasonLabel: "auth_error",
239+
metrics.StatusHumanReadableLabel: status.MetricsLabel(err),
240+
}).Inc()
232241
return m, err
233242
}
234243

@@ -242,17 +251,29 @@ func (s *ByteStreamServerProxy) readChunked(ctx context.Context, req *bspb.ReadR
242251
}
243252
splitResp, err := s.remoteCAS.SplitBlob(ctx, splitReq)
244253
if err != nil {
254+
metrics.ByteStreamProxyChunkedReadFailures.With(prometheus.Labels{
255+
metrics.ChunkedFailureReasonLabel: "split_blob_error",
256+
metrics.StatusHumanReadableLabel: status.MetricsLabel(err),
257+
}).Inc()
245258
return m, err
246259
}
247260
for _, chunkDigest := range splitResp.GetChunkDigests() {
248261
chunkRN := digest.NewCASResourceName(chunkDigest, instanceName, digestFunction)
249262
chunkRN.SetCompressor(rn.GetCompressor())
250263
if err := s.local.ReadCASResource(ctx, chunkRN, 0, 0, stream); status.IsNotFoundError(err) {
251264
if err := s.readRemoteWriteLocal(&bspb.ReadRequest{ResourceName: chunkRN.DownloadString()}, stream); err != nil {
265+
metrics.ByteStreamProxyChunkedReadFailures.With(prometheus.Labels{
266+
metrics.ChunkedFailureReasonLabel: "chunk_remote_fetch_error",
267+
metrics.StatusHumanReadableLabel: status.MetricsLabel(err),
268+
}).Inc()
252269
return m, err
253270
}
254271
m.chunksRemote++
255272
} else if err != nil {
273+
metrics.ByteStreamProxyChunkedReadFailures.With(prometheus.Labels{
274+
metrics.ChunkedFailureReasonLabel: "chunk_local_read_error",
275+
metrics.StatusHumanReadableLabel: status.MetricsLabel(err),
276+
}).Inc()
256277
return m, err
257278
} else {
258279
m.chunksLocal++

server/metrics/metrics.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,8 @@ const (
179179
// Whether the request was handled using chunking
180180
ChunkedLabel = "chunked"
181181

182+
ChunkedFailureReasonLabel = "reason"
183+
182184
// The name of the table in Clickhouse
183185
ClickhouseTableName = "clickhouse_table_name"
184186

@@ -3685,6 +3687,26 @@ var (
36853687
CompressionType,
36863688
})
36873689

3690+
ByteStreamProxyChunkedReadFailures = promauto.NewCounterVec(prometheus.CounterOpts{
3691+
Namespace: bbNamespace,
3692+
Subsystem: "proxy",
3693+
Name: "byte_stream_chunked_read_failures",
3694+
Help: "Number of chunked read attempts that failed, by failure reason.",
3695+
}, []string{
3696+
ChunkedFailureReasonLabel,
3697+
StatusHumanReadableLabel,
3698+
})
3699+
3700+
ByteStreamServerChunkedReadFailures = promauto.NewCounterVec(prometheus.CounterOpts{
3701+
Namespace: bbNamespace,
3702+
Subsystem: "remote_cache",
3703+
Name: "byte_stream_chunked_read_failures",
3704+
Help: "Number of chunked read attempts that failed on the backend, by failure reason.",
3705+
}, []string{
3706+
ChunkedFailureReasonLabel,
3707+
StatusHumanReadableLabel,
3708+
})
3709+
36883710
CapabilitiesProxiedRequests = promauto.NewCounterVec(prometheus.CounterOpts{
36893711
Namespace: bbNamespace,
36903712
Subsystem: "proxy",

server/remote_cache/byte_stream_server/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ go_library(
1010
"//proto:remote_execution_go_proto",
1111
"//server/environment",
1212
"//server/interfaces",
13+
"//server/metrics",
1314
"//server/real_environment",
1415
"//server/remote_cache/chunking",
1516
"//server/remote_cache/config",
@@ -26,6 +27,7 @@ go_library(
2627
"//server/util/prefix",
2728
"//server/util/quota",
2829
"//server/util/status",
30+
"@com_github_prometheus_client_golang//prometheus",
2931
"@org_golang_google_genproto_googleapis_bytestream//:bytestream",
3032
"@org_golang_google_grpc//peer",
3133
],

server/remote_cache/byte_stream_server/byte_stream_server.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/buildbuddy-io/buildbuddy/server/environment"
1010
"github.com/buildbuddy-io/buildbuddy/server/interfaces"
11+
"github.com/buildbuddy-io/buildbuddy/server/metrics"
1112
"github.com/buildbuddy-io/buildbuddy/server/real_environment"
1213
"github.com/buildbuddy-io/buildbuddy/server/remote_cache/chunking"
1314
"github.com/buildbuddy-io/buildbuddy/server/remote_cache/digest"
@@ -23,6 +24,7 @@ import (
2324
"github.com/buildbuddy-io/buildbuddy/server/util/prefix"
2425
"github.com/buildbuddy-io/buildbuddy/server/util/quota"
2526
"github.com/buildbuddy-io/buildbuddy/server/util/status"
27+
"github.com/prometheus/client_golang/prometheus"
2628
"google.golang.org/grpc/peer"
2729

2830
cappb "github.com/buildbuddy-io/buildbuddy/proto/capability"
@@ -207,8 +209,16 @@ func (s *ByteStreamServer) attemptReadChunked(ctx context.Context, rn *digest.CA
207209

208210
rns := manifest.ChunkResourceNames()
209211
if missing, err := s.cache.FindMissing(ctx, rns); err != nil {
212+
metrics.ByteStreamServerChunkedReadFailures.With(prometheus.Labels{
213+
metrics.ChunkedFailureReasonLabel: "chunk_find_missing_error",
214+
metrics.StatusHumanReadableLabel: status.MetricsLabel(err),
215+
}).Inc()
210216
return nil, err
211217
} else if len(missing) > 0 {
218+
metrics.ByteStreamServerChunkedReadFailures.With(prometheus.Labels{
219+
metrics.ChunkedFailureReasonLabel: "chunks_missing",
220+
metrics.StatusHumanReadableLabel: "NotFound",
221+
}).Inc()
212222
return nil, status.NotFoundErrorf("chunks missing from manifest for %q: %q", rn.DownloadString(), chunking.DigestsSummary(missing))
213223
}
214224

@@ -218,6 +228,10 @@ func (s *ByteStreamServer) attemptReadChunked(ctx context.Context, rn *digest.CA
218228
chunkRN.SetCompressor(rn.GetCompressor())
219229
rc, err := s.cache.Reader(ctx, chunkRN.ToProto(), 0, 0)
220230
if err != nil {
231+
metrics.ByteStreamServerChunkedReadFailures.With(prometheus.Labels{
232+
metrics.ChunkedFailureReasonLabel: "chunk_read_error",
233+
metrics.StatusHumanReadableLabel: status.MetricsLabel(err),
234+
}).Inc()
221235
for _, openCloser := range rcs {
222236
openCloser.Close()
223237
}

0 commit comments

Comments
 (0)