Skip to content

Commit d34f38d

Browse files
authored
Merge pull request #3734 from buildkite/improve-auto-mode-logging
Validate ping mode flag, tweak log levels
2 parents 9b1fcd7 + 2480514 commit d34f38d

File tree

4 files changed

+59
-20
lines changed

4 files changed

+59
-20
lines changed

agent/agent_worker.go

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ import (
2020
"github.com/buildkite/roko"
2121
)
2222

23+
const (
24+
PingModeAuto = "auto" // empty string can be used from tests
25+
PingModeStreamOnly = "stream-only"
26+
PingModePollOnly = "poll-only"
27+
)
28+
2329
type AgentWorkerConfig struct {
2430
// Whether to set debug in the job
2531
Debug bool
@@ -304,15 +310,23 @@ func (a *AgentWorker) Start(ctx context.Context, idleMon *idleMonitor) (startErr
304310
streamingLoop := func() {
305311
defer wg.Done()
306312
err := a.runStreamingPingLoop(ctx, fromStreamingLoopCh)
307-
if a.agentConfiguration.PingMode != "streaming-only" {
308-
// If the ping mode is streaming-only, then an unrecoverable failure
309-
// in the streaming loop should be reported.
310-
// Otherwise, it should fall back to the ping loop and carry on,
311-
// and if that also has an unrecoverable failure we can report that.
312-
// Said another way:
313-
// Streaming is best-effort but preferred, unless in streaming-only
314-
// mode (it's the only available option).
315-
err = nil
313+
if err != nil {
314+
switch a.agentConfiguration.PingMode {
315+
case PingModeStreamOnly:
316+
// In streaming-only mode, an unrecoverable failure
317+
// in the streaming loop should be reported and should
318+
// terminate the agent worker.
319+
a.logger.Error("Streaming ping mode failed due to an unrecoverable error: %v", err)
320+
default:
321+
// In auto mode, the worker should fall back to the ping loop
322+
// and carry on. The user might find that interesting (especially if
323+
// they are expecting streaming to work).
324+
a.logger.Info("Streaming ping mode is unavailable, permanently falling back to polling-based ping mode (the underlying error was: %v)", err)
325+
// If the ping loop then has its own unrecoverable error, then
326+
// *that* will terminate the worker. But the streaming loop shouldn't.
327+
// So treat the error from the streaming loop as "business as usual".
328+
err = nil
329+
}
316330
}
317331
errCh <- err
318332
}
@@ -323,18 +337,21 @@ func (a *AgentWorker) Start(ctx context.Context, idleMon *idleMonitor) (startErr
323337

324338
var loops []func()
325339
switch a.agentConfiguration.PingMode {
326-
case "", "auto":
340+
case "", PingModeAuto: // note: "" can happen in some tests
327341
loops = []func(){pingLoop, streamingLoop, debouncerLoop}
328342
bat.Acquire(actorDebouncer)
329343

330-
case "ping-only":
344+
case PingModePollOnly:
331345
loops = []func(){pingLoop}
332346
fromDebouncerCh = nil // prevent action loop listening to streaming side
333347

334-
case "stream-only":
348+
case PingModeStreamOnly:
335349
loops = []func(){streamingLoop, debouncerLoop}
336350
fromPingLoopCh = nil // prevent action loop listening to ping side
337351
bat.Acquire(actorDebouncer)
352+
353+
default:
354+
return fmt.Errorf("unknown ping mode %q", a.agentConfiguration.PingMode)
338355
}
339356

340357
// There's always an action handler.

agent/agent_worker_streaming.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,11 @@ func (a *streamLoopState) startStream(ctx, streamCtx context.Context) error {
135135
a.logger.Debug("[runStreamingPingLoop] Connecting (attempt %d)", a.attempts)
136136
stream, err := a.apiClient.StreamPings(streamCtx, a.agent.UUID)
137137
if err != nil {
138-
a.logger.Error("Connection to ping stream failed: %v", err)
138+
// TODO: after we've made streaming endpoints generally available,
139+
// think about making some of these logs error or warning level.
140+
a.logger.Debug("[runStreamingPingLoop] Connection to ping stream failed: %v", err)
139141
if isUnrecoverable(err) {
140-
a.logger.Error("Stopping ping stream because the error is unrecoverable")
142+
a.logger.Debug("[runStreamingPingLoop] Stopping because the error is unrecoverable")
141143
return err
142144
}
143145
// Fast fallback to the ping loop
@@ -181,9 +183,11 @@ func (a *streamLoopState) handle(ctx context.Context, msg *agentedgev1.StreamPin
181183
var amsg actionMessage
182184
switch {
183185
case streamErr != nil:
186+
// TODO: after we've made streaming endpoints generally available,
187+
// think about making some of these logs error or warning level.
184188
a.logger.Debug("[runStreamingPingLoop] Connection to ping stream failed or ended: %v", streamErr)
185189
if isUnrecoverable(streamErr) {
186-
a.logger.Error("Stopping ping stream loop because the error is unrecoverable: %v", streamErr)
190+
a.logger.Debug("[runStreamingPingLoop] Stopping because the error is unrecoverable")
187191
return streamErr
188192
}
189193
// Stay healthy if the error is deadline-exceeded.
@@ -197,7 +201,7 @@ func (a *streamLoopState) handle(ctx context.Context, msg *agentedgev1.StreamPin
197201
amsg.unhealthy = true
198202

199203
case msg == nil:
200-
a.logger.Error("Ping stream yielded a nil message, so assuming the stream is broken")
204+
a.logger.Debug("[runStreamingPingLoop] Ping stream yielded a nil message, so assuming the stream is broken")
201205
a.logger.Debug("[runStreamingPingLoop] Becoming unhealthy")
202206
amsg.unhealthy = true
203207

clicommand/agent_start.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,18 @@ Example:
6060
6161
$ buildkite-agent start --token xxx`
6262

63+
const pingModePingOnly = "ping-only"
64+
6365
var (
6466
verificationFailureBehaviors = []string{agent.VerificationBehaviourBlock, agent.VerificationBehaviourWarn}
6567

68+
pingModes = []string{
69+
agent.PingModeAuto,
70+
agent.PingModePollOnly,
71+
pingModePingOnly, // canonicalises to agent.PingModePollOnly
72+
agent.PingModeStreamOnly,
73+
}
74+
6675
buildkiteSetEnvironmentVariables = []*regexp.Regexp{
6776
regexp.MustCompile("^BUILDKITE$"),
6877
regexp.MustCompile("^BUILDKITE_.*$"),
@@ -764,8 +773,8 @@ var AgentStartCommand = cli.Command{
764773
// API + agent behaviour
765774
cli.StringFlag{
766775
Name: "ping-mode",
767-
Usage: "Selects available protocols for dispatching work to this agent. One of ping-only (default), auto (prefer streaming, but fall back to polling when necessary) or stream-only.",
768-
Value: "ping-only",
776+
Usage: "Selects available protocols for dispatching work to this agent. One of auto (default, prefer streaming, but fall back to polling when necessary), poll-only, or stream-only.",
777+
Value: "auto",
769778
EnvVar: "BUILDKITE_AGENT_PING_MODE",
770779
},
771780

@@ -852,6 +861,15 @@ var AgentStartCommand = cli.Command{
852861
return fmt.Errorf("failed to unset config from environment: %w", err)
853862
}
854863

864+
if !slices.Contains(pingModes, cfg.PingMode) {
865+
return fmt.Errorf("invalid ping mode %q, must be one of %v", cfg.PingMode, pingModes)
866+
}
867+
// Calling it "ping-only" was a mistake, so canonicalise it to "poll-only"
868+
// on the very remote chance someone is using that.
869+
if cfg.PingMode == pingModePingOnly {
870+
cfg.PingMode = agent.PingModePollOnly
871+
}
872+
855873
if cfg.VerificationJWKSFile != "" {
856874
if !slices.Contains(verificationFailureBehaviors, cfg.VerificationFailureBehavior) {
857875
return fmt.Errorf(

internal/e2e/basic_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ func TestBasicE2E(t *testing.T) {
3131
}
3232
}
3333

34-
func TestBasicE2E_PingOnly(t *testing.T) {
34+
func TestBasicE2E_PollOnly(t *testing.T) {
3535
ctx := t.Context()
3636
tc := newTestCase(t, "basic_e2e.yaml")
3737

38-
tc.startAgent("--ping-mode=ping-only")
38+
tc.startAgent("--ping-mode=poll-only")
3939
build := tc.triggerBuild()
4040

4141
// It should take much less time than 1 minute to successfully run the job.

0 commit comments

Comments
 (0)