From 060c6b1135b87a8c363d0dd8359dafd3bda5223e Mon Sep 17 00:00:00 2001 From: Plamen Bardarov Date: Sun, 11 Aug 2024 14:14:32 +0300 Subject: [PATCH 1/2] added metrics for failed liveness checks --- depot/steps/emit_check_failure_metric_step.go | 74 +++++++++ .../emit_check_failure_metric_step_test.go | 150 ++++++++++++++++++ depot/transformer/transformer.go | 100 +++++++----- depot/transformer/transformer_test.go | 29 ++++ resources.go | 24 +++ 5 files changed, 337 insertions(+), 40 deletions(-) create mode 100644 depot/steps/emit_check_failure_metric_step.go create mode 100644 depot/steps/emit_check_failure_metric_step_test.go diff --git a/depot/steps/emit_check_failure_metric_step.go b/depot/steps/emit_check_failure_metric_step.go new file mode 100644 index 00000000..c313b1c1 --- /dev/null +++ b/depot/steps/emit_check_failure_metric_step.go @@ -0,0 +1,74 @@ +package steps + +import ( + "fmt" + "os" + + loggingclient "code.cloudfoundry.org/diego-logging-client" + "code.cloudfoundry.org/executor" + "github.com/tedsuo/ifrit" +) + +type emitCheckFailureMetricStep struct { + checkStep ifrit.Runner + checkProtocol executor.CheckProtocol + checkType executor.HealthcheckType + metronClient loggingclient.IngressClient +} + +const ( + CheckFailedCount = "ChecksFailedCount" +) + +func NewEmitCheckFailureMetricStep( + checkStep ifrit.Runner, + checkProtocol executor.CheckProtocol, + checkType executor.HealthcheckType, + metronClient loggingclient.IngressClient) ifrit.Runner { + return &emitCheckFailureMetricStep{ + checkStep: checkStep, + checkProtocol: checkProtocol, + checkType: checkType, + metronClient: metronClient, + } +} + +func (step *emitCheckFailureMetricStep) Run(signals <-chan os.Signal, ready chan<- struct{}) error { + if step.checkStep == nil { + return nil + } + + checkProccess := ifrit.Background(step.checkStep) + + done := make(chan struct{}) + defer close(done) + go waitForReadiness(checkProccess, ready, done) + + select { + case err := <-checkProccess.Wait(): + if err != nil { + step.emitFailureMetric() + } + return err + case s := <-signals: + checkProccess.Signal(s) + return <-checkProccess.Wait() + } +} + +func (step *emitCheckFailureMetricStep) emitFailureMetric() { + metricName := constructMetricName(step.checkProtocol, step.checkType) + go step.metronClient.IncrementCounter(metricName) +} + +func constructMetricName(checkProtocol executor.CheckProtocol, checkType executor.HealthcheckType) string { + return fmt.Sprintf("%s%s%s", checkProtocol, checkType, CheckFailedCount) +} + +func waitForReadiness(p ifrit.Process, ready chan<- struct{}, done <-chan struct{}) { + select { + case <-p.Ready(): + close(ready) + case <-done: + } +} diff --git a/depot/steps/emit_check_failure_metric_step_test.go b/depot/steps/emit_check_failure_metric_step_test.go new file mode 100644 index 00000000..4286dbc3 --- /dev/null +++ b/depot/steps/emit_check_failure_metric_step_test.go @@ -0,0 +1,150 @@ +package steps_test + +import ( + "errors" + "os" + + mfakes "code.cloudfoundry.org/diego-logging-client/testhelpers" + "code.cloudfoundry.org/executor" + "code.cloudfoundry.org/executor/depot/steps" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/tedsuo/ifrit" + fake_runner "github.com/tedsuo/ifrit/fake_runner_v2" +) + +var _ = Describe("EmitCheckFailureMetricStep", func() { + var ( + subStep *fake_runner.TestRunner + step ifrit.Runner + fakeMetronClient *mfakes.FakeIngressClient + errorToReturn error + checkProtocol executor.CheckProtocol + checkType executor.HealthcheckType + ) + + BeforeEach(func() { + checkProtocol = executor.HTTPCheck + checkType = executor.IsLivenessCheck + fakeMetronClient = new(mfakes.FakeIngressClient) + errorToReturn = nil + subStep = fake_runner.NewTestRunner() + + subStep.RunStub = func(signals <-chan os.Signal, ready chan<- struct{}) error { + return errorToReturn + } + }) + + AfterEach(func() { + subStep.EnsureExit() + + }) + + JustBeforeEach(func() { + step = steps.NewEmitCheckFailureMetricStep(subStep, checkProtocol, checkType, fakeMetronClient) + }) + + Describe("Ready", func() { + var ( + fakeRunner *fake_runner.TestRunner + ) + + Context("when substep is not nil", func() { + BeforeEach(func() { + fakeRunner = fake_runner.NewTestRunner() + subStep = fakeRunner + }) + + It("becomes ready when the substep is ready", func() { + p := ifrit.Background(step) + Consistently(p.Ready()).ShouldNot(BeClosed()) + fakeRunner.TriggerReady() + Eventually(p.Ready()).Should(BeClosed()) + }) + }) + }) + + Describe("Running", func() { + var ( + err error + ) + + JustBeforeEach(func() { + err = <-ifrit.Invoke(step).Wait() + }) + + Context("when the substep succeeds", func() { + It("should not emit any metric", func() { + Expect(err).NotTo(HaveOccurred()) + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(0)) + }) + }) + + Context("when the substep fails", func() { + BeforeEach(func() { + errorToReturn = errors.New("check failed") + }) + + It("should pass the error along", func() { + Expect(err).To(MatchError(errorToReturn)) + }) + + Context("when HealthCheckType is Liveness and CheckProtocol is HTTP", func() { + BeforeEach(func() { + checkType = executor.IsLivenessCheck + checkProtocol = executor.HTTPCheck + }) + + It("should emit metric for the correct health check protocol and type", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) + name := fakeMetronClient.IncrementCounterArgsForCall(0) + Expect(name).To(Equal("HTTPLivenessChecksFailedCount")) + }) + }) + + Context("when HealthCheckType is Liveness and CheckProtocol is TCP", func() { + BeforeEach(func() { + checkType = executor.IsLivenessCheck + checkProtocol = executor.TCPCheck + }) + + It("should emit metric for the correct health check protocol and type", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) + name := fakeMetronClient.IncrementCounterArgsForCall(0) + Expect(name).To(Equal("TCPLivenessChecksFailedCount")) + }) + }) + }) + }) + + Describe("Signal", func() { + var ( + p ifrit.Process + finished chan struct{} + ) + + BeforeEach(func() { + finished = make(chan struct{}) + subStep.RunStub = func(signals <-chan os.Signal, ready chan<- struct{}) error { + <-signals + close(finished) + return new(steps.CancelledError) + } + }) + + JustBeforeEach(func() { + p = ifrit.Background(step) + }) + + It("should pass the signal", func() { + Consistently(finished).ShouldNot(BeClosed()) + p.Signal(os.Interrupt) + Eventually(finished).Should(BeClosed()) + }) + + It("should not emit metric when cancelled", func() { + p.Signal(os.Interrupt) + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(0)) + }) + }) +}) diff --git a/depot/transformer/transformer.go b/depot/transformer/transformer.go index a58c894c..edcc2330 100644 --- a/depot/transformer/transformer.go +++ b/depot/transformer/transformer.go @@ -24,19 +24,6 @@ import ( "github.com/tedsuo/ifrit" ) -type HealthcheckType int -type CheckProtocol int - -const ( - isStartupCheck HealthcheckType = iota - isLivenessCheck - isUntilSuccessReadinessCheck - isUntilFailureReadinessCheck - - HTTPCheck CheckProtocol = iota - TCPCheck -) - const ( healthCheckNofiles uint64 = 1024 DefaultDeclarativeHealthcheckRequestTimeout = int(1 * time.Second / time.Millisecond) // this is just 1000, transformed eventually into a str: "1000ms" or equivalently "1s" @@ -465,11 +452,13 @@ func (t *transformer) StepsRunner( startupSidecarName, int(p), DefaultDeclarativeHealthcheckRequestTimeout, - TCPCheck, - isStartupCheck, + executor.TCPCheck, + executor.IsStartupCheck, t.unhealthyMonitoringInterval, envoyStartupLogger, "instance proxy failed to start", + config.MetronClient, + false, ) proxyStartupChecks = append(proxyStartupChecks, step) } @@ -483,6 +472,7 @@ func (t *transformer) StepsRunner( logStreamer, config.BindMounts, proxyStartupChecks, + config.MetronClient, ) substeps = append(substeps, monitor) } @@ -495,6 +485,7 @@ func (t *transformer) StepsRunner( logStreamer, config.BindMounts, proxyStartupChecks, + config.MetronClient, ) if err == nil { substeps = append(substeps, readinessMonitor) @@ -567,11 +558,13 @@ func (t *transformer) createCheck( sidecarName string, port, timeout int, - checkProtocol CheckProtocol, - checkType HealthcheckType, + checkProtocol executor.CheckProtocol, + checkType executor.HealthcheckType, interval time.Duration, logger lager.Logger, prefix string, + metronClient loggingclient.IngressClient, + emitMetric bool, ) ifrit.Runner { nofiles := healthCheckNofiles @@ -586,24 +579,24 @@ func (t *transformer) createCheck( fmt.Sprintf("-timeout=%dms", timeout), } - if checkProtocol == HTTPCheck { + if checkProtocol == executor.HTTPCheck { args = append(args, fmt.Sprintf("-uri=%s", path)) } - if checkType == isStartupCheck { + if checkType == executor.IsStartupCheck { args = append(args, fmt.Sprintf("-startup-interval=%s", interval)) args = append(args, fmt.Sprintf("-startup-timeout=%s", time.Duration(container.StartTimeoutMs)*time.Millisecond)) } - if checkType == isLivenessCheck { + if checkType == executor.IsLivenessCheck { args = append(args, fmt.Sprintf("-liveness-interval=%s", interval)) } - if checkType == isUntilSuccessReadinessCheck { + if checkType == executor.IsUntilSuccessReadinessCheck { args = append(args, fmt.Sprintf("-until-ready-interval=%s", interval)) } - if checkType == isUntilFailureReadinessCheck { + if checkType == executor.IsUntilFailureReadinessCheck { args = append(args, fmt.Sprintf("-readiness-interval=%s", interval)) } @@ -637,10 +630,19 @@ func (t *transformer) createCheck( sidecar, container.Privileged, ) + + var wrapperStep ifrit.Runner if prefix != "" { - return steps.NewOutputWrapperWithPrefix(runStep, buffer, prefix) + wrapperStep = steps.NewOutputWrapperWithPrefix(runStep, buffer, prefix) + } else { + wrapperStep = steps.NewOutputWrapper(runStep, buffer) } - return steps.NewOutputWrapper(runStep, buffer) + + if emitMetric { + wrapperStep = steps.NewEmitCheckFailureMetricStep(wrapperStep, checkProtocol, checkType, metronClient) + } + + return wrapperStep } func (t *transformer) transformReadinessCheckDefinition( @@ -650,6 +652,7 @@ func (t *transformer) transformReadinessCheckDefinition( logstreamer log_streamer.LogStreamer, bindMounts []garden.BindMount, proxyStartupChecks []ifrit.Runner, + metronClient loggingclient.IngressClient, ) (ifrit.Runner, chan steps.ReadinessState, error) { var untilSuccessReadinessChecks []ifrit.Runner var untilFailureReadinessChecks []ifrit.Runner @@ -690,11 +693,13 @@ func (t *transformer) transformReadinessCheckDefinition( untilReadySidecarName, int(check.HttpCheck.Port), timeout, - HTTPCheck, - isUntilSuccessReadinessCheck, + executor.HTTPCheck, + executor.IsUntilSuccessReadinessCheck, t.unhealthyMonitoringInterval, readinessLogger, "", + metronClient, + false, )) untilFailureReadinessChecks = append(untilFailureReadinessChecks, t.createCheck( @@ -705,11 +710,13 @@ func (t *transformer) transformReadinessCheckDefinition( readinessSidecarName, int(check.HttpCheck.Port), timeout, - HTTPCheck, - isUntilFailureReadinessCheck, + executor.HTTPCheck, + executor.IsUntilFailureReadinessCheck, interval, readinessLogger, "", + metronClient, + false, )) } else { // is tcp check timeout, interval, _ := t.applyCheckDefaults( @@ -726,11 +733,13 @@ func (t *transformer) transformReadinessCheckDefinition( readinessSidecarName, int(check.TcpCheck.Port), timeout, - TCPCheck, - isUntilSuccessReadinessCheck, + executor.TCPCheck, + executor.IsUntilSuccessReadinessCheck, t.unhealthyMonitoringInterval, readinessLogger, "", + metronClient, + false, )) untilFailureReadinessChecks = append(untilFailureReadinessChecks, t.createCheck( @@ -741,11 +750,13 @@ func (t *transformer) transformReadinessCheckDefinition( readinessSidecarName, int(check.TcpCheck.Port), timeout, - TCPCheck, - isUntilFailureReadinessCheck, + executor.TCPCheck, + executor.IsUntilFailureReadinessCheck, interval, readinessLogger, "", + metronClient, + false, )) } } @@ -786,6 +797,7 @@ func (t *transformer) transformCheckDefinition( logstreamer log_streamer.LogStreamer, bindMounts []garden.BindMount, proxyStartupChecks []ifrit.Runner, + metronClient loggingclient.IngressClient, ) ifrit.Runner { var startupChecks []ifrit.Runner var livenessChecks []ifrit.Runner @@ -825,11 +837,13 @@ func (t *transformer) transformCheckDefinition( startupSidecarName, int(check.HttpCheck.Port), timeout, - HTTPCheck, - isStartupCheck, + executor.HTTPCheck, + executor.IsStartupCheck, t.unhealthyMonitoringInterval, startupLogger, "", + metronClient, + false, )) livenessChecks = append(livenessChecks, t.createCheck( container, @@ -839,11 +853,13 @@ func (t *transformer) transformCheckDefinition( livenessSidecarName, int(check.HttpCheck.Port), timeout, - HTTPCheck, - isLivenessCheck, + executor.HTTPCheck, + executor.IsLivenessCheck, interval, livenessLogger, "", + metronClient, + true, )) } else if check.TcpCheck != nil { @@ -862,11 +878,13 @@ func (t *transformer) transformCheckDefinition( startupSidecarName, int(check.TcpCheck.Port), timeout, - TCPCheck, - isStartupCheck, + executor.TCPCheck, + executor.IsStartupCheck, t.unhealthyMonitoringInterval, startupLogger, "", + metronClient, + false, )) livenessChecks = append(livenessChecks, t.createCheck( container, @@ -876,11 +894,13 @@ func (t *transformer) transformCheckDefinition( livenessSidecarName, int(check.TcpCheck.Port), timeout, - TCPCheck, - isLivenessCheck, + executor.TCPCheck, + executor.IsLivenessCheck, interval, livenessLogger, "", + metronClient, + true, )) } } diff --git a/depot/transformer/transformer_test.go b/depot/transformer/transformer_test.go index f035bfa7..9f71d8ae 100644 --- a/depot/transformer/transformer_test.go +++ b/depot/transformer/transformer_test.go @@ -1575,6 +1575,12 @@ var _ = Describe("Transformer", func() { It("returns the liveness check output in the error", func() { Eventually(process.Wait()).Should(Receive(MatchError(ContainSubstring("Instance became unhealthy: liveness check failed")))) }) + + It("emits HTTPLivenessChecksFailedCount metric", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) + name := fakeMetronClient.IncrementCounterArgsForCall(0) + Expect(name).To(Equal("HTTPLivenessChecksFailedCount")) + }) }) }) }) @@ -1709,6 +1715,29 @@ var _ = Describe("Transformer", func() { })) }) }) + + Context("when the liveness check exits", func() { + JustBeforeEach(func() { + Eventually(gardenContainer.RunCallCount).Should(Equal(3)) + + By("waiting the action and liveness check processes to start") + var io garden.ProcessIO + Eventually(livenessIO).Should(Receive(&io)) + _, err := io.Stdout.Write([]byte("liveness check failed")) + Expect(err).NotTo(HaveOccurred()) + + By("exiting the liveness check") + livenessCh <- 1 + Eventually(actionProcess.SignalCallCount).Should(Equal(1)) + actionCh <- 2 + }) + + It("emits TCPLivenessChecksFailedCount metric", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) + name := fakeMetronClient.IncrementCounterArgsForCall(0) + Expect(name).To(Equal("TCPLivenessChecksFailedCount")) + }) + }) }) }) diff --git a/resources.go b/resources.go index 05ff52b1..c09c3e84 100644 --- a/resources.go +++ b/resources.go @@ -430,3 +430,27 @@ func truncateString(s string, length int) string { last := len(s) - length/2 + len(delimeter)/2 return fmt.Sprintf("%s%s%s", s[:((length/2-1)-len(delimeter)/2)], delimeter, s[last:]) } + +type HealthcheckType int + +const ( + IsStartupCheck HealthcheckType = iota + IsLivenessCheck + IsUntilSuccessReadinessCheck + IsUntilFailureReadinessCheck +) + +func (h HealthcheckType) String() string { + return [...]string{"Startup", "Liveness", "UntilSuccessReadiness", "UntilFailureReadiness"}[h] +} + +type CheckProtocol int + +const ( + HTTPCheck CheckProtocol = iota + TCPCheck +) + +func (c CheckProtocol) String() string { + return [...]string{"HTTP", "TCP"}[c] +} From fd0b20c96e6366e02a5020902730ab18d7143c28 Mon Sep 17 00:00:00 2001 From: Plamen Bardarov Date: Tue, 17 Sep 2024 10:16:17 +0300 Subject: [PATCH 2/2] add config property for disabling healtcheck metrics --- depot/transformer/transformer.go | 11 ++++++-- depot/transformer/transformer_test.go | 40 +++++++++++++++++++++------ initializer/initializer.go | 7 +++++ 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/depot/transformer/transformer.go b/depot/transformer/transformer.go index edcc2330..0439578b 100644 --- a/depot/transformer/transformer.go +++ b/depot/transformer/transformer.go @@ -57,6 +57,7 @@ type transformer struct { sidecarRootFS string useDeclarativeHealthCheck bool + emitHealthCheckMetrics bool healthyMonitoringInterval time.Duration unhealthyMonitoringInterval time.Duration gracefulShutdownInterval time.Duration @@ -99,6 +100,12 @@ func WithPostSetupHook(user string, hook []string) Option { } } +func WithDeclarativeHealthcheckFailureMetrics() Option { + return func(t *transformer) { + t.emitHealthCheckMetrics = true + } +} + func NewTransformer( clock clock.Clock, cachedDownloader cacheddownloader.CachedDownloader, @@ -859,7 +866,7 @@ func (t *transformer) transformCheckDefinition( livenessLogger, "", metronClient, - true, + t.emitHealthCheckMetrics, )) } else if check.TcpCheck != nil { @@ -900,7 +907,7 @@ func (t *transformer) transformCheckDefinition( livenessLogger, "", metronClient, - true, + t.emitHealthCheckMetrics, )) } } diff --git a/depot/transformer/transformer_test.go b/depot/transformer/transformer_test.go index 9f71d8ae..a5973043 100644 --- a/depot/transformer/transformer_test.go +++ b/depot/transformer/transformer_test.go @@ -1576,10 +1576,22 @@ var _ = Describe("Transformer", func() { Eventually(process.Wait()).Should(Receive(MatchError(ContainSubstring("Instance became unhealthy: liveness check failed")))) }) - It("emits HTTPLivenessChecksFailedCount metric", func() { - Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) - name := fakeMetronClient.IncrementCounterArgsForCall(0) - Expect(name).To(Equal("HTTPLivenessChecksFailedCount")) + Context("and emitting liveness check failures is enabled", func() { + BeforeEach(func() { + options = append(options, transformer.WithDeclarativeHealthcheckFailureMetrics()) + }) + + It("emits HTTPLivenessChecksFailedCount metric", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) + name := fakeMetronClient.IncrementCounterArgsForCall(0) + Expect(name).To(Equal("HTTPLivenessChecksFailedCount")) + }) + }) + + Context("and emitting liveness check failures is disabled", func() { + It("does not emit HTTPLivenessChecksFailedCount metric", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(0)) + }) }) }) }) @@ -1732,10 +1744,22 @@ var _ = Describe("Transformer", func() { actionCh <- 2 }) - It("emits TCPLivenessChecksFailedCount metric", func() { - Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) - name := fakeMetronClient.IncrementCounterArgsForCall(0) - Expect(name).To(Equal("TCPLivenessChecksFailedCount")) + Context("and emitting liveness check failures is enabled", func() { + BeforeEach(func() { + options = append(options, transformer.WithDeclarativeHealthcheckFailureMetrics()) + }) + + It("emits TCPLivenessChecksFailedCount metric", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(1)) + name := fakeMetronClient.IncrementCounterArgsForCall(0) + Expect(name).To(Equal("TCPLivenessChecksFailedCount")) + }) + }) + + Context("and emitting liveness check failures is disabled", func() { + It("does not emit TCPLivenessChecksFailedCount metric", func() { + Eventually(fakeMetronClient.IncrementCounterCallCount).Should(Equal(0)) + }) }) }) }) diff --git a/initializer/initializer.go b/initializer/initializer.go index 9209663b..37ce4c15 100644 --- a/initializer/initializer.go +++ b/initializer/initializer.go @@ -103,6 +103,7 @@ type ExecutorConfig struct { DiskMB string `json:"disk_mb,omitempty"` EnableContainerProxy bool `json:"enable_container_proxy,omitempty"` EnableDeclarativeHealthcheck bool `json:"enable_declarative_healthcheck,omitempty"` + EnableHealtcheckMetrics bool `json:"enable_healthcheck_metrics,omitempty"` EnableUnproxiedPortMappings bool `json:"enable_unproxied_port_mappings"` EnvoyConfigRefreshDelay durationjson.Duration `json:"envoy_config_refresh_delay"` EnvoyConfigReloadDuration durationjson.Duration `json:"envoy_config_reload_duration"` @@ -248,6 +249,7 @@ func Initialize(logger lager.Logger, config ExecutorConfig, cellID, zone string, postSetupHook, config.PostSetupUser, config.EnableDeclarativeHealthcheck, + config.EnableHealtcheckMetrics, gardenHealthcheckRootFS, config.EnableContainerProxy, time.Duration(config.EnvoyDrainTimeout), @@ -544,6 +546,7 @@ func initializeTransformer( postSetupHook []string, postSetupUser string, useDeclarativeHealthCheck bool, + emitHealthCheckMetrics bool, declarativeHealthcheckRootFS string, enableContainerProxy bool, drainWait time.Duration, @@ -557,6 +560,10 @@ func initializeTransformer( options = append(options, transformer.WithDeclarativeHealthchecks()) } + if emitHealthCheckMetrics { + options = append(options, transformer.WithDeclarativeHealthcheckFailureMetrics()) + } + if enableContainerProxy { options = append(options, transformer.WithContainerProxy(drainWait)) }