From d8485b073b78e3fe0e7cb9be235787fb141d4afd Mon Sep 17 00:00:00 2001 From: Plamen Bardarov Date: Mon, 13 Jan 2025 13:37:43 +0200 Subject: [PATCH] Add envoy proxy liveness checks --- depot/transformer/transformer.go | 49 ++++++++++-- depot/transformer/transformer_test.go | 107 ++++++++++++++++++++++++++ initializer/initializer.go | 10 +++ 3 files changed, 161 insertions(+), 5 deletions(-) diff --git a/depot/transformer/transformer.go b/depot/transformer/transformer.go index 0439578b..55cf1e5d 100644 --- a/depot/transformer/transformer.go +++ b/depot/transformer/transformer.go @@ -2,6 +2,7 @@ package transformer import ( "bytes" + "code.cloudfoundry.org/lager/v3" "errors" "fmt" "os" @@ -19,7 +20,6 @@ import ( "code.cloudfoundry.org/executor/depot/steps" "code.cloudfoundry.org/executor/depot/uploader" "code.cloudfoundry.org/garden" - "code.cloudfoundry.org/lager/v3" "code.cloudfoundry.org/workpool" "github.com/tedsuo/ifrit" ) @@ -63,8 +63,10 @@ type transformer struct { gracefulShutdownInterval time.Duration healthCheckWorkPool *workpool.WorkPool - useContainerProxy bool - drainWait time.Duration + useContainerProxy bool + drainWait time.Duration + enableContainerProxyHealthChecks bool + proxyHealthCheckInterval time.Duration postSetupHook []string postSetupUser string @@ -93,6 +95,13 @@ func WithContainerProxy(drainWait time.Duration) Option { } } +func WithProxyLivenessChecks(interval time.Duration) Option { + return func(t *transformer) { + t.enableContainerProxyHealthChecks = true + t.proxyHealthCheckInterval = interval + } +} + func WithPostSetupHook(user string, hook []string) Option { return func(t *transformer) { t.postSetupUser = user @@ -443,11 +452,13 @@ func (t *transformer) StepsRunner( } var proxyStartupChecks []ifrit.Runner + var proxyLivenessChecks []ifrit.Runner if t.useContainerProxy && t.useDeclarativeHealthCheck { envoyStartupLogger := logger.Session("envoy-startup-check") + envoyLivenessLogger := logger.Session("envoy-liveness-check") - for idx, 
p := range config.ProxyTLSPorts { + for idx, port := range config.ProxyTLSPorts { // add envoy startup checks startupSidecarName := fmt.Sprintf("%s-envoy-startup-healthcheck-%d", gardenContainer.Handle(), idx) @@ -457,7 +468,7 @@ func (t *transformer) StepsRunner( config.BindMounts, "", startupSidecarName, - int(p), + int(port), DefaultDeclarativeHealthcheckRequestTimeout, executor.TCPCheck, executor.IsStartupCheck, @@ -467,6 +478,30 @@ func (t *transformer) StepsRunner( config.MetronClient, false, ) + + if t.enableContainerProxyHealthChecks { + livenessSidecarName := fmt.Sprintf("%s-envoy-liveness-healthcheck-%d", gardenContainer.Handle(), idx) + + livenessStep := t.createCheck( + &container, + gardenContainer, + config.BindMounts, + "", + livenessSidecarName, + int(port), + DefaultDeclarativeHealthcheckRequestTimeout, + executor.TCPCheck, + executor.IsLivenessCheck, + t.proxyHealthCheckInterval, + envoyLivenessLogger, + "instance proxy health check failed", + config.MetronClient, + t.emitHealthCheckMetrics, + ) + + proxyLivenessChecks = append(proxyLivenessChecks, livenessStep) + } + proxyStartupChecks = append(proxyStartupChecks, step) } } @@ -479,8 +514,10 @@ func (t *transformer) StepsRunner( logStreamer, config.BindMounts, proxyStartupChecks, + proxyLivenessChecks, config.MetronClient, ) + substeps = append(substeps, monitor) } @@ -804,6 +841,7 @@ func (t *transformer) transformCheckDefinition( logstreamer log_streamer.LogStreamer, bindMounts []garden.BindMount, proxyStartupChecks []ifrit.Runner, + proxyLivenessChecks []ifrit.Runner, metronClient loggingclient.IngressClient, ) ifrit.Runner { var startupChecks []ifrit.Runner @@ -913,6 +951,7 @@ func (t *transformer) transformCheckDefinition( } startupCheck := steps.NewParallel(append(proxyStartupChecks, startupChecks...)) + livenessChecks = append(livenessChecks, proxyLivenessChecks...) 
livenessCheck := steps.NewCodependent(livenessChecks, false, false) return steps.NewHealthCheckStep( diff --git a/depot/transformer/transformer_test.go b/depot/transformer/transformer_test.go index a5973043..2ab7c118 100644 --- a/depot/transformer/transformer_test.go +++ b/depot/transformer/transformer_test.go @@ -670,6 +670,7 @@ var _ = Describe("Transformer", func() { Context("and container proxy is enabled", func() { BeforeEach(func() { options = append(options, transformer.WithContainerProxy(time.Second)) + options = append(options, transformer.WithProxyLivenessChecks(time.Second)) cfg.BindMounts = append(cfg.BindMounts, garden.BindMount{ Origin: garden.BindMountOriginHost, SrcPath: declarativeHealthcheckSrcPath, @@ -1504,6 +1505,112 @@ var _ = Describe("Transformer", func() { })) }) + Context("and container proxy is enabled", func() { + var ( + otherStartupProcess *gardenfakes.FakeProcess + otherStartupCh chan int + otherLivenessProcess *gardenfakes.FakeProcess + otherLivenessCh chan int + ) + + BeforeEach(func() { + options = append(options, transformer.WithContainerProxy(time.Second)) + cfg.ProxyTLSPorts = []uint16{61001} + + otherStartupCh = make(chan int) + otherStartupProcess = makeProcess(otherStartupCh) + + otherLivenessCh = make(chan int) + otherLivenessProcess = makeProcess(otherLivenessCh) + + healthcheckCallCount := int64(0) + + gardenContainer.RunStub = func(spec garden.ProcessSpec, io garden.ProcessIO) (process garden.Process, err error) { + defer GinkgoRecover() + // get rid of race condition caused by write inside the BeforeEach + processLock.Lock() + defer processLock.Unlock() + + switch spec.Path { + case "/action/path": + return actionProcess, nil + case filepath.Join(transformer.HealthCheckDstPath, "healthcheck"): + oldCount := atomic.AddInt64(&healthcheckCallCount, 1) + switch oldCount { + case 1: + return startupProcess, nil + case 2: + return otherStartupProcess, nil + case 3: + return livenessProcess, nil + case 4: + return 
otherLivenessProcess, nil + } + return livenessProcess, nil + case "/monitor/path": + return monitorProcess, nil + } + + err = errors.New("") + Fail("unexpected executable path: " + spec.Path) + return + } + }) + + JustBeforeEach(func() { + otherStartupCh <- 0 + }) + + AfterEach(func() { + close(otherStartupCh) + close(otherLivenessCh) + }) + + Context("and proxy liveness check is enabled", func() { + BeforeEach(func() { + options = append(options, transformer.WithProxyLivenessChecks(time.Second*30)) + }) + + It("starts the proxy liveness check", func() { + Eventually(gardenContainer.RunCallCount).Should(Equal(5)) + var ids []string + var args [][]string + for i := 0; i < gardenContainer.RunCallCount(); i++ { + spec, _ := gardenContainer.RunArgsForCall(i) + ids = append(ids, spec.ID) + args = append(args, spec.Args) + } + + Expect(ids).To(ContainElement(fmt.Sprintf("%s-%s", gardenContainer.Handle(), "envoy-liveness-healthcheck-0"))) + Expect(args).To(ContainElement([]string{ + "-port=61001", + "-timeout=1000ms", + "-liveness-interval=30s", + })) + }) + }) + + Context("and proxy liveness check is disabled", func() { + It("does not start the proxy liveness check", func() { + Eventually(gardenContainer.RunCallCount).Should(Equal(4)) + var ids []string + var args [][]string + for i := 0; i < gardenContainer.RunCallCount(); i++ { + spec, _ := gardenContainer.RunArgsForCall(i) + ids = append(ids, spec.ID) + args = append(args, spec.Args) + } + + Expect(ids).To(Not(ContainElement(fmt.Sprintf("%s-%s", gardenContainer.Handle(), "envoy-liveness-healthcheck-0")))) + Expect(args).To(Not(ContainElement([]string{ + "-port=61001", + "-timeout=1000ms", + "-liveness-interval=30s", + }))) + }) + }) + }) + Context("when optional values are not provided in liveness check defintion", func() { BeforeEach(func() { container.CheckDefinition = &models.CheckDefinition{ diff --git a/initializer/initializer.go b/initializer/initializer.go index 777f2074..d3070160 100644 --- 
a/initializer/initializer.go +++ b/initializer/initializer.go @@ -102,12 +102,14 @@ type ExecutorConfig struct { DeleteWorkPoolSize int `json:"delete_work_pool_size,omitempty"` DiskMB string `json:"disk_mb,omitempty"` EnableContainerProxy bool `json:"enable_container_proxy,omitempty"` + EnableContainerProxyHealthChecks bool `json:"enable_container_proxy_healthcheck,omitempty"` EnableDeclarativeHealthcheck bool `json:"enable_declarative_healthcheck,omitempty"` EnableHealtcheckMetrics bool `json:"enable_healthcheck_metrics,omitempty"` EnableUnproxiedPortMappings bool `json:"enable_unproxied_port_mappings"` EnvoyConfigRefreshDelay durationjson.Duration `json:"envoy_config_refresh_delay"` EnvoyConfigReloadDuration durationjson.Duration `json:"envoy_config_reload_duration"` EnvoyDrainTimeout durationjson.Duration `json:"envoy_drain_timeout,omitempty"` + ProxyHealthCheckInterval durationjson.Duration `json:"proxy_healthcheck_interval,omitempty"` ExportNetworkEnvVars bool `json:"export_network_env_vars,omitempty"` // DEPRECATED. 
Kept around for dusts compatability GardenAddr string `json:"garden_addr,omitempty"` GardenHealthcheckCommandRetryPause durationjson.Duration `json:"garden_healthcheck_command_retry_pause,omitempty"` @@ -267,6 +269,8 @@ func Initialize( gardenHealthcheckRootFS, config.EnableContainerProxy, time.Duration(config.EnvoyDrainTimeout), + config.EnableContainerProxyHealthChecks, + time.Duration(config.ProxyHealthCheckInterval), ) hub := event.NewHub() @@ -564,6 +568,8 @@ func initializeTransformer( declarativeHealthcheckRootFS string, enableContainerProxy bool, drainWait time.Duration, + enableProxyHealthChecks bool, + proxyHealthCheckInterval time.Duration, ) transformer.Transformer { var options []transformer.Option compressor := compressor.NewTgz() @@ -580,6 +586,10 @@ func initializeTransformer( if enableContainerProxy { options = append(options, transformer.WithContainerProxy(drainWait)) + + if enableProxyHealthChecks { + options = append(options, transformer.WithProxyLivenessChecks(proxyHealthCheckInterval)) + } } options = append(options, transformer.WithPostSetupHook(postSetupUser, postSetupHook))