From b1f3e93e87171e066a86f7bfcd4c387a70007387 Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Fri, 13 Feb 2026 09:26:25 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=A4=96=20fix:=20wait=20for=20template?= =?UTF-8?q?=20build=20before=20active=20promotion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add configurable template-version build wait/backoff for aggregated codertemplate updates. Export a reusable aggregated CoderTemplate example and document tuning knobs. --- README.md | 2 + docs/how-to/deploy-aggregated-apiserver.md | 20 ++ examples/aggregated/README.md | 19 ++ .../codertemplate-smoke-scratch.yaml | 91 ++++++++ internal/aggregated/storage/storage_test.go | 194 +++++++++++++++- internal/aggregated/storage/template.go | 218 ++++++++++++++++++ 6 files changed, 538 insertions(+), 6 deletions(-) create mode 100644 examples/aggregated/README.md create mode 100644 examples/aggregated/codertemplate-smoke-scratch.yaml diff --git a/README.md b/README.md index e79854ba..26e30289 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,8 @@ make docs-serve - [`examples/cloudnativepg/`](examples/cloudnativepg/) — Deploy a `CoderControlPlane` with a CloudNativePG-managed PostgreSQL backend. +- [`examples/aggregated/`](examples/aggregated/) - Reusable `CoderTemplate` manifests for aggregated API testing. + ## KIND development cluster (k9s demos) Bootstrap a KIND cluster and install CRDs/RBAC (**this switches current kubectl context**): diff --git a/docs/how-to/deploy-aggregated-apiserver.md b/docs/how-to/deploy-aggregated-apiserver.md index 47f9fe26..fd4766c6 100644 --- a/docs/how-to/deploy-aggregated-apiserver.md +++ b/docs/how-to/deploy-aggregated-apiserver.md @@ -90,6 +90,26 @@ kubectl get codertemplates.aggregation.coder.com -A kubectl logs -n coder-system deploy/coder-k8s ``` +## Template build wait tuning + +When updating `CoderTemplate.spec.files`, the aggregated API server now waits for +Coder to finish building the new template version before promoting it active. + +The wait behavior is configurable via environment variables on the +`coder-k8s` deployment: + +- `CODER_K8S_TEMPLATE_BUILD_WAIT_TIMEOUT` (default: `25m`) +- `CODER_K8S_TEMPLATE_BUILD_BACKOFF_AFTER` (default: `2m`) +- `CODER_K8S_TEMPLATE_BUILD_INITIAL_POLL_INTERVAL` (default: `2s`) +- `CODER_K8S_TEMPLATE_BUILD_MAX_POLL_INTERVAL` (default: `10s`) + +Behavior: + +- Polls at the initial interval for the first 2 minutes. +- After that, poll interval doubles up to the max poll interval. +- Fails if the version build ends in `failed`/`canceled` or the total wait + timeout is exceeded. + ## TLS note `deploy/apiserver-apiservice.yaml` uses `insecureSkipTLSVerify: true` for development convenience. diff --git a/examples/aggregated/README.md b/examples/aggregated/README.md new file mode 100644 index 00000000..e10c0e85 --- /dev/null +++ b/examples/aggregated/README.md @@ -0,0 +1,19 @@ +# Aggregated API examples + +This directory contains example `CoderTemplate` resources for exercising the +aggregated API server (`aggregation.coder.com/v1alpha1`). + +## Smoke template export + +- [`codertemplate-smoke-scratch.yaml`](./codertemplate-smoke-scratch.yaml) + is an exported template manifest captured from a live smoke-test cluster. +- It is intended as a reusable baseline for template create/update API testing. + +Apply it with: + +```bash +kubectl apply -f examples/aggregated/codertemplate-smoke-scratch.yaml +``` + +If you are updating an already-existing object with `kubectl replace`, include +the latest `metadata.resourceVersion` from `kubectl get -o yaml`. diff --git a/examples/aggregated/codertemplate-smoke-scratch.yaml b/examples/aggregated/codertemplate-smoke-scratch.yaml new file mode 100644 index 00000000..d25c7284 --- /dev/null +++ b/examples/aggregated/codertemplate-smoke-scratch.yaml @@ -0,0 +1,91 @@ +apiVersion: aggregation.coder.com/v1alpha1 +kind: CoderTemplate +metadata: + name: coder.smoke-scratch + namespace: coder +spec: + organization: coder + # Leave blank for create-from-files workflows; the API creates a new template + # version and promotes it active after build succeeds. + versionID: "" + files: + README.md: | + --- + display_name: Scratch + description: A minimal starter template for Coder + icon: ../../../site/static/emojis/1f4e6.png + maintainer_github: coder + verified: true + tags: [] + --- + + # A minimal Scaffolding for a Coder Template + + Use this starter template as a basis to create your own unique template from scratch. + main.tf: | + terraform { + required_providers { + coder = { + source = "coder/coder" + } + } + } + + data "coder_provisioner" "me" {} + + data "coder_workspace" "me" {} + + resource "coder_agent" "main" { + arch = data.coder_provisioner.me.arch + os = data.coder_provisioner.me.os + + metadata { + display_name = "CPU Usage" + key = "0_cpu_usage" + script = "coder stat cpu" + interval = 10 + timeout = 1 + } + + metadata { + display_name = "RAM Usage" + key = "1_ram_usage" + script = "coder stat mem" + interval = 10 + timeout = 1 + } + } + + # Use this to set environment variables in your workspace + # details: https://registry.terraform.io/providers/coder/coder/latest/docs/resources/env + resource "coder_env" "welcome_message" { + agent_id = coder_agent.main.id + name = "WELCOME_MESSAGE" + value = "Welcome to your Coder workspace!" + } + + # Adds code-server + # See all available modules at https://registry.coder.com/modules + module "code-server" { + count = data.coder_workspace.me.start_count + source = "registry.coder.com/coder/code-server/coder" + + # This ensures that the latest non-breaking version of the module gets downloaded, you can also pin the module version to prevent breaking changes in production. + version = "~> 1.0" + + agent_id = coder_agent.main.id + } + + # Runs a script at workspace start/stop or on a cron schedule + # details: https://registry.terraform.io/providers/coder/coder/latest/docs/resources/script + resource "coder_script" "startup_script" { + agent_id = coder_agent.main.id + display_name = "Startup Script" + script = <<-EOF + #!/bin/sh + set -e + # Run programs at workspace startup + EOF + run_on_start = true + start_blocks_login = true + } diff --git a/internal/aggregated/storage/storage_test.go b/internal/aggregated/storage/storage_test.go index 31cf4ac9..f384f5f3 100644 --- a/internal/aggregated/storage/storage_test.go +++ b/internal/aggregated/storage/storage_test.go @@ -327,6 +327,128 @@ func TestTemplateStorageUpdateWithChangedFiles(t *testing.T) { } } +func TestTemplateStorageUpdateWaitsForTemplateVersionBuildBeforePromotion(t *testing.T) { + t.Parallel() + + server, state := newMockCoderServer(t) + defer server.Close() + + templateStorage := NewTemplateStorage(newTestClientProvider(t, server.URL)) + ctx := namespacedContext("control-plane") + + createObj := &aggregationv1alpha1.CoderTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "acme.wait-for-build-template"}, + Spec: aggregationv1alpha1.CoderTemplateSpec{ + Organization: "acme", + DisplayName: "Wait For Build Template", + Files: map[string]string{"main.tf": "resource \"null_resource\" \"initial\" {}"}, + }, + } + + createdObj, err := templateStorage.Create(ctx, createObj, rest.ValidateAllObjectFunc, nil) + if err != nil { + t.Fatalf("expected template create with files to succeed: %v", err) + } + createdTemplate, ok := createdObj.(*aggregationv1alpha1.CoderTemplate) + if !ok { + t.Fatalf("expected *CoderTemplate from create, got %T", createdObj) + } + + activeVersionBefore, ok := state.templateActiveVersionID("acme", "wait-for-build-template") + if !ok { + t.Fatal("expected active version for created template") + } + + state.setNextCreatedTemplateVersionPendingForPolls(2) + + desiredTemplate := createdTemplate.DeepCopy() + desiredTemplate.Spec.Files = map[string]string{"main.tf": "resource \"null_resource\" \"updated\" {}"} + + updatedObj, created, err := templateStorage.Update( + ctx, + desiredTemplate.Name, + testUpdatedObjectInfo{obj: desiredTemplate}, + nil, + rest.ValidateAllObjectUpdateFunc, + false, + nil, + ) + if err != nil { + t.Fatalf("expected update with changed files to succeed after build wait: %v", err) + } + if created { + t.Fatal("expected update created=false") + } + + activeVersionAfter, ok := state.templateActiveVersionID("acme", "wait-for-build-template") + if !ok { + t.Fatal("expected active version for updated template") + } + if activeVersionAfter == activeVersionBefore { + t.Fatalf("expected active version to change after build wait, both were %q", activeVersionAfter) + } + + updatedTemplate, ok := updatedObj.(*aggregationv1alpha1.CoderTemplate) + if !ok { + t.Fatalf("expected *CoderTemplate from update, got %T", updatedObj) + } + expectedFiles := map[string]string{"main.tf": "resource \"null_resource\" \"updated\" {}"} + if !reflect.DeepEqual(updatedTemplate.Spec.Files, expectedFiles) { + t.Fatalf("expected updated files %v, got %v", expectedFiles, updatedTemplate.Spec.Files) + } +} + +func TestLoadTemplateVersionBuildWaitConfigFromEnvDefaults(t *testing.T) { + t.Setenv(templateVersionBuildWaitTimeoutEnv, "") + t.Setenv(templateVersionBuildBackoffAfterEnv, "") + t.Setenv(templateVersionBuildInitialPollIntervalEnv, "") + t.Setenv(templateVersionBuildMaxPollIntervalEnv, "") + + cfg, err := loadTemplateVersionBuildWaitConfigFromEnv() + if err != nil { + t.Fatalf("expected default wait config load to succeed: %v", err) + } + if cfg.waitTimeout != defaultTemplateVersionBuildWaitTimeout { + t.Fatalf("expected default wait timeout %s, got %s", defaultTemplateVersionBuildWaitTimeout, cfg.waitTimeout) + } + if cfg.backoffAfter != defaultTemplateVersionBuildBackoffAfter { + t.Fatalf("expected default backoff-after %s, got %s", defaultTemplateVersionBuildBackoffAfter, cfg.backoffAfter) + } + if cfg.initialPoll != defaultTemplateVersionBuildInitialPoll { + t.Fatalf("expected default initial poll %s, got %s", defaultTemplateVersionBuildInitialPoll, cfg.initialPoll) + } + if cfg.maxPollInterval != defaultTemplateVersionBuildMaxPollInterval { + t.Fatalf("expected default max poll interval %s, got %s", defaultTemplateVersionBuildMaxPollInterval, cfg.maxPollInterval) + } +} + +func TestLoadTemplateVersionBuildWaitConfigFromEnvInvalid(t *testing.T) { + t.Setenv(templateVersionBuildWaitTimeoutEnv, "1m") + t.Setenv(templateVersionBuildBackoffAfterEnv, "2m") + t.Setenv(templateVersionBuildInitialPollIntervalEnv, "500ms") + t.Setenv(templateVersionBuildMaxPollIntervalEnv, "10s") + + _, err := loadTemplateVersionBuildWaitConfigFromEnv() + if err == nil { + t.Fatal("expected invalid wait config to fail") + } + if !strings.Contains(err.Error(), templateVersionBuildBackoffAfterEnv) { + t.Fatalf("expected error to mention %s, got %v", templateVersionBuildBackoffAfterEnv, err) + } + + t.Setenv(templateVersionBuildBackoffAfterEnv, "2s") + t.Setenv(templateVersionBuildInitialPollIntervalEnv, "5s") + t.Setenv(templateVersionBuildMaxPollIntervalEnv, "1s") + + _, err = loadTemplateVersionBuildWaitConfigFromEnv() + if err == nil { + t.Fatal("expected max-poll-interval validation to fail") + } + if !strings.Contains(err.Error(), templateVersionBuildMaxPollIntervalEnv) { + t.Fatalf("expected error to mention %s, got %v", templateVersionBuildMaxPollIntervalEnv, err) + } +} + func TestTemplateStorageUpdateVerifiesActiveVersionPromotion(t *testing.T) { t.Parallel() @@ -2118,10 +2240,13 @@ type mockCoderServerState struct { workspacesByID map[uuid.UUID]codersdk.Workspace workspaceIDsByUser map[string]map[string]uuid.UUID - buildTransitions []codersdk.WorkspaceTransition - failBuildTransitions map[codersdk.WorkspaceTransition]int - templateMetaPatchCall int - failActiveVersionPromotion bool + buildTransitions []codersdk.WorkspaceTransition + failBuildTransitions map[codersdk.WorkspaceTransition]int + templateMetaPatchCall int + failActiveVersionPromotion bool + templateVersionPollsBeforeSuccess map[uuid.UUID]int + nextTemplateVersionInitialStatus codersdk.ProvisionerJobStatus + nextTemplateVersionPendingPolls int } func newMockCoderServer(t *testing.T) (*httptest.Server, *mockCoderServerState) { @@ -2176,6 +2301,7 @@ func newMockCoderServer(t *testing.T) (*httptest.Server, *mockCoderServerState) Message: "initial version", Job: codersdk.ProvisionerJob{ FileID: fileID, + Status: codersdk.ProvisionerJobSucceeded, }, } @@ -2229,8 +2355,10 @@ func newMockCoderServer(t *testing.T) (*httptest.Server, *mockCoderServerState) workspace.Name: workspace.ID, }, }, - buildTransitions: []codersdk.WorkspaceTransition{}, - failBuildTransitions: map[codersdk.WorkspaceTransition]int{}, + buildTransitions: []codersdk.WorkspaceTransition{}, + failBuildTransitions: map[codersdk.WorkspaceTransition]int{}, + templateVersionPollsBeforeSuccess: map[uuid.UUID]int{}, + nextTemplateVersionInitialStatus: codersdk.ProvisionerJobSucceeded, } server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -2480,6 +2608,10 @@ func (s *mockCoderServerState) handleCreateTemplateVersion(w http.ResponseWriter } now := time.Now().UTC() + initialStatus := s.nextTemplateVersionInitialStatus + if initialStatus == "" { + initialStatus = codersdk.ProvisionerJobSucceeded + } templateVersion := codersdk.TemplateVersion{ ID: uuid.New(), OrganizationID: s.organization.ID, @@ -2489,6 +2621,7 @@ func (s *mockCoderServerState) handleCreateTemplateVersion(w http.ResponseWriter Message: request.Message, Job: codersdk.ProvisionerJob{ FileID: request.FileID, + Status: initialStatus, }, } if request.TemplateID != uuid.Nil { @@ -2500,6 +2633,17 @@ func (s *mockCoderServerState) handleCreateTemplateVersion(w http.ResponseWriter templateVersion.TemplateID = &templateID } + if s.nextTemplateVersionPendingPolls > 0 { + if initialStatus != codersdk.ProvisionerJobPending && initialStatus != codersdk.ProvisionerJobRunning { + panic(fmt.Sprintf("assertion failed: pending poll simulation requires pending/running status, got %q", initialStatus)) + } + s.templateVersionPollsBeforeSuccess[templateVersion.ID] = s.nextTemplateVersionPendingPolls + } + + // Reset one-shot template version behavior knobs after creation. + s.nextTemplateVersionInitialStatus = codersdk.ProvisionerJobSucceeded + s.nextTemplateVersionPendingPolls = 0 + s.templateVersionsByID[templateVersion.ID] = templateVersion writeJSON(w, http.StatusCreated, templateVersion) @@ -2586,6 +2730,15 @@ func (s *mockCoderServerState) handleUpdateActiveTemplateVersion(w http.Response s.templateVersionsByID[templateVersion.ID] = templateVersion } + if templateVersion.Job.Status != codersdk.ProvisionerJobSucceeded { + writeCoderError( + w, + http.StatusForbidden, + "Only versions that have been built successfully can be promoted.\n Error: Attempted to promote a version with a running build", + ) + return + } + if !s.failActiveVersionPromotion { template.ActiveVersionID = request.ID } @@ -2634,6 +2787,23 @@ func (s *mockCoderServerState) handleGetTemplateVersion(w http.ResponseWriter, t return } + if pollsRemaining, hasPendingPolls := s.templateVersionPollsBeforeSuccess[templateVersionID]; hasPendingPolls { + if pollsRemaining <= 0 { + panic(fmt.Sprintf("assertion failed: template version %q pending poll count must be > 0", templateVersionID.String())) + } + pollsRemaining-- + if pollsRemaining == 0 { + completedAt := time.Now().UTC() + templateVersion.Job.Status = codersdk.ProvisionerJobSucceeded + templateVersion.Job.CompletedAt = &completedAt + templateVersion.UpdatedAt = completedAt + delete(s.templateVersionPollsBeforeSuccess, templateVersionID) + } else { + s.templateVersionPollsBeforeSuccess[templateVersionID] = pollsRemaining + } + s.templateVersionsByID[templateVersionID] = templateVersion + } + writeJSON(w, http.StatusOK, templateVersion) } @@ -2912,6 +3082,18 @@ func (s *mockCoderServerState) setFailActiveVersionPromotion(fail bool) { s.failActiveVersionPromotion = fail } +func (s *mockCoderServerState) setNextCreatedTemplateVersionPendingForPolls(polls int) { + if polls <= 0 { + panic(fmt.Sprintf("assertion failed: pending poll count must be > 0, got %d", polls)) + } + + s.mu.Lock() + defer s.mu.Unlock() + + s.nextTemplateVersionInitialStatus = codersdk.ProvisionerJobPending + s.nextTemplateVersionPendingPolls = polls +} + func (s *mockCoderServerState) setTemplateVersionTemplateID(templateVersionID, templateID uuid.UUID) { if templateVersionID == uuid.Nil { panic("assertion failed: template version ID must not be nil") diff --git a/internal/aggregated/storage/template.go b/internal/aggregated/storage/template.go index b9c66e6d..a2693d3d 100644 --- a/internal/aggregated/storage/template.go +++ b/internal/aggregated/storage/template.go @@ -4,7 +4,10 @@ import ( "bytes" "context" "fmt" + "os" "reflect" + "strings" + "time" "github.com/google/uuid" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -19,6 +22,24 @@ import ( "github.com/coder/coder/v2/codersdk" ) +const ( + templateVersionBuildWaitTimeoutEnv = "CODER_K8S_TEMPLATE_BUILD_WAIT_TIMEOUT" + templateVersionBuildBackoffAfterEnv = "CODER_K8S_TEMPLATE_BUILD_BACKOFF_AFTER" + templateVersionBuildInitialPollIntervalEnv = "CODER_K8S_TEMPLATE_BUILD_INITIAL_POLL_INTERVAL" + templateVersionBuildMaxPollIntervalEnv = "CODER_K8S_TEMPLATE_BUILD_MAX_POLL_INTERVAL" + defaultTemplateVersionBuildWaitTimeout = 25 * time.Minute + defaultTemplateVersionBuildBackoffAfter = 2 * time.Minute + defaultTemplateVersionBuildInitialPoll = 2 * time.Second + defaultTemplateVersionBuildMaxPollInterval = 10 * time.Second +) + +type templateVersionBuildWaitConfig struct { + waitTimeout time.Duration + backoffAfter time.Duration + initialPoll time.Duration + maxPollInterval time.Duration +} + var ( _ rest.Storage = (*TemplateStorage)(nil) _ rest.Getter = (*TemplateStorage)(nil) @@ -475,6 +496,10 @@ func (s *TemplateStorage) Update( return nil, false, fmt.Errorf("assertion failed: new template version ID must not be nil") } + if waitErr := waitForTemplateVersionBuild(ctx, sdk, newVersion.ID); waitErr != nil { + return nil, false, fmt.Errorf("wait for template version %q build: %w", newVersion.ID.String(), waitErr) + } + if err := sdk.UpdateActiveTemplateVersion(ctx, templateID, codersdk.UpdateActiveTemplateVersion{ID: newVersion.ID}); err != nil { return nil, false, coder.MapCoderError(err, aggregationv1alpha1.Resource("codertemplates"), name) } @@ -571,6 +596,199 @@ func (s *TemplateStorage) ConvertToTable(ctx context.Context, object, tableOptio return s.tableConvertor.ConvertToTable(ctx, object, tableOptions) } +func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, versionID uuid.UUID) error { + if ctx == nil { + return fmt.Errorf("assertion failed: context must not be nil") + } + if sdk == nil { + return fmt.Errorf("assertion failed: codersdk client must not be nil") + } + if versionID == uuid.Nil { + return fmt.Errorf("assertion failed: template version ID must not be nil") + } + + waitConfig, waitConfigErr := loadTemplateVersionBuildWaitConfigFromEnv() + if waitConfigErr != nil { + return waitConfigErr + } + + waitCtx, cancel := context.WithTimeout(ctx, waitConfig.waitTimeout) + defer cancel() + + pollInterval := waitConfig.initialPoll + pollStartTime := time.Now() + + lastStatus := codersdk.ProvisionerJobUnknown + lastError := "" + for { + version, err := sdk.TemplateVersion(waitCtx, versionID) + if err != nil { + if waitCtx.Err() != nil { + return fmt.Errorf( + "template version %q build wait canceled after status %q: %w", + versionID.String(), + lastStatus, + waitCtx.Err(), + ) + } + return fmt.Errorf("fetch template version %q: %w", versionID.String(), err) + } + + status := version.Job.Status + if status == "" { + status = codersdk.ProvisionerJobUnknown + } + lastStatus = status + lastError = version.Job.Error + + switch status { + case codersdk.ProvisionerJobSucceeded: + return nil + case codersdk.ProvisionerJobFailed, codersdk.ProvisionerJobCanceled: + if lastError == "" { + lastError = "template version build did not succeed" + } + return fmt.Errorf( + "template version %q build ended with status %q: %s", + versionID.String(), + status, + lastError, + ) + case codersdk.ProvisionerJobPending, codersdk.ProvisionerJobRunning, codersdk.ProvisionerJobCanceling, codersdk.ProvisionerJobUnknown: + // Keep polling below. + default: + return fmt.Errorf( + "assertion failed: unexpected template version build status %q for version %q", + status, + versionID.String(), + ) + } + + if waitConfig.backoffAfter > 0 && time.Since(pollStartTime) >= waitConfig.backoffAfter && pollInterval < waitConfig.maxPollInterval { + nextPollInterval := pollInterval * 2 + if nextPollInterval <= 0 || nextPollInterval > waitConfig.maxPollInterval { + nextPollInterval = waitConfig.maxPollInterval + } + pollInterval = nextPollInterval + } + + select { + case <-waitCtx.Done(): + if lastError == "" { + return fmt.Errorf( + "template version %q did not succeed within %s (last status: %q): %w", + versionID.String(), + waitConfig.waitTimeout, + lastStatus, + waitCtx.Err(), + ) + } + return fmt.Errorf( + "template version %q did not succeed within %s (last status: %q, last error: %s): %w", + versionID.String(), + waitConfig.waitTimeout, + lastStatus, + lastError, + waitCtx.Err(), + ) + case <-time.After(pollInterval): + } + } +} + +func loadTemplateVersionBuildWaitConfigFromEnv() (templateVersionBuildWaitConfig, error) { + waitTimeout, waitTimeoutErr := parseDurationEnvOrDefault(templateVersionBuildWaitTimeoutEnv, defaultTemplateVersionBuildWaitTimeout) + if waitTimeoutErr != nil { + return templateVersionBuildWaitConfig{}, waitTimeoutErr + } + if waitTimeout <= 0 { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s must be > 0, got %s", + templateVersionBuildWaitTimeoutEnv, + waitTimeout, + ) + } + + backoffAfter, backoffAfterErr := parseDurationEnvOrDefault(templateVersionBuildBackoffAfterEnv, defaultTemplateVersionBuildBackoffAfter) + if backoffAfterErr != nil { + return templateVersionBuildWaitConfig{}, backoffAfterErr + } + if backoffAfter < 0 { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s must be >= 0, got %s", + templateVersionBuildBackoffAfterEnv, + backoffAfter, + ) + } + if backoffAfter > waitTimeout { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s (%s) must be <= %s (%s)", + templateVersionBuildBackoffAfterEnv, + backoffAfter, + templateVersionBuildWaitTimeoutEnv, + waitTimeout, + ) + } + + initialPollInterval, initialPollIntervalErr := parseDurationEnvOrDefault(templateVersionBuildInitialPollIntervalEnv, defaultTemplateVersionBuildInitialPoll) + if initialPollIntervalErr != nil { + return templateVersionBuildWaitConfig{}, initialPollIntervalErr + } + if initialPollInterval <= 0 { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s must be > 0, got %s", + templateVersionBuildInitialPollIntervalEnv, + initialPollInterval, + ) + } + + maxPollInterval, maxPollIntervalErr := parseDurationEnvOrDefault(templateVersionBuildMaxPollIntervalEnv, defaultTemplateVersionBuildMaxPollInterval) + if maxPollIntervalErr != nil { + return templateVersionBuildWaitConfig{}, maxPollIntervalErr + } + if maxPollInterval <= 0 { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s must be > 0, got %s", + templateVersionBuildMaxPollIntervalEnv, + maxPollInterval, + ) + } + if maxPollInterval < initialPollInterval { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s (%s) must be >= %s (%s)", + templateVersionBuildMaxPollIntervalEnv, + maxPollInterval, + templateVersionBuildInitialPollIntervalEnv, + initialPollInterval, + ) + } + + return templateVersionBuildWaitConfig{ + waitTimeout: waitTimeout, + backoffAfter: backoffAfter, + initialPoll: initialPollInterval, + maxPollInterval: maxPollInterval, + }, nil +} + +func parseDurationEnvOrDefault(envName string, defaultValue time.Duration) (time.Duration, error) { + if envName == "" { + return 0, fmt.Errorf("assertion failed: environment variable name must not be empty") + } + + rawValue := strings.TrimSpace(os.Getenv(envName)) + if rawValue == "" { + return defaultValue, nil + } + + parsedValue, parseErr := time.ParseDuration(rawValue) + if parseErr != nil { + return 0, fmt.Errorf("parse %s=%q: %w", envName, rawValue, parseErr) + } + + return parsedValue, nil +} + func (s *TemplateStorage) clientForNamespace(ctx context.Context, namespace string) (*codersdk.Client, error) { if s.provider == nil { return nil, fmt.Errorf("assertion failed: template client provider must not be nil") From b4629664eb192637aa3bd6a5e8a90973240c1230 Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Fri, 13 Feb 2026 09:44:46 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=A4=96=20fix:=20map=20template=20buil?= =?UTF-8?q?d=20wait=20failures=20to=20status=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preserve Kubernetes API error mapping for template build failures/timeouts during active-version promotion. --- internal/aggregated/storage/storage_test.go | 123 ++++++++++++++++ internal/aggregated/storage/template.go | 148 ++++++++++++++++---- 2 files changed, 242 insertions(+), 29 deletions(-) diff --git a/internal/aggregated/storage/storage_test.go b/internal/aggregated/storage/storage_test.go index f384f5f3..599478c4 100644 --- a/internal/aggregated/storage/storage_test.go +++ b/internal/aggregated/storage/storage_test.go @@ -398,6 +398,117 @@ func TestTemplateStorageUpdateWaitsForTemplateVersionBuildBeforePromotion(t *tes } } +func TestTemplateStorageUpdateReturnsBadRequestWhenTemplateVersionBuildFails(t *testing.T) { + t.Parallel() + + server, state := newMockCoderServer(t) + defer server.Close() + + templateStorage := NewTemplateStorage(newTestClientProvider(t, server.URL)) + ctx := namespacedContext("control-plane") + + createObj := &aggregationv1alpha1.CoderTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "acme.failed-build-template"}, + Spec: aggregationv1alpha1.CoderTemplateSpec{ + Organization: "acme", + Files: map[string]string{"main.tf": "resource \"null_resource\" \"initial\" {}"}, + }, + } + createdObj, err := templateStorage.Create(ctx, createObj, rest.ValidateAllObjectFunc, nil) + if err != nil { + t.Fatalf("expected template create to succeed: %v", err) + } + createdTemplate := createdObj.(*aggregationv1alpha1.CoderTemplate) + + state.setNextCreatedTemplateVersionStatus(codersdk.ProvisionerJobFailed) + + desiredTemplate := createdTemplate.DeepCopy() + desiredTemplate.Spec.Files = map[string]string{"main.tf": "resource \"null_resource\" \"updated\" {}"} + + updatedObj, created, updateErr := templateStorage.Update( + ctx, + desiredTemplate.Name, + testUpdatedObjectInfo{obj: desiredTemplate}, + nil, + rest.ValidateAllObjectUpdateFunc, + false, + nil, + ) + if updateErr == nil { + t.Fatal("expected update to fail when template version build fails") + } + if updatedObj != nil { + t.Fatalf("expected nil updated object on failure, got %T", updatedObj) + } + if created { + t.Fatal("expected update created=false") + } + if !apierrors.IsBadRequest(updateErr) { + t.Fatalf("expected BadRequest for failed build, got %v", updateErr) + } + assertTopLevelStatusError(t, updateErr) + if !strings.Contains(updateErr.Error(), "build ended with status") { + t.Fatalf("expected failed-build error message, got %v", updateErr) + } +} + +func TestTemplateStorageUpdateReturnsBadRequestWhenTemplateVersionBuildWaitTimesOut(t *testing.T) { + server, state := newMockCoderServer(t) + defer server.Close() + + templateStorage := NewTemplateStorage(newTestClientProvider(t, server.URL)) + ctx := namespacedContext("control-plane") + + createObj := &aggregationv1alpha1.CoderTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "acme.timeout-build-template"}, + Spec: aggregationv1alpha1.CoderTemplateSpec{ + Organization: "acme", + Files: map[string]string{"main.tf": "resource \"null_resource\" \"initial\" {}"}, + }, + } + createdObj, err := templateStorage.Create(ctx, createObj, rest.ValidateAllObjectFunc, nil) + if err != nil { + t.Fatalf("expected template create to succeed: %v", err) + } + createdTemplate := createdObj.(*aggregationv1alpha1.CoderTemplate) + + t.Setenv(templateVersionBuildWaitTimeoutEnv, "120ms") + t.Setenv(templateVersionBuildBackoffAfterEnv, "0s") + t.Setenv(templateVersionBuildInitialPollIntervalEnv, "10ms") + t.Setenv(templateVersionBuildMaxPollIntervalEnv, "20ms") + + state.setNextCreatedTemplateVersionStatus(codersdk.ProvisionerJobPending) + + desiredTemplate := createdTemplate.DeepCopy() + desiredTemplate.Spec.Files = map[string]string{"main.tf": "resource \"null_resource\" \"updated\" {}"} + + updatedObj, created, updateErr := templateStorage.Update( + ctx, + desiredTemplate.Name, + testUpdatedObjectInfo{obj: desiredTemplate}, + nil, + rest.ValidateAllObjectUpdateFunc, + false, + nil, + ) + if updateErr == nil { + t.Fatal("expected update to fail when template version build wait times out") + } + if updatedObj != nil { + t.Fatalf("expected nil updated object on timeout, got %T", updatedObj) + } + if created { + t.Fatal("expected update created=false") + } + if !apierrors.IsBadRequest(updateErr) { + t.Fatalf("expected BadRequest for build wait timeout, got %v", updateErr) + } + assertTopLevelStatusError(t, updateErr) + if !strings.Contains(updateErr.Error(), "did not succeed within") { + t.Fatalf("expected timeout error message, got %v", updateErr) + } +} + func TestLoadTemplateVersionBuildWaitConfigFromEnvDefaults(t *testing.T) { t.Setenv(templateVersionBuildWaitTimeoutEnv, "") t.Setenv(templateVersionBuildBackoffAfterEnv, "") @@ -3082,6 +3193,18 @@ func (s *mockCoderServerState) setFailActiveVersionPromotion(fail bool) { s.failActiveVersionPromotion = fail } +func (s *mockCoderServerState) setNextCreatedTemplateVersionStatus(status codersdk.ProvisionerJobStatus) { + if status == "" { + panic("assertion failed: template version status must not be empty") + } + + s.mu.Lock() + defer s.mu.Unlock() + + s.nextTemplateVersionInitialStatus = status + s.nextTemplateVersionPendingPolls = 0 +} + func (s *mockCoderServerState) setNextCreatedTemplateVersionPendingForPolls(polls int) { if polls <= 0 { panic(fmt.Sprintf("assertion failed: pending poll count must be > 0, got %d", polls)) diff --git a/internal/aggregated/storage/template.go b/internal/aggregated/storage/template.go index a2693d3d..88b83990 100644 --- a/internal/aggregated/storage/template.go +++ b/internal/aggregated/storage/template.go @@ -3,6 +3,7 @@ package storage import ( "bytes" "context" + "errors" "fmt" "os" "reflect" @@ -497,7 +498,7 @@ func (s *TemplateStorage) Update( } if waitErr := waitForTemplateVersionBuild(ctx, sdk, newVersion.ID); waitErr != nil { - return nil, false, fmt.Errorf("wait for template version %q build: %w", newVersion.ID.String(), waitErr) + return nil, false, mapTemplateVersionBuildWaitError(waitErr, name) } if err := sdk.UpdateActiveTemplateVersion(ctx, templateID, codersdk.UpdateActiveTemplateVersion{ID: newVersion.ID}); err != nil { @@ -596,6 +597,102 @@ func (s *TemplateStorage) ConvertToTable(ctx context.Context, object, tableOptio return s.tableConvertor.ConvertToTable(ctx, object, tableOptions) } +type templateVersionBuildTerminalError struct { + versionID uuid.UUID + status codersdk.ProvisionerJobStatus + detail string +} + +func (e *templateVersionBuildTerminalError) Error() string { + if e == nil { + panic("assertion failed: template version build terminal error must not be nil") + } + if e.versionID == uuid.Nil { + panic("assertion failed: template version build terminal error must include version ID") + } + if e.status == "" { + panic("assertion failed: template version build terminal error must include status") + } + + detail := e.detail + if detail == "" { + detail = "template version build did not succeed" + } + + return fmt.Sprintf( + "template version %q build ended with status %q: %s", + e.versionID.String(), + e.status, + detail, + ) +} + +type templateVersionBuildWaitTimeoutError struct { + versionID uuid.UUID + waitTimeout time.Duration + lastStatus codersdk.ProvisionerJobStatus + lastError string + cause error +} + +func (e *templateVersionBuildWaitTimeoutError) Error() string { + if e == nil { + panic("assertion failed: template version build timeout error must not be nil") + } + if e.versionID == uuid.Nil { + panic("assertion failed: template version build timeout error must include version ID") + } + if e.waitTimeout <= 0 { + panic("assertion failed: template version build timeout error must include positive timeout") + } + if e.lastStatus == "" { + panic("assertion failed: template version build timeout error must include last status") + } + if e.cause == nil { + panic("assertion failed: template version build timeout error must include cause") + } + + if e.lastError == "" { + return fmt.Sprintf( + "template version %q did not succeed within %s (last status: %q): %v", + e.versionID.String(), + e.waitTimeout, + e.lastStatus, + e.cause, + ) + } + + return fmt.Sprintf( + "template version %q did not succeed within %s (last status: %q, last error: %s): %v", + e.versionID.String(), + e.waitTimeout, + e.lastStatus, + e.lastError, + e.cause, + ) +} + +func mapTemplateVersionBuildWaitError(waitErr error, templateName string) error { + if waitErr == nil { + return fmt.Errorf("assertion failed: wait error must not be nil") + } + if templateName == "" { + return fmt.Errorf("assertion failed: template name must not be empty") + } + + var terminalErr *templateVersionBuildTerminalError + if errors.As(waitErr, &terminalErr) { + return apierrors.NewBadRequest(terminalErr.Error()) + } + + var timeoutErr *templateVersionBuildWaitTimeoutError + if errors.As(waitErr, &timeoutErr) { + return apierrors.NewBadRequest(timeoutErr.Error()) + } + + return coder.MapCoderError(waitErr, aggregationv1alpha1.Resource("codertemplates"), templateName) +} + func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, versionID uuid.UUID) error { if ctx == nil { return fmt.Errorf("assertion failed: context must not be nil") @@ -624,12 +721,13 @@ func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, vers version, err := sdk.TemplateVersion(waitCtx, versionID) if err != nil { if waitCtx.Err() != nil { - return fmt.Errorf( - "template version %q build wait canceled after status %q: %w", - versionID.String(), - lastStatus, - waitCtx.Err(), - ) + return &templateVersionBuildWaitTimeoutError{ + versionID: versionID, + waitTimeout: waitConfig.waitTimeout, + lastStatus: lastStatus, + lastError: lastError, + cause: waitCtx.Err(), + } } return fmt.Errorf("fetch template version %q: %w", versionID.String(), err) } @@ -648,12 +746,11 @@ func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, vers if lastError == "" { lastError = "template version build did not succeed" } - return fmt.Errorf( - "template version %q build ended with status %q: %s", - versionID.String(), - status, - lastError, - ) + return &templateVersionBuildTerminalError{ + versionID: versionID, + status: status, + detail: lastError, + } case codersdk.ProvisionerJobPending, codersdk.ProvisionerJobRunning, codersdk.ProvisionerJobCanceling, codersdk.ProvisionerJobUnknown: // Keep polling below. default: @@ -674,23 +771,16 @@ func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, vers select { case <-waitCtx.Done(): - if lastError == "" { - return fmt.Errorf( - "template version %q did not succeed within %s (last status: %q): %w", - versionID.String(), - waitConfig.waitTimeout, - lastStatus, - waitCtx.Err(), - ) + if waitCtx.Err() == nil { + return fmt.Errorf("assertion failed: wait context finished without an error") + } + return &templateVersionBuildWaitTimeoutError{ + versionID: versionID, + waitTimeout: waitConfig.waitTimeout, + lastStatus: lastStatus, + lastError: lastError, + cause: waitCtx.Err(), } - return fmt.Errorf( - "template version %q did not succeed within %s (last status: %q, last error: %s): %w", - versionID.String(), - waitConfig.waitTimeout, - lastStatus, - lastError, - waitCtx.Err(), - ) case <-time.After(pollInterval): } } From 63db29c352eaa9aa04c6664cf437eab2b6388860 Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Fri, 13 Feb 2026 10:02:38 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=A4=96=20fix:=20align=20template=20bu?= =?UTF-8?q?ild=20wait=20timeout=20semantics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Codex review by aligning aggregated API request timeout defaults with template build waits and preserving timeout/cancel semantics when mapping build wait errors. - set aggregated API request timeout default to 30m - distinguish internal build wait expiry from request lifecycle cancellation/deadline - keep internal wait expiry as BadRequest while returning Timeout for canceled/deadline contexts - add coverage for request-timeout mapping and config defaults - document request-timeout relationship in aggregated deployment guide --- _Generated with `mux` • Model: `openai:gpt-5.3-codex` • Thinking: `xhigh` • Cost: `$0.57`_ --- docs/how-to/deploy-aggregated-apiserver.md | 4 + internal/aggregated/storage/storage_test.go | 117 +++++++++++++++++- internal/aggregated/storage/template.go | 76 ++++++++++-- internal/app/apiserverapp/apiserverapp.go | 3 + .../app/apiserverapp/apiserverapp_test.go | 33 +++++ 5 files changed, 224 insertions(+), 9 deletions(-) diff --git a/docs/how-to/deploy-aggregated-apiserver.md b/docs/how-to/deploy-aggregated-apiserver.md index fd4766c6..9eda72f5 100644 --- a/docs/how-to/deploy-aggregated-apiserver.md +++ b/docs/how-to/deploy-aggregated-apiserver.md @@ -103,6 +103,10 @@ The wait behavior is configurable via environment variables on the - `CODER_K8S_TEMPLATE_BUILD_INITIAL_POLL_INTERVAL` (default: `2s`) - `CODER_K8S_TEMPLATE_BUILD_MAX_POLL_INTERVAL` (default: `10s`) +- Aggregated API request timeout defaults to `30m`; keep it at or above the build wait timeout. + +- `CODER_K8S_TEMPLATE_BUILD_WAIT_TIMEOUT` values above `30m` are rejected, because they cannot exceed the API request timeout. + Behavior: - Polls at the initial interval for the first 2 minutes. diff --git a/internal/aggregated/storage/storage_test.go b/internal/aggregated/storage/storage_test.go index 599478c4..d5bfef3a 100644 --- a/internal/aggregated/storage/storage_test.go +++ b/internal/aggregated/storage/storage_test.go @@ -509,6 +509,110 @@ func TestTemplateStorageUpdateReturnsBadRequestWhenTemplateVersionBuildWaitTimes } } +func TestTemplateStorageUpdateReturnsTimeoutWhenRequestContextTimesOut(t *testing.T) { + server, state := newMockCoderServer(t) + defer server.Close() + + templateStorage := NewTemplateStorage(newTestClientProvider(t, server.URL)) + baseCtx := namespacedContext("control-plane") + + createObj := &aggregationv1alpha1.CoderTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "acme.request-timeout-template"}, + Spec: aggregationv1alpha1.CoderTemplateSpec{ + Organization: "acme", + Files: map[string]string{"main.tf": "resource \"null_resource\" \"initial\" {}"}, + }, + } + createdObj, err := templateStorage.Create(baseCtx, createObj, rest.ValidateAllObjectFunc, nil) + if err != nil { + t.Fatalf("expected template create to succeed: %v", err) + } + createdTemplate := createdObj.(*aggregationv1alpha1.CoderTemplate) + + t.Setenv(templateVersionBuildWaitTimeoutEnv, "5s") + t.Setenv(templateVersionBuildBackoffAfterEnv, "0s") + t.Setenv(templateVersionBuildInitialPollIntervalEnv, "10ms") + t.Setenv(templateVersionBuildMaxPollIntervalEnv, "20ms") + + state.setNextCreatedTemplateVersionStatus(codersdk.ProvisionerJobPending) + + updateCtx, cancel := context.WithTimeout(baseCtx, 120*time.Millisecond) + defer cancel() + + desiredTemplate := createdTemplate.DeepCopy() + desiredTemplate.Spec.Files = map[string]string{"main.tf": "resource \"null_resource\" \"updated\" {}"} + + updatedObj, created, updateErr := templateStorage.Update( + updateCtx, + desiredTemplate.Name, + testUpdatedObjectInfo{obj: desiredTemplate}, + nil, + rest.ValidateAllObjectUpdateFunc, + false, + nil, + ) + if updateErr == nil { + t.Fatal("expected update to fail when request context times out") + } + if updatedObj != nil { + t.Fatalf("expected nil updated object on request timeout, got %T", updatedObj) + } + if created { + t.Fatal("expected update created=false") + } + if apierrors.IsBadRequest(updateErr) { + t.Fatalf("expected non-BadRequest error for request context timeout, got %v", updateErr) + } + if !apierrors.IsTimeout(updateErr) { + t.Fatalf("expected timeout error for request context timeout, got %v", updateErr) + } + assertTopLevelStatusError(t, updateErr) +} + +func TestMapTemplateVersionBuildWaitErrorMapsRequestContextFailuresToTimeout(t *testing.T) { + tests := []struct { + name string + cause error + }{ + { + name: "deadline exceeded", + cause: context.DeadlineExceeded, + }, + { + name: "canceled", + cause: context.Canceled, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + mappedErr := mapTemplateVersionBuildWaitError( + &templateVersionBuildWaitTimeoutError{ + versionID: uuid.New(), + waitTimeout: time.Minute, + lastStatus: codersdk.ProvisionerJobRunning, + cause: tt.cause, + }, + "acme.template", + ) + + if mappedErr == nil { + t.Fatal("expected mapped timeout error") + } + if apierrors.IsBadRequest(mappedErr) { + t.Fatalf("expected timeout error, got BadRequest: %v", mappedErr) + } + if !apierrors.IsTimeout(mappedErr) { + t.Fatalf("expected timeout error, got %v", mappedErr) + } + assertTopLevelStatusError(t, mappedErr) + }) + } +} + func TestLoadTemplateVersionBuildWaitConfigFromEnvDefaults(t *testing.T) { t.Setenv(templateVersionBuildWaitTimeoutEnv, "") t.Setenv(templateVersionBuildBackoffAfterEnv, "") @@ -534,12 +638,23 @@ func TestLoadTemplateVersionBuildWaitConfigFromEnvDefaults(t *testing.T) { } func TestLoadTemplateVersionBuildWaitConfigFromEnvInvalid(t *testing.T) { - t.Setenv(templateVersionBuildWaitTimeoutEnv, "1m") + t.Setenv(templateVersionBuildWaitTimeoutEnv, "31m") t.Setenv(templateVersionBuildBackoffAfterEnv, "2m") t.Setenv(templateVersionBuildInitialPollIntervalEnv, "500ms") t.Setenv(templateVersionBuildMaxPollIntervalEnv, "10s") _, err := loadTemplateVersionBuildWaitConfigFromEnv() + if err == nil { + t.Fatal("expected max wait timeout validation to fail") + } + if !strings.Contains(err.Error(), templateVersionBuildWaitTimeoutEnv) { + t.Fatalf("expected error to mention %s, got %v", templateVersionBuildWaitTimeoutEnv, err) + } + + t.Setenv(templateVersionBuildWaitTimeoutEnv, "1m") + t.Setenv(templateVersionBuildBackoffAfterEnv, "2m") + + _, err = loadTemplateVersionBuildWaitConfigFromEnv() if err == nil { t.Fatal("expected invalid wait config to fail") } diff --git a/internal/aggregated/storage/template.go b/internal/aggregated/storage/template.go index 88b83990..16c8e85c 100644 --- a/internal/aggregated/storage/template.go +++ b/internal/aggregated/storage/template.go @@ -32,6 +32,8 @@ const ( defaultTemplateVersionBuildBackoffAfter = 2 * time.Minute defaultTemplateVersionBuildInitialPoll = 2 * time.Second defaultTemplateVersionBuildMaxPollInterval = 10 * time.Second + // MaxTemplateVersionBuildWaitTimeout keeps template build waits within aggregated API request deadlines. + MaxTemplateVersionBuildWaitTimeout = 30 * time.Minute ) type templateVersionBuildWaitConfig struct { @@ -50,6 +52,8 @@ var ( _ rest.GracefulDeleter = (*TemplateStorage)(nil) _ rest.Scoper = (*TemplateStorage)(nil) _ rest.SingularNameProvider = (*TemplateStorage)(nil) + + errTemplateVersionBuildWaitTimeoutExceeded = errors.New("template version build wait timeout exceeded") ) // TemplateStorage provides codersdk-backed CoderTemplate objects. @@ -687,7 +691,14 @@ func mapTemplateVersionBuildWaitError(waitErr error, templateName string) error var timeoutErr *templateVersionBuildWaitTimeoutError if errors.As(waitErr, &timeoutErr) { - return apierrors.NewBadRequest(timeoutErr.Error()) + switch { + case errors.Is(timeoutErr.cause, errTemplateVersionBuildWaitTimeoutExceeded): + return apierrors.NewBadRequest(timeoutErr.Error()) + case errors.Is(timeoutErr.cause, context.DeadlineExceeded), errors.Is(timeoutErr.cause, context.Canceled): + return apierrors.NewTimeoutError(timeoutErr.Error(), 0) + default: + return apierrors.NewInternalError(timeoutErr) + } } return coder.MapCoderError(waitErr, aggregationv1alpha1.Resource("codertemplates"), templateName) @@ -709,7 +720,15 @@ func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, vers return waitConfigErr } - waitCtx, cancel := context.WithTimeout(ctx, waitConfig.waitTimeout) + effectiveWaitTimeout := waitConfig.waitTimeout + if requestDeadline, hasRequestDeadline := ctx.Deadline(); hasRequestDeadline { + remaining := time.Until(requestDeadline) + if remaining > 0 && remaining < effectiveWaitTimeout { + effectiveWaitTimeout = remaining + } + } + + waitCtx, cancel := context.WithTimeout(ctx, effectiveWaitTimeout) defer cancel() pollInterval := waitConfig.initialPoll @@ -721,12 +740,17 @@ func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, vers version, err := sdk.TemplateVersion(waitCtx, versionID) if err != nil { if waitCtx.Err() != nil { + timeoutCause, timeoutCauseErr := templateVersionBuildWaitTimeoutCause(ctx, waitCtx) + if timeoutCauseErr != nil { + return timeoutCauseErr + } + return &templateVersionBuildWaitTimeoutError{ versionID: versionID, - waitTimeout: waitConfig.waitTimeout, + waitTimeout: effectiveWaitTimeout, lastStatus: lastStatus, lastError: lastError, - cause: waitCtx.Err(), + cause: timeoutCause, } } return fmt.Errorf("fetch template version %q: %w", versionID.String(), err) @@ -771,21 +795,47 @@ func waitForTemplateVersionBuild(ctx context.Context, sdk *codersdk.Client, vers select { case <-waitCtx.Done(): - if waitCtx.Err() == nil { - return fmt.Errorf("assertion failed: wait context finished without an error") + timeoutCause, timeoutCauseErr := templateVersionBuildWaitTimeoutCause(ctx, waitCtx) + if timeoutCauseErr != nil { + return timeoutCauseErr } + return &templateVersionBuildWaitTimeoutError{ versionID: versionID, - waitTimeout: waitConfig.waitTimeout, + waitTimeout: effectiveWaitTimeout, lastStatus: lastStatus, lastError: lastError, - cause: waitCtx.Err(), + cause: timeoutCause, } case <-time.After(pollInterval): } } } +func templateVersionBuildWaitTimeoutCause(requestCtx, waitCtx context.Context) (error, error) { + if requestCtx == nil { + return nil, fmt.Errorf("assertion failed: request context must not be nil") + } + if waitCtx == nil { + return nil, fmt.Errorf("assertion failed: wait context must not be nil") + } + + waitErr := waitCtx.Err() + if waitErr == nil { + return nil, fmt.Errorf("assertion failed: wait context finished without an error") + } + + if requestErr := requestCtx.Err(); requestErr != nil { + return requestErr, nil + } + + if errors.Is(waitErr, context.DeadlineExceeded) { + return errTemplateVersionBuildWaitTimeoutExceeded, nil + } + + return waitErr, nil +} + func loadTemplateVersionBuildWaitConfigFromEnv() (templateVersionBuildWaitConfig, error) { waitTimeout, waitTimeoutErr := parseDurationEnvOrDefault(templateVersionBuildWaitTimeoutEnv, defaultTemplateVersionBuildWaitTimeout) if waitTimeoutErr != nil { @@ -799,6 +849,16 @@ func loadTemplateVersionBuildWaitConfigFromEnv() (templateVersionBuildWaitConfig ) } + if waitTimeout > MaxTemplateVersionBuildWaitTimeout { + return templateVersionBuildWaitConfig{}, fmt.Errorf( + "assertion failed: %s (%s) must be <= %s (%s)", + templateVersionBuildWaitTimeoutEnv, + waitTimeout, + "aggregated API request timeout", + MaxTemplateVersionBuildWaitTimeout, + ) + } + backoffAfter, backoffAfterErr := parseDurationEnvOrDefault(templateVersionBuildBackoffAfterEnv, defaultTemplateVersionBuildBackoffAfter) if backoffAfterErr != nil { return templateVersionBuildWaitConfig{}, backoffAfterErr diff --git a/internal/app/apiserverapp/apiserverapp.go b/internal/app/apiserverapp/apiserverapp.go index 44bed2bd..e2727e9c 100644 --- a/internal/app/apiserverapp/apiserverapp.go +++ b/internal/app/apiserverapp/apiserverapp.go @@ -38,6 +38,8 @@ const ( // DefaultSecureServingPort is the secure serving port used by aggregated-apiserver mode. DefaultSecureServingPort = 6443 serverName = "coder-k8s-aggregated-apiserver" + // defaultRequestTimeout keeps API request lifetimes aligned with template build wait limits. + defaultRequestTimeout = storage.MaxTemplateVersionBuildWaitTimeout ) // Options configures aggregated-apiserver bootstrap behavior. @@ -192,6 +194,7 @@ func NewRecommendedConfig( recommendedConfig.RuleResolver = authz recommendedConfig.EffectiveVersion = apiservercompatibility.DefaultBuildEffectiveVersion() recommendedConfig.SkipOpenAPIInstallation = true + recommendedConfig.RequestTimeout = defaultRequestTimeout definitionNamer := apiserveropenapi.NewDefinitionNamer(scheme) recommendedConfig.OpenAPIConfig = genericapiserver.DefaultOpenAPIConfig(getOpenAPIDefinitions, definitionNamer) diff --git a/internal/app/apiserverapp/apiserverapp_test.go b/internal/app/apiserverapp/apiserverapp_test.go index 26f66b8b..d91c9bc3 100644 --- a/internal/app/apiserverapp/apiserverapp_test.go +++ b/internal/app/apiserverapp/apiserverapp_test.go @@ -128,6 +128,39 @@ func TestInstallAPIGroupRegistersDiscovery(t *testing.T) { } } +func TestNewRecommendedConfigSetsExtendedRequestTimeout(t *testing.T) { + t.Helper() + + scheme := NewScheme() + if scheme == nil { + t.Fatal("expected non-nil scheme") + } + codecs := serializer.NewCodecFactory(scheme) + + listener, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("create test listener: %v", err) + } + defer func() { + _ = listener.Close() + }() + + secureServingOptions := genericoptions.NewSecureServingOptions() + secureServingOptions.Listener = listener + secureServingOptions.BindPort = 0 + secureServingOptions.ServerCert.CertDirectory = "" + secureServingOptions.ServerCert.PairName = "" + + recommendedConfig, err := NewRecommendedConfig(scheme, codecs, secureServingOptions) + if err != nil { + t.Fatalf("build recommended config: %v", err) + } + + if got, want := recommendedConfig.RequestTimeout, defaultRequestTimeout; got != want { + t.Fatalf("expected request timeout %s, got %s", want, got) + } +} + func TestBuildClientProviderDefersMissingCoderConfigAsServiceUnavailable(t *testing.T) { t.Parallel()