From 4d7af26508051164cdc769a144536a52551879a1 Mon Sep 17 00:00:00 2001 From: Reflex Date: Fri, 12 Jun 2026 14:47:12 +0000 Subject: [PATCH 1/2] fix(smoketests): retry blueprint creation on transient infra-side build failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dev-cluster blueprint builds intermittently fail with infra-side errors — BuildKit i/o timeouts resolving base-image manifests from the in-cluster registry mirror, and stage-context S3 download transit errors. The SDK surfaces those as a RunloopError ("Blueprint <id> is in non-complete state failed"), which previously cascaded through the shared `beforeAll` fixture in `object-oriented/blueprint.test.ts` and failed all six lifecycle tests on a single build flake. Adds `createBlueprintWithRetry` in `tests/smoketests/utils.ts` that catches that exact error shape, best-effort deletes the failed blueprint, and retries (default 3 attempts, 5s backoff). Deterministic build errors fail every attempt and still surface as a test failure — only flakes recover. Wraps the six `sdk.blueprint.create` call sites in the object-oriented blueprint smoketest. 
Co-Authored-By: Claude Opus 4.7 --- .../object-oriented/blueprint.test.ts | 27 ++++++--- tests/smoketests/utils.ts | 60 ++++++++++++++++++- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/tests/smoketests/object-oriented/blueprint.test.ts b/tests/smoketests/object-oriented/blueprint.test.ts index 6d7878f3b..78962e61c 100644 --- a/tests/smoketests/object-oriented/blueprint.test.ts +++ b/tests/smoketests/object-oriented/blueprint.test.ts @@ -1,4 +1,11 @@ -import { SHORT_TIMEOUT, LONG_TIMEOUT, uniqueName, makeClientSDK, cleanUpPolicy } from '../utils'; +import { + SHORT_TIMEOUT, + LONG_TIMEOUT, + uniqueName, + makeClientSDK, + cleanUpPolicy, + createBlueprintWithRetry, +} from '../utils'; import { Blueprint, Devbox, NetworkPolicy, StorageObject } from '@runloop/api-client/sdk'; const sdk = makeClientSDK(); @@ -10,7 +17,8 @@ describe('smoketest: object-oriented blueprint', () => { // Create blueprint in beforeAll to avoid test order dependency beforeAll(async () => { - blueprint = await sdk.blueprint.create( + blueprint = await createBlueprintWithRetry( + sdk, { name: uniqueName('sdk-blueprint'), dockerfile: 'FROM ubuntu:22.04\nRUN apt-get update && apt-get install -y curl', @@ -136,7 +144,8 @@ describe('smoketest: object-oriented blueprint', () => { }); // Create blueprint that uses the uploaded object as build context - blueprint = await sdk.blueprint.create( + blueprint = await createBlueprintWithRetry( + sdk, { name: uniqueName('sdk-blueprint-context'), dockerfile: `FROM ubuntu:22.04 @@ -218,7 +227,8 @@ COPY . .`, if (!contextDir) { throw new Error('Context directory not created'); } - blueprint = await sdk.blueprint.create( + blueprint = await createBlueprintWithRetry( + sdk, { name: uniqueName('sdk-blueprint-context-dir'), dockerfile: `FROM ubuntu:22.04 @@ -283,7 +293,8 @@ COPY . 
.`, // First create a blueprint let blueprint: Blueprint | undefined; try { - blueprint = await sdk.blueprint.create( + blueprint = await createBlueprintWithRetry( + sdk, { name: uniqueName('sdk-blueprint-retrieve'), dockerfile: 'FROM ubuntu:22.04', @@ -320,7 +331,8 @@ COPY . .`, expect(policy.id).toBeTruthy(); // Create blueprint with network_policy_id at top level (for build) - blueprint = await sdk.blueprint.create( + blueprint = await createBlueprintWithRetry( + sdk, { name: uniqueName('sdk-blueprint-with-build-policy'), dockerfile: 'FROM ubuntu:22.04\nRUN apt-get update', @@ -361,7 +373,8 @@ COPY . .`, expect(policy.id).toBeTruthy(); // Create blueprint with launch_parameters including network_policy_id - blueprint = await sdk.blueprint.create( + blueprint = await createBlueprintWithRetry( + sdk, { name: uniqueName('sdk-blueprint-with-launch-policy'), dockerfile: 'FROM ubuntu:22.04', diff --git a/tests/smoketests/utils.ts b/tests/smoketests/utils.ts index d58b7cf5e..f3c93fd81 100644 --- a/tests/smoketests/utils.ts +++ b/tests/smoketests/utils.ts @@ -1,5 +1,8 @@ import { Runloop, RunloopSDK } from '@runloop/api-client'; -import { NetworkPolicy, GatewayConfig, McpConfig } from '@runloop/api-client/sdk'; +import { Blueprint, NetworkPolicy, GatewayConfig, McpConfig } from '@runloop/api-client/sdk'; +import type { CreateParams as BlueprintCreateParams } from '@runloop/api-client/sdk/blueprint'; +import type { BlueprintView } from '@runloop/api-client/resources/blueprints'; +import type { LongPollRequestOptions } from '@runloop/api-client/lib/polling'; /** * Run the smoke tests over HTTP/2 (the undici adapter) instead of the default @@ -39,6 +42,61 @@ export const SHORT_TIMEOUT = 120_000; export const MEDIUM_TIMEOUT = 300_000; export const LONG_TIMEOUT = 600_000; +/** + * Create a blueprint and retry on terminal-failed builds. 
+ * + * Dev-cluster blueprint builds intermittently fail with infra-side errors — + * registry-mirror i/o timeouts when BuildKit resolves base images, S3 transit + * errors when the stage-context container downloads the build-context object. + * The SDK surfaces these as a RunloopError ("in non-complete state failed"), + * and a single flake otherwise cascades through `beforeAll` fixtures into many + * unrelated test failures. + * + * Retries on that exact error shape. A genuinely-broken Dockerfile reaches the + * same shape but fails every attempt, so determinstic failures still fail the + * test — only flakes are masked. + */ +export async function createBlueprintWithRetry( + sdk: RunloopSDK, + params: BlueprintCreateParams, + options?: LongPollRequestOptions & { attempts?: number; retryDelayMs?: number }, +): Promise { + const { attempts = 3, retryDelayMs = 5_000, ...createOptions } = options ?? {}; + let lastErr: unknown; + for (let attempt = 1; attempt <= attempts; attempt++) { + try { + return await sdk.blueprint.create(params, createOptions); + } catch (err) { + lastErr = err; + if (!isTransientBlueprintBuildFailure(err) || attempt === attempts) throw err; + const failedId = extractBlueprintIdFromError(err); + if (failedId) { + await sdk.blueprint + .fromId(failedId) + .delete() + .catch(() => {}); + } + // eslint-disable-next-line no-console + console.warn( + `[smoketest] blueprint create attempt ${attempt}/${attempts} failed (likely infra flake), retrying in ${retryDelayMs}ms: ${(err as Error).message}`, + ); + await new Promise((r) => setTimeout(r, retryDelayMs)); + } + } + throw lastErr; +} + +const BLUEPRINT_FAILED_MSG_RE = /Blueprint (bpt_\S+) is in non-complete state failed/; + +function isTransientBlueprintBuildFailure(err: unknown): boolean { + return err instanceof Error && BLUEPRINT_FAILED_MSG_RE.test(err.message); +} + +function extractBlueprintIdFromError(err: unknown): string | undefined { + if (!(err instanceof Error)) return undefined; + 
return err.message.match(BLUEPRINT_FAILED_MSG_RE)?.[1]; +} + /** * Helper to clean up a network policy, ignoring errors if already deleted. */ From dc1d7d7440bcba12ab75480aa27ec18352931713 Mon Sep 17 00:00:00 2001 From: Reflex Date: Fri, 12 Jun 2026 15:56:07 +0000 Subject: [PATCH 2/2] =?UTF-8?q?fix(smoketests):=20keep=20infra=20flakes=20?= =?UTF-8?q?visible=20=E2=80=94=20classify=20and=20fail=20with=20diagnostic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: silently recovering from blueprint build flakes hides infra problems that also impact customers. Change the helper to PROBE for the underlying class of failure (transient vs persistent) but always surface a test failure, with a message that names the classification and points at where to investigate. The probe distinguishes: - TRANSIENT infra flake: first attempt failed, a subsequent attempt recovered. Likely the recurring registry-mirror / S3 transient described in the helper docstring. - PERSISTENT infra failure: every attempt terminal-failed. More likely a broken Dockerfile, a sustained outage, or a real bug. Either way the test fails red. The recovered blueprint (if any) is deleted so downstream steps don't accidentally see it. Co-Authored-By: Claude Opus 4.7 --- tests/smoketests/utils.ts | 127 +++++++++++++++++++++++++++++--------- 1 file changed, 97 insertions(+), 30 deletions(-) diff --git a/tests/smoketests/utils.ts b/tests/smoketests/utils.ts index f3c93fd81..717fa0d7d 100644 --- a/tests/smoketests/utils.ts +++ b/tests/smoketests/utils.ts @@ -43,51 +43,118 @@ export const MEDIUM_TIMEOUT = 300_000; export const LONG_TIMEOUT = 600_000; /** - * Create a blueprint and retry on terminal-failed builds. + * Create a blueprint and probe for infra-side flakes. 
* * Dev-cluster blueprint builds intermittently fail with infra-side errors — - * registry-mirror i/o timeouts when BuildKit resolves base images, S3 transit - * errors when the stage-context container downloads the build-context object. - * The SDK surfaces these as a RunloopError ("in non-complete state failed"), - * and a single flake otherwise cascades through `beforeAll` fixtures into many - * unrelated test failures. + * BuildKit i/o timeouts resolving base images from the in-cluster registry + * mirror, S3 transit errors when the stage-context container downloads the + * build-context object. The SDK surfaces those as a RunloopError ("Blueprint + * is in non-complete state failed"). A single flake in a `beforeAll` + * fixture otherwise cascades into many unrelated test failures, which buries + * the actual signal. * - * Retries on that exact error shape. A genuinely-broken Dockerfile reaches the - * same shape but fails every attempt, so determinstic failures still fail the - * test — only flakes are masked. + * This helper does NOT mask flakes — infra reliability is a customer-visible + * problem and the test suite needs to keep surfacing it. Behavior: + * - First attempt fails with the terminal-failed shape: retry up to + * `probeAttempts` more times to learn whether the failure is transient + * (recovers on retry) or persistent (reproduces every time). + * - The test ALWAYS fails — but with a diagnostic message that names the + * classification and points at where to investigate. + * - Other error shapes (auth, validation, etc.) re-throw unchanged. */ export async function createBlueprintWithRetry( sdk: RunloopSDK, params: BlueprintCreateParams, - options?: LongPollRequestOptions & { attempts?: number; retryDelayMs?: number }, + options?: LongPollRequestOptions & { + probeAttempts?: number; + retryDelayMs?: number; + }, ): Promise { - const { attempts = 3, retryDelayMs = 5_000, ...createOptions } = options ?? 
{}; - let lastErr: unknown; - for (let attempt = 1; attempt <= attempts; attempt++) { - try { - return await sdk.blueprint.create(params, createOptions); - } catch (err) { - lastErr = err; - if (!isTransientBlueprintBuildFailure(err) || attempt === attempts) throw err; - const failedId = extractBlueprintIdFromError(err); - if (failedId) { - await sdk.blueprint - .fromId(failedId) - .delete() - .catch(() => {}); - } - // eslint-disable-next-line no-console - console.warn( - `[smoketest] blueprint create attempt ${attempt}/${attempts} failed (likely infra flake), retrying in ${retryDelayMs}ms: ${(err as Error).message}`, - ); + const { probeAttempts = 2, retryDelayMs = 5_000, ...createOptions } = options ?? {}; + try { + return await sdk.blueprint.create(params, createOptions); + } catch (firstErr) { + if (!isTransientBlueprintBuildFailure(firstErr)) throw firstErr; + + const firstFailedId = extractBlueprintIdFromError(firstErr); + if (firstFailedId) { + await sdk.blueprint + .fromId(firstFailedId) + .delete() + .catch(() => {}); + } + + const attemptOutcomes: string[] = [`attempt 1: failed (${firstFailedId ?? 'no id'})`]; + let probeSucceededOn: number | undefined; + let probedBlueprint: Blueprint | undefined; + + for (let probe = 1; probe <= probeAttempts; probe++) { await new Promise((r) => setTimeout(r, retryDelayMs)); + try { + probedBlueprint = await sdk.blueprint.create(params, createOptions); + attemptOutcomes.push(`attempt ${probe + 1}: succeeded (${probedBlueprint.id})`); + probeSucceededOn = probe + 1; + break; + } catch (probeErr) { + if (!isTransientBlueprintBuildFailure(probeErr)) { + throw flakeError(firstErr, attemptOutcomes, 'inconclusive (later attempt threw non-build-failed error)', probeErr); + } + const probeFailedId = extractBlueprintIdFromError(probeErr); + attemptOutcomes.push(`attempt ${probe + 1}: failed (${probeFailedId ?? 
'no id'})`); + if (probeFailedId) { + await sdk.blueprint + .fromId(probeFailedId) + .delete() + .catch(() => {}); + } + } } + + // Clean up the recovered blueprint so an eventual success doesn't leak resources — + // the test is going to fail, and downstream test steps shouldn't see this blueprint. + if (probedBlueprint) { + await probedBlueprint.delete().catch(() => {}); + } + + const classification = + probeSucceededOn !== undefined ? + `TRANSIENT infra flake (attempt ${probeSucceededOn} recovered)` + : `PERSISTENT infra failure (all ${1 + probeAttempts} attempts terminal-failed)`; + + throw flakeError(firstErr, attemptOutcomes, classification); } - throw lastErr; } const BLUEPRINT_FAILED_MSG_RE = /Blueprint (bpt_\S+) is in non-complete state failed/; +const INFRA_FLAKE_INVESTIGATION_HINT = [ + 'Where to look next:', + ' - blueprint-operator dataset in Honeycomb (test env), filter blueprint_id=, look for the blueprint_build span.', + ' - Loki: {namespace=~"build-.*"} for the build pod logs (container "build" = BuildKit, "stage-context" = build-context fetch).', + ' - Known recurring infra causes:', + ' (a) BuildKit -> in-cluster registry mirror i/o timeout resolving base image manifest.', + ' (b) stage-context S3 download transient transport error (only 2 attempts in the builder today).', +].join('\n'); + +function flakeError( + firstErr: unknown, + attemptOutcomes: string[], + classification: string, + retryErr?: unknown, +): Error { + const lines = [ + `Blueprint build failed during smoketest — surfacing as test failure to keep infra signal visible.`, + `Classification: ${classification}`, + `First error: ${(firstErr as Error).message}`, + ]; + if (retryErr) lines.push(`Probe error: ${(retryErr as Error).message}`); + lines.push('Probe sequence:'); + for (const o of attemptOutcomes) lines.push(` - ${o}`); + lines.push(''); + lines.push(INFRA_FLAKE_INVESTIGATION_HINT); + return new Error(lines.join('\n')); +} + function isTransientBlueprintBuildFailure(err: 
unknown): boolean { return err instanceof Error && BLUEPRINT_FAILED_MSG_RE.test(err.message); }