diff --git a/tests/smoketests/object-oriented/blueprint.test.ts b/tests/smoketests/object-oriented/blueprint.test.ts
index 6d7878f3b..78962e61c 100644
--- a/tests/smoketests/object-oriented/blueprint.test.ts
+++ b/tests/smoketests/object-oriented/blueprint.test.ts
@@ -1,4 +1,11 @@
-import { SHORT_TIMEOUT, LONG_TIMEOUT, uniqueName, makeClientSDK, cleanUpPolicy } from '../utils';
+import {
+  SHORT_TIMEOUT,
+  LONG_TIMEOUT,
+  uniqueName,
+  makeClientSDK,
+  cleanUpPolicy,
+  createBlueprintWithRetry,
+} from '../utils';
 import { Blueprint, Devbox, NetworkPolicy, StorageObject } from '@runloop/api-client/sdk';
 
 const sdk = makeClientSDK();
@@ -10,7 +17,8 @@ describe('smoketest: object-oriented blueprint', () => {
 
     // Create blueprint in beforeAll to avoid test order dependency
     beforeAll(async () => {
-      blueprint = await sdk.blueprint.create(
+      blueprint = await createBlueprintWithRetry(
+        sdk,
         {
           name: uniqueName('sdk-blueprint'),
           dockerfile: 'FROM ubuntu:22.04\nRUN apt-get update && apt-get install -y curl',
@@ -136,7 +144,8 @@ describe('smoketest: object-oriented blueprint', () => {
         });
 
         // Create blueprint that uses the uploaded object as build context
-        blueprint = await sdk.blueprint.create(
+        blueprint = await createBlueprintWithRetry(
+          sdk,
           {
             name: uniqueName('sdk-blueprint-context'),
             dockerfile: `FROM ubuntu:22.04
@@ -218,7 +227,8 @@ COPY . .`,
         if (!contextDir) {
           throw new Error('Context directory not created');
         }
-        blueprint = await sdk.blueprint.create(
+        blueprint = await createBlueprintWithRetry(
+          sdk,
           {
             name: uniqueName('sdk-blueprint-context-dir'),
             dockerfile: `FROM ubuntu:22.04
@@ -283,7 +293,8 @@ COPY . .`,
       // First create a blueprint
       let blueprint: Blueprint | undefined;
       try {
-        blueprint = await sdk.blueprint.create(
+        blueprint = await createBlueprintWithRetry(
+          sdk,
           {
             name: uniqueName('sdk-blueprint-retrieve'),
             dockerfile: 'FROM ubuntu:22.04',
@@ -320,7 +331,8 @@ COPY . .`,
         expect(policy.id).toBeTruthy();
 
         // Create blueprint with network_policy_id at top level (for build)
-        blueprint = await sdk.blueprint.create(
+        blueprint = await createBlueprintWithRetry(
+          sdk,
           {
             name: uniqueName('sdk-blueprint-with-build-policy'),
             dockerfile: 'FROM ubuntu:22.04\nRUN apt-get update',
@@ -361,7 +373,8 @@ COPY . .`,
         expect(policy.id).toBeTruthy();
 
         // Create blueprint with launch_parameters including network_policy_id
-        blueprint = await sdk.blueprint.create(
+        blueprint = await createBlueprintWithRetry(
+          sdk,
           {
             name: uniqueName('sdk-blueprint-with-launch-policy'),
             dockerfile: 'FROM ubuntu:22.04',
diff --git a/tests/smoketests/utils.ts b/tests/smoketests/utils.ts
index d58b7cf5e..717fa0d7d 100644
--- a/tests/smoketests/utils.ts
+++ b/tests/smoketests/utils.ts
@@ -1,5 +1,8 @@
 import { Runloop, RunloopSDK } from '@runloop/api-client';
-import { NetworkPolicy, GatewayConfig, McpConfig } from '@runloop/api-client/sdk';
+import { Blueprint, NetworkPolicy, GatewayConfig, McpConfig } from '@runloop/api-client/sdk';
+import type { CreateParams as BlueprintCreateParams } from '@runloop/api-client/sdk/blueprint';
+import type { BlueprintView } from '@runloop/api-client/resources/blueprints';
+import type { LongPollRequestOptions } from '@runloop/api-client/lib/polling';
 
 /**
  * Run the smoke tests over HTTP/2 (the undici adapter) instead of the default
@@ -39,6 +42,162 @@ export const SHORT_TIMEOUT = 120_000;
 export const MEDIUM_TIMEOUT = 300_000;
 export const LONG_TIMEOUT = 600_000;
 
+/**
+ * Create a blueprint and, when the build terminal-fails, probe whether the failure is transient.
+ *
+ * Dev-cluster blueprint builds intermittently fail with infra-side errors —
+ * BuildKit i/o timeouts resolving base images from the in-cluster registry
+ * mirror, S3 transit errors when the stage-context container downloads the
+ * build-context object. The SDK surfaces those as a RunloopError ("Blueprint
+ * <id> is in non-complete state failed"). A single flake in a `beforeAll`
+ * fixture otherwise cascades into many unrelated test failures, which buries
+ * the actual signal.
+ *
+ * Despite the name, this helper does NOT mask flakes — infra reliability is a
+ * customer-visible problem and the test suite needs to keep surfacing it.
+ * Behavior:
+ *   - First attempt succeeds: the blueprint is returned.
+ *   - First attempt fails with the terminal-failed shape: retry up to
+ *     `probeAttempts` more times to learn whether the failure is transient
+ *     (recovers on retry) or persistent (reproduces every time). Every
+ *     blueprint created along the way, including a recovered one, is deleted
+ *     best-effort.
+ *   - The test ALWAYS fails in that case — but with a diagnostic message that
+ *     names the classification and points at where to investigate.
+ *   - Other first-attempt error shapes (auth, validation, etc.) re-throw unchanged.
+ *
+ * @param sdk - Client used to create and clean up blueprints.
+ * @param params - Blueprint create parameters, passed unchanged to every attempt.
+ * @param options - Options forwarded to `sdk.blueprint.create`, plus `probeAttempts`
+ *   (extra attempts after a build failure, default 2) and `retryDelayMs` (pause
+ *   before each probe, default 5000).
+ * @returns The blueprint from the first attempt; a probe's blueprint is never returned.
+ * @throws The first attempt's error unchanged when it is not a build failure; otherwise a
+ *   diagnostic `Error` describing the probe sequence.
+ */
+export async function createBlueprintWithRetry(
+  sdk: RunloopSDK,
+  params: BlueprintCreateParams,
+  options?: LongPollRequestOptions & {
+    probeAttempts?: number;
+    retryDelayMs?: number;
+  },
+): Promise<Blueprint> {
+  const { probeAttempts = 2, retryDelayMs = 5_000, ...createOptions } = options ?? {};
+  try {
+    return await sdk.blueprint.create(params, createOptions);
+  } catch (firstErr) {
+    if (!isTransientBlueprintBuildFailure(firstErr)) throw firstErr;
+
+    const firstFailedId = extractBlueprintIdFromError(firstErr);
+    await deleteBlueprintQuietly(sdk, firstFailedId);
+
+    const attemptOutcomes: string[] = [`attempt 1: failed (${firstFailedId ?? 'no id'})`];
+    let probeSucceededOn: number | undefined;
+    let probedBlueprint: Blueprint | undefined;
+
+    for (let probe = 1; probe <= probeAttempts; probe++) {
+      await new Promise((r) => setTimeout(r, retryDelayMs));
+      try {
+        probedBlueprint = await sdk.blueprint.create(params, createOptions);
+        attemptOutcomes.push(`attempt ${probe + 1}: succeeded (${probedBlueprint.id})`);
+        probeSucceededOn = probe + 1;
+        break;
+      } catch (probeErr) {
+        if (!isTransientBlueprintBuildFailure(probeErr)) {
+          // A different failure mode on a probe makes the classification unreliable.
+          throw flakeError(
+            firstErr,
+            attemptOutcomes,
+            'inconclusive (later attempt threw non-build-failed error)',
+            probeErr,
+          );
+        }
+        const probeFailedId = extractBlueprintIdFromError(probeErr);
+        attemptOutcomes.push(`attempt ${probe + 1}: failed (${probeFailedId ?? 'no id'})`);
+        await deleteBlueprintQuietly(sdk, probeFailedId);
+      }
+    }
+
+    // Clean up the recovered blueprint so an eventual success doesn't leak resources —
+    // the test is going to fail, and downstream test steps shouldn't see this blueprint.
+    if (probedBlueprint) {
+      await probedBlueprint.delete().catch(() => {});
+    }
+
+    const classification =
+      probeSucceededOn !== undefined ?
+        `TRANSIENT infra flake (attempt ${probeSucceededOn} recovered)`
+      : `PERSISTENT infra failure (all ${1 + probeAttempts} attempts terminal-failed)`;
+
+    throw flakeError(firstErr, attemptOutcomes, classification);
+  }
+}
+
+/** Matches the SDK's terminal build-failure message and captures the blueprint id. */
+const BLUEPRINT_FAILED_MSG_RE = /Blueprint (bpt_\S+) is in non-complete state failed/;
+
+const INFRA_FLAKE_INVESTIGATION_HINT = [
+  'Where to look next:',
+  ' - blueprint-operator dataset in Honeycomb (test env), filter blueprint_id=<id>, look for the blueprint_build span.',
+  ' - Loki: {namespace=~"build-.*"} for the build pod logs (container "build" = BuildKit, "stage-context" = build-context fetch).',
+  ' - Known recurring infra causes:',
+  ' (a) BuildKit -> in-cluster registry mirror i/o timeout resolving base image manifest.',
+  ' (b) stage-context S3 download transient transport error (only 2 attempts in the builder today).',
+].join('\n');
+
+/** Best-effort delete of a failed blueprint; cleanup errors must not mask the build failure. */
+async function deleteBlueprintQuietly(sdk: RunloopSDK, id: string | undefined): Promise<void> {
+  if (!id) return;
+  await sdk.blueprint
+    .fromId(id)
+    .delete()
+    .catch(() => {});
+}
+
+/** Render a caught value for a diagnostic message; non-Error throwables have no `.message`. */
+function describeError(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
+/**
+ * Build the diagnostic error thrown when a blueprint build fails.
+ *
+ * @param firstErr - Error from the first create attempt.
+ * @param attemptOutcomes - One human-readable entry per attempt, in order.
+ * @param classification - Verdict line (transient / persistent / inconclusive).
+ * @param retryErr - Non-build-failure error thrown by a probe, if any.
+ */
+function flakeError(
+  firstErr: unknown,
+  attemptOutcomes: string[],
+  classification: string,
+  retryErr?: unknown,
+): Error {
+  const lines = [
+    `Blueprint build failed during smoketest — surfacing as test failure to keep infra signal visible.`,
+    `Classification: ${classification}`,
+    `First error: ${describeError(firstErr)}`,
+  ];
+  if (retryErr !== undefined) lines.push(`Probe error: ${describeError(retryErr)}`);
+  lines.push('Probe sequence:');
+  for (const o of attemptOutcomes) lines.push(` - ${o}`);
+  lines.push('');
+  lines.push(INFRA_FLAKE_INVESTIGATION_HINT);
+  return new Error(lines.join('\n'));
+}
+
+/** True when `err` is an Error whose message has the terminal build-failed shape. */
+function isTransientBlueprintBuildFailure(err: unknown): boolean {
+  return err instanceof Error && BLUEPRINT_FAILED_MSG_RE.test(err.message);
+}
+
+/** Extract the blueprint id from a build-failed error message, if `err` has that shape. */
+function extractBlueprintIdFromError(err: unknown): string | undefined {
+  if (!(err instanceof Error)) return undefined;
+  return err.message.match(BLUEPRINT_FAILED_MSG_RE)?.[1];
+}
+
 /**
  * Helper to clean up a network policy, ignoring errors if already deleted.
  */