Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions tests/smoketests/object-oriented/blueprint.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
import { SHORT_TIMEOUT, LONG_TIMEOUT, uniqueName, makeClientSDK, cleanUpPolicy } from '../utils';
import {
SHORT_TIMEOUT,
LONG_TIMEOUT,
uniqueName,
makeClientSDK,
cleanUpPolicy,
createBlueprintWithRetry,
} from '../utils';
import { Blueprint, Devbox, NetworkPolicy, StorageObject } from '@runloop/api-client/sdk';

const sdk = makeClientSDK();
Expand All @@ -10,7 +17,8 @@ describe('smoketest: object-oriented blueprint', () => {

// Create blueprint in beforeAll to avoid test order dependency
beforeAll(async () => {
blueprint = await sdk.blueprint.create(
blueprint = await createBlueprintWithRetry(
sdk,
{
name: uniqueName('sdk-blueprint'),
dockerfile: 'FROM ubuntu:22.04\nRUN apt-get update && apt-get install -y curl',
Expand Down Expand Up @@ -136,7 +144,8 @@ describe('smoketest: object-oriented blueprint', () => {
});

// Create blueprint that uses the uploaded object as build context
blueprint = await sdk.blueprint.create(
blueprint = await createBlueprintWithRetry(
sdk,
{
name: uniqueName('sdk-blueprint-context'),
dockerfile: `FROM ubuntu:22.04
Expand Down Expand Up @@ -218,7 +227,8 @@ COPY . .`,
if (!contextDir) {
throw new Error('Context directory not created');
}
blueprint = await sdk.blueprint.create(
blueprint = await createBlueprintWithRetry(
sdk,
{
name: uniqueName('sdk-blueprint-context-dir'),
dockerfile: `FROM ubuntu:22.04
Expand Down Expand Up @@ -283,7 +293,8 @@ COPY . .`,
// First create a blueprint
let blueprint: Blueprint | undefined;
try {
blueprint = await sdk.blueprint.create(
blueprint = await createBlueprintWithRetry(
sdk,
{
name: uniqueName('sdk-blueprint-retrieve'),
dockerfile: 'FROM ubuntu:22.04',
Expand Down Expand Up @@ -320,7 +331,8 @@ COPY . .`,
expect(policy.id).toBeTruthy();

// Create blueprint with network_policy_id at top level (for build)
blueprint = await sdk.blueprint.create(
blueprint = await createBlueprintWithRetry(
sdk,
{
name: uniqueName('sdk-blueprint-with-build-policy'),
dockerfile: 'FROM ubuntu:22.04\nRUN apt-get update',
Expand Down Expand Up @@ -361,7 +373,8 @@ COPY . .`,
expect(policy.id).toBeTruthy();

// Create blueprint with launch_parameters including network_policy_id
blueprint = await sdk.blueprint.create(
blueprint = await createBlueprintWithRetry(
sdk,
{
name: uniqueName('sdk-blueprint-with-launch-policy'),
dockerfile: 'FROM ubuntu:22.04',
Expand Down
127 changes: 126 additions & 1 deletion tests/smoketests/utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { Runloop, RunloopSDK } from '@runloop/api-client';
import { NetworkPolicy, GatewayConfig, McpConfig } from '@runloop/api-client/sdk';
import { Blueprint, NetworkPolicy, GatewayConfig, McpConfig } from '@runloop/api-client/sdk';
import type { CreateParams as BlueprintCreateParams } from '@runloop/api-client/sdk/blueprint';
import type { BlueprintView } from '@runloop/api-client/resources/blueprints';
import type { LongPollRequestOptions } from '@runloop/api-client/lib/polling';

/**
* Run the smoke tests over HTTP/2 (the undici adapter) instead of the default
Expand Down Expand Up @@ -39,6 +42,128 @@ export const SHORT_TIMEOUT = 120_000;
export const MEDIUM_TIMEOUT = 300_000;
export const LONG_TIMEOUT = 600_000;

/**
 * Create a blueprint and, when the build terminally fails, probe for infra-side flakes.
 *
 * Dev-cluster blueprint builds intermittently fail with infra-side errors —
 * BuildKit i/o timeouts resolving base images from the in-cluster registry
 * mirror, S3 transit errors when the stage-context container downloads the
 * build-context object. The SDK surfaces those as a RunloopError ("Blueprint
 * <id> is in non-complete state failed"). A single flake in a `beforeAll`
 * fixture otherwise cascades into many unrelated test failures, which buries
 * the actual signal.
 *
 * This helper does NOT mask flakes — infra reliability is a customer-visible
 * problem and the test suite needs to keep surfacing it. Behavior:
 *   - First attempt succeeds: the created blueprint is returned as-is.
 *   - First attempt fails with the terminal-failed shape: the build is retried
 *     up to `probeAttempts` more times, only to learn whether the failure is
 *     transient (recovers on a retry) or persistent (reproduces every time).
 *     The call then ALWAYS throws, with a diagnostic message that names the
 *     classification and points at where to investigate.
 *   - Other error shapes (auth, validation, etc.) are re-thrown unchanged.
 *
 * Blueprints created along the way (failed ones whose id can be read from the
 * error message, and a recovered probe blueprint) are deleted on a best-effort
 * basis; deletion failures are ignored.
 *
 * @param sdk - Client used to create and delete blueprints.
 * @param params - Blueprint creation parameters, reused verbatim for every attempt.
 * @param options - Options forwarded to `sdk.blueprint.create`, plus:
 *   `probeAttempts` (default 2) — extra attempts made after a terminal-failed build;
 *   `retryDelayMs` (default 5000) — pause before each extra attempt.
 * @returns The blueprint from the first attempt, when that attempt succeeds.
 * @throws The original error when the first attempt fails with any other shape.
 * @throws A diagnostic `Error` built by `flakeError` whenever the first attempt
 *   fails with the terminal-failed shape, even if a later probe succeeds.
 */
export async function createBlueprintWithRetry(
  sdk: RunloopSDK,
  params: BlueprintCreateParams,
  options?: LongPollRequestOptions<BlueprintView> & {
    probeAttempts?: number;
    retryDelayMs?: number;
  },
): Promise<Blueprint> {
  const { probeAttempts = 2, retryDelayMs = 5_000, ...createOptions } = options ?? {};

  // Best-effort cleanup: neither a rejected promise nor a synchronous throw from
  // the cleanup call may replace the diagnostic error this helper is about to raise.
  const discardQuietly = async (remove: () => PromiseLike<unknown>): Promise<void> => {
    try {
      await remove();
    } catch {
      // Deliberately ignored — the resource may already be gone.
    }
  };

  try {
    return await sdk.blueprint.create(params, createOptions);
  } catch (firstErr) {
    if (!isTransientBlueprintBuildFailure(firstErr)) throw firstErr;

    const firstFailedId = extractBlueprintIdFromError(firstErr);
    if (firstFailedId) {
      await discardQuietly(() => sdk.blueprint.fromId(firstFailedId).delete());
    }

    // One entry per attempt actually made; also drives the attempt count reported below.
    const attemptOutcomes: string[] = [`attempt 1: failed (${firstFailedId ?? 'no id'})`];
    let probeSucceededOn: number | undefined;
    let probedBlueprint: Blueprint | undefined;

    for (let probe = 1; probe <= probeAttempts; probe++) {
      const attemptNumber = probe + 1;
      await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
      try {
        probedBlueprint = await sdk.blueprint.create(params, createOptions);
        attemptOutcomes.push(`attempt ${attemptNumber}: succeeded (${probedBlueprint.id})`);
        probeSucceededOn = attemptNumber;
        break;
      } catch (probeErr) {
        if (!isTransientBlueprintBuildFailure(probeErr)) {
          // Record the attempt that ended probing so the sequence in the report is complete.
          attemptOutcomes.push(`attempt ${attemptNumber}: threw non-build-failed error`);
          throw flakeError(
            firstErr,
            attemptOutcomes,
            'inconclusive (later attempt threw non-build-failed error)',
            probeErr,
          );
        }
        const probeFailedId = extractBlueprintIdFromError(probeErr);
        attemptOutcomes.push(`attempt ${attemptNumber}: failed (${probeFailedId ?? 'no id'})`);
        if (probeFailedId) {
          await discardQuietly(() => sdk.blueprint.fromId(probeFailedId).delete());
        }
      }
    }

    // Clean up the recovered blueprint so an eventual success doesn't leak resources —
    // the test is going to fail, and downstream test steps shouldn't see this blueprint.
    const recovered = probedBlueprint;
    if (recovered) {
      await discardQuietly(() => recovered.delete());
    }

    // Report the number of attempts really made rather than `1 + probeAttempts`, which is
    // wrong whenever the option is negative, fractional or NaN.
    const classification =
      probeSucceededOn !== undefined ?
        `TRANSIENT infra flake (attempt ${probeSucceededOn} recovered)`
      : `PERSISTENT infra failure (all ${attemptOutcomes.length} attempts terminal-failed)`;

    throw flakeError(firstErr, attemptOutcomes, classification);
  }
}

/**
 * Matches the error message text "Blueprint <id> is in non-complete state failed".
 * Capture group 1 is the blueprint id, which must start with `bpt_` and runs up
 * to the next whitespace. The pattern has no `g`/`y` flag, so `test()` and
 * `match()` carry no `lastIndex` state between calls.
 */
const BLUEPRINT_FAILED_MSG_RE = /Blueprint (bpt_\S+) is in non-complete state failed/;

/** Fixed "where to look next" footer appended to every diagnostic built by `flakeError`. */
const INFRA_FLAKE_INVESTIGATION_HINT = [
  'Where to look next:',
  ' - blueprint-operator dataset in Honeycomb (test env), filter blueprint_id=<id above>, look for the blueprint_build span.',
  ' - Loki: {namespace=~"build-.*"} for the build pod logs (container "build" = BuildKit, "stage-context" = build-context fetch).',
  ' - Known recurring infra causes:',
  ' (a) BuildKit -> in-cluster registry mirror i/o timeout resolving base image manifest.',
  ' (b) stage-context S3 download transient transport error (only 2 attempts in the builder today).',
].join('\n');

/**
 * Build the diagnostic error reported when a blueprint build fails.
 *
 * The message lists, in order: a fixed headline, the classification, the first
 * error's text, the probe error's text (only when `retryErr` is supplied), the
 * per-attempt outcomes, and the investigation hint.
 *
 * @param firstErr - Whatever the first attempt threw; also attached as the
 *   returned error's `cause` so its stack is not lost.
 * @param attemptOutcomes - One human-readable line per attempt, in order.
 * @param classification - Summary label shown on the "Classification:" line.
 * @param retryErr - Optional error from a later attempt; omitted from the
 *   message only when it is `undefined`.
 * @returns A new `Error`; this function never throws it itself.
 */
function flakeError(
  firstErr: unknown,
  attemptOutcomes: string[],
  classification: string,
  retryErr?: unknown,
): Error {
  // Thrown values are `unknown`: narrow before reading `.message` so a string or
  // plain-object throwable does not show up as "undefined" in the report.
  const describeThrown = (thrown: unknown): string => {
    if (thrown instanceof Error) return thrown.message;
    if (typeof thrown === 'string') return thrown;
    try {
      // JSON.stringify yields undefined for functions/symbols at runtime; fall back to String().
      return JSON.stringify(thrown) ?? String(thrown);
    } catch {
      return String(thrown);
    }
  };

  const lines = [
    `Blueprint build failed during smoketest — surfacing as test failure to keep infra signal visible.`,
    `Classification: ${classification}`,
    `First error: ${describeThrown(firstErr)}`,
  ];
  // Compare against undefined rather than truthiness so falsy thrown values are still reported.
  if (retryErr !== undefined) lines.push(`Probe error: ${describeThrown(retryErr)}`);
  lines.push('Probe sequence:');
  for (const outcome of attemptOutcomes) lines.push(` - ${outcome}`);
  lines.push('');
  lines.push(INFRA_FLAKE_INVESTIGATION_HINT);

  const error = new Error(lines.join('\n'));
  // Keep the original failure reachable. Defined with the same attributes a native
  // `cause` has, without needing the ES2022 `Error` constructor overload.
  Object.defineProperty(error, 'cause', {
    value: firstErr,
    writable: true,
    enumerable: false,
    configurable: true,
  });
  return error;
}

/**
 * Report whether a thrown value is an `Error` whose message matches
 * `BLUEPRINT_FAILED_MSG_RE`. Anything that is not an `Error` yields `false`.
 */
function isTransientBlueprintBuildFailure(err: unknown): boolean {
  if (!(err instanceof Error)) {
    return false;
  }
  const { message } = err;
  return BLUEPRINT_FAILED_MSG_RE.test(message);
}

/**
 * Pull the blueprint id (capture group 1 of `BLUEPRINT_FAILED_MSG_RE`) out of an
 * error's message.
 *
 * @returns The captured id, or `undefined` when `err` is not an `Error` or its
 *   message does not match the pattern.
 */
function extractBlueprintIdFromError(err: unknown): string | undefined {
  if (err instanceof Error) {
    const found = err.message.match(BLUEPRINT_FAILED_MSG_RE);
    return found === null ? undefined : found[1];
  }
  return undefined;
}

/**
* Helper to clean up a network policy, ignoring errors if already deleted.
*/
Expand Down
Loading