From 7351f0fd9ed5fd3f942b2f86ff48d9a08c6d03ae Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Fri, 22 May 2026 00:07:30 +0300 Subject: [PATCH] refactor: unify the LLM retry classifier across client + judge retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merge B of the agent-eval consolidation. callLlm and withJudgeRetry each carried a private retry-classification surface — two backoff functions, two retryable-status sets, two divergent transient-error pattern lists. Collapse to one exported primitive in llm-client.ts: - isTransientLlmError(err) — THE retry classifier. Inspects an error's name/message/code, honors a numeric HTTP status (LlmCallError or a foreign SDK error), and recurses into error.cause (undici nests the real socket fault under .cause). withJudgeRetry's default predicate and callLlm's catch path both route through it. - backoffMs(attempt) — exported; withJudgeRetry's default backoff. judge-retry.ts drops its ABORT_PATTERNS, RETRYABLE_HTTP_STATUS, DEFAULT_BACKOFF, and defaultIsRetryable in favor of the shared pair. Bug class fixed: neither old classifier matched undici HTTP/2 transport faults (`terminated`, NGHTTP2_INTERNAL_ERROR, UND_ERR_*, `other side closed`). An HTTP/2 connection dropping mid-response escaped the retry loop — surfacing as an uncaught rejection in the HTTP client and a silent non-retried trial failure in TCloud-backed judges. The unified pattern list covers them and the cause chain is followed. Regression tests: callLlm retries an HTTP/2 fault to recovery; withJudgeRetry does the same via the shared classifier; isTransientLlmError unit-tested across HTTP/2, network, abort, status, deterministic-failure, and self-referential-cause inputs. typecheck + 1273 tests + biome + build all green. --- src/index.ts | 2 ++ src/judge-retry.ts | 41 ++++++------------------ src/llm-client.test.ts | 65 ++++++++++++++++++++++++++++++++++++++ src/llm-client.ts | 66 +++++++++++++++++++++++++++++++++------ tests/judge-retry.test.ts | 20 ++++++++++++ 5 files changed, 152 insertions(+), 42 deletions(-) diff --git a/src/index.ts b/src/index.ts index c0a6846..43694d1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -827,8 +827,10 @@ export type { } from './llm-client' export { assertLlmRoute, + backoffMs, callLlm, callLlmJson, + isTransientLlmError, LlmCallError, LlmClient, LlmRouteAssertionError, diff --git a/src/judge-retry.ts b/src/judge-retry.ts index 2e88b42..74bbf95 100644 --- a/src/judge-retry.ts +++ b/src/judge-retry.ts @@ -10,6 +10,8 @@ * with `mode: 'exclude-failed'` drops the trial. */ +import { backoffMs, isTransientLlmError } from './llm-client' + /** Retry policy for judge LLM calls. */ export interface JudgeRetryPolicy { /** Max attempts per model. Default 3 (one initial + two retries). */ @@ -28,10 +30,11 @@ export interface JudgeRetryPolicy { /** Exponential backoff function, default `attempt → min(500 * 2^attempt, 16_000)`. */ backoffMs?: (attempt: number) => number /** - * Predicate deciding whether an error should trigger a retry. Default - * retries on: AbortError, TimeoutError, `fetch failed`, `ECONNRESET`, - * `[This operation was aborted]`, and any LlmCallError with status in - * {429, 502, 503, 504}. JSON-parse errors are NOT retriable (the model + * Predicate deciding whether an error should trigger a retry. Defaults to + * `isTransientLlmError` — the package-wide classifier shared with + * `callLlm` — which retries aborts/timeouts, network faults, HTTP/2 + * transport faults, and any `LlmCallError` with status in {429,502,503,504}. + * JSON-parse and schema-rejection errors are NOT retriable (the model * needs prompt adjustment, not another shot). */ isRetryable?: (err: unknown) => boolean @@ -55,32 +58,6 @@ export interface JudgeRetryOutcome { const DEFAULT_MAX_ATTEMPTS = 3 const DEFAULT_TIMEOUT_MS = 90_000 -const DEFAULT_BACKOFF = (attempt: number) => Math.min(500 * 2 ** attempt, 16_000) - -const ABORT_PATTERNS = [ - /AbortError/i, - /TimeoutError/i, - /fetch failed/i, - /ECONNRESET/i, - /ETIMEDOUT/i, - /EAI_AGAIN/i, - /this operation was aborted/i, - /stream.*ended.*unexpectedly/i, - /socket hang up/i, -] - -const RETRYABLE_HTTP_STATUS = new Set([429, 502, 503, 504]) - -function defaultIsRetryable(err: unknown): boolean { - if (err instanceof Error) { - if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true - // LlmCallError exposes `status` as a numeric property; check without - // hard-importing to avoid a circular dep. - const status = (err as unknown as { status?: number }).status - if (typeof status === 'number' && RETRYABLE_HTTP_STATUS.has(status)) return true - } - return false -} function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)) @@ -103,8 +80,8 @@ export async function withJudgeRetry( ): Promise> { const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS - const backoff = policy.backoffMs ?? DEFAULT_BACKOFF - const isRetryable = policy.isRetryable ?? defaultIsRetryable + const backoff = policy.backoffMs ?? backoffMs + const isRetryable = policy.isRetryable ?? isTransientLlmError const models = policy.models && policy.models.length > 0 ? policy.models : [undefined as unknown as string] diff --git a/src/llm-client.test.ts b/src/llm-client.test.ts index 0fb358f..9303bb6 100644 --- a/src/llm-client.test.ts +++ b/src/llm-client.test.ts @@ -3,6 +3,7 @@ import { callLlm, callLlmJson, extractJsonPayload, + isTransientLlmError, LlmCallError, LlmClient, stripFencedJson, @@ -235,6 +236,70 @@ describe('llm-client — retry semantics', () => { const r = await callLlm({ model: 'm', messages: [] }, { fetch, maxRetries: 3 }) expect(r.content).toBe('recovered') }) + + it('retries an HTTP/2 transport fault instead of crashing', async () => { + // Regression: undici raises `terminated` / NGHTTP2_INTERNAL_ERROR for an + // HTTP/2 connection that drops mid-response. The old classifier only + // matched `fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN`, so this escaped + // the retry loop and surfaced as an uncaught rejection. + let call = 0 + const fetch: typeof globalThis.fetch = (async (_url: string) => { + call++ + if (call === 1) { + const cause = new Error('NGHTTP2_INTERNAL_ERROR') + throw new TypeError('terminated', { cause }) + } + return mkOkResponse({ choices: [{ message: { content: 'recovered' } }], usage: {} }) + }) as unknown as typeof globalThis.fetch + const r = await callLlm({ model: 'm', messages: [] }, { fetch, maxRetries: 3 }) + expect(r.content).toBe('recovered') + expect(call).toBe(2) + }) +}) + +describe('llm-client — isTransientLlmError classification', () => { + it('classifies HTTP/2 + undici transport faults as transient', () => { + expect(isTransientLlmError(new Error('terminated'))).toBe(true) + expect(isTransientLlmError(new Error('NGHTTP2_INTERNAL_ERROR'))).toBe(true) + expect(isTransientLlmError(new Error('other side closed'))).toBe(true) + expect(isTransientLlmError(new Error('socket hang up'))).toBe(true) + const coded = Object.assign(new Error('connection lost'), { code: 'UND_ERR_SOCKET' }) + expect(isTransientLlmError(coded)).toBe(true) + }) + + it('follows the undici cause chain to the real socket fault', () => { + const wrapped = new TypeError('fetch failed', { + cause: new Error('terminated', { cause: new Error('NGHTTP2_INTERNAL_ERROR') }), + }) + expect(isTransientLlmError(wrapped)).toBe(true) + }) + + it('classifies network + abort faults as transient', () => { + expect(isTransientLlmError(new Error('ECONNRESET'))).toBe(true) + const abort = new Error('aborted') + abort.name = 'AbortError' + expect(isTransientLlmError(abort)).toBe(true) + }) + + it('treats LlmCallError as transient only on retriable status', () => { + expect(isTransientLlmError(new LlmCallError('rate limited', 429, '', 'm'))).toBe(true) + expect(isTransientLlmError(new LlmCallError('bad gateway', 503, '', 'm'))).toBe(true) + expect(isTransientLlmError(new LlmCallError('bad request', 400, '', 'm'))).toBe(false) + expect(isTransientLlmError(new LlmCallError('unauthorized', 401, '', 'm'))).toBe(false) + }) + + it('does NOT retry deterministic failures', () => { + expect(isTransientLlmError(new SyntaxError('Unexpected token < in JSON'))).toBe(false) + expect(isTransientLlmError(new Error('response_format json_schema not supported'))).toBe(false) + expect(isTransientLlmError('not an error')).toBe(false) + expect(isTransientLlmError(undefined)).toBe(false) + }) + + it('terminates on a self-referential cause chain', () => { + const err = new Error('boom') as Error & { cause?: unknown } + err.cause = err + expect(isTransientLlmError(err)).toBe(false) + }) }) describe('llm-client — callLlmJson + schema degrade', () => { diff --git a/src/llm-client.ts b/src/llm-client.ts index 1d86bdb..681503e 100644 --- a/src/llm-client.ts +++ b/src/llm-client.ts @@ -135,14 +135,60 @@ const DEFAULT_MAX_RETRIES = 3 const RETRYABLE_STATUS = new Set([429, 502, 503, 504]) -function isRetryableError(err: unknown): boolean { +/** + * Transient transport/network error signatures, matched against an error's + * name, message, and `code`. Covers fetch/undici network failures, aborts + * and timeouts, and — critically — HTTP/2 transport faults a keep-alive + * connection raises mid-response: `terminated`, `NGHTTP2_INTERNAL_ERROR`, + * `UND_ERR_*`, `other side closed`. Those last ones carry no clean HTTP + * status; unrecognised, they escape the retry loop and surface as an + * uncaught rejection. + */ +const TRANSIENT_ERROR_PATTERNS: readonly RegExp[] = [ + /AbortError/i, + /TimeoutError/i, + /this operation was aborted/i, + /fetch failed/i, + /ECONNRESET/i, + /ETIMEDOUT/i, + /EAI_AGAIN/i, + /socket hang up/i, + /stream.*ended.*unexpectedly/i, + /terminated/i, + /other side closed/i, + /NGHTTP2/i, + /UND_ERR/i, +] + +/** + * True when an error is a transient transport/network fault worth retrying, + * as opposed to a deterministic failure (4xx schema reject, JSON parse) that + * a retry cannot fix. Inspects `LlmCallError.status`, then the error's + * name/message/code, then recurses into `error.cause` — undici nests the + * real socket fault one or more levels under `.cause`. + * + * This is THE retry classifier for the package: `callLlm` and + * `withJudgeRetry` both route through it, so a connection-class error is + * treated identically whether it surfaces in the HTTP client or a + * TCloud-backed judge. + */ +export function isTransientLlmError(err: unknown): boolean { + return classifyTransient(err, 0) +} + +function classifyTransient(err: unknown, depth: number): boolean { if (err instanceof LlmCallError) return RETRYABLE_STATUS.has(err.status) - if (err instanceof Error) { - return ( - err.name === 'AbortError' || - err.name === 'TimeoutError' || - /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message) - ) + if (!(err instanceof Error)) return false + // Foreign errors (e.g. a TCloud judge SDK error) can carry a numeric HTTP + // status without being an LlmCallError — a retryable status is decisive. + const status = (err as { status?: unknown }).status + if (typeof status === 'number' && RETRYABLE_STATUS.has(status)) return true + const code = (err as { code?: unknown }).code + const haystack = `${err.name}\n${err.message}\n${typeof code === 'string' ? code : ''}` + if (TRANSIENT_ERROR_PATTERNS.some((p) => p.test(haystack))) return true + const cause = (err as { cause?: unknown }).cause + if (depth < 4 && cause instanceof Error && cause !== err) { + return classifyTransient(cause, depth + 1) } return false } @@ -157,8 +203,8 @@ function parseRetryAfter(headers: Headers): number | null { return null } -function backoffMs(attempt: number): number { - // 500ms, 1s, 2s, 4s, ... +/** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */ +export function backoffMs(attempt: number): number { return Math.min(500 * 2 ** attempt, 16_000) } @@ -478,7 +524,7 @@ export async function callLlm( redactedFields: [], }) } - if (attempt < maxRetries - 1 && isRetryableError(err)) { + if (attempt < maxRetries - 1 && isTransientLlmError(err)) { await sleep(backoffMs(attempt)) continue } diff --git a/tests/judge-retry.test.ts b/tests/judge-retry.test.ts index dabee5d..2de20ee 100644 --- a/tests/judge-retry.test.ts +++ b/tests/judge-retry.test.ts @@ -100,4 +100,24 @@ describe('withJudgeRetry — substrate guard against silent-zero judge corruptio expect(out.succeeded).toBe(true) expect(out.attempts).toBe(2) }) + + it('retries an HTTP/2 transport fault via the shared classifier', async () => { + // Regression: judge-retry and llm-client carried separate retry-pattern + // lists; neither matched undici HTTP/2 faults (`terminated`, + // NGHTTP2_INTERNAL_ERROR). A TCloud-backed judge hitting one would fail + // the trial as a silent non-retry. Both now route through + // isTransientLlmError. + let calls = 0 + const fn = async () => { + calls += 1 + if (calls === 1) { + throw new TypeError('terminated', { cause: new Error('NGHTTP2_INTERNAL_ERROR') }) + } + return { score: 0.81 } + } + const out = await withJudgeRetry(fn, { maxAttempts: 3, backoffMs: () => 1 }) + expect(out.succeeded).toBe(true) + expect(out.attempts).toBe(2) + expect(out.value).toEqual({ score: 0.81 }) + }) })