diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 521c5659..11a36ab3 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1610,7 +1610,9 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise; + const parts: string[] = []; + if (typeof obj.message === 'string' && obj.message) { + parts.push(obj.message); + } + if (typeof obj.code === 'number') { + parts.push(`(code ${obj.code})`); + } + if (parts.length > 0) { + return parts.join(' '); + } + // Fallback: serialize the object so we never return "[object Object]" + try { + return JSON.stringify(error); + } catch { + // circular reference or other serialization failure + } } - const value = String(error).toLowerCase(); - return value.includes('timeout'); + return String(error); +} + +/** Exponential backoff: 2^attempt * 1000ms (1s, 2s, 4s, …), capped at 30s. */ +function retryBackoffMs(attempt: number): number { + return Math.min(2 ** attempt * 1000, 30_000); +} + +function sleep(ms: number, signal?: AbortSignal): Promise { + if (signal?.aborted) return Promise.resolve(); + return new Promise((resolve) => { + const timer = setTimeout(resolve, ms); + signal?.addEventListener( + 'abort', + () => { + clearTimeout(timer); + resolve(); + }, + { once: true }, + ); + }); } function mapChildResults( diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 9031cbdc..4249b9fe 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -223,7 +223,7 @@ describe('runTestCase', () => { expect(provider.callIndex).toBe(1); }); - it('retries timeout errors up to maxRetries', async () => { + it('retries provider errors up to maxRetries', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Request timeout')], responses: [ @@ -244,6 +244,51 @@ describe('runTestCase', () => { expect(result.score).toBeGreaterThan(0); }); + it('retries non-timeout provider errors up to maxRetries', async () => { + const provider = new SequenceProvider('mock', { + errors: [new Error('Provider failure')], + responses: [ + { + output: [{ role: 'assistant', content: 'Add structured logging.' }], + }, + ], + }); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + maxRetries: 1, + }); + + expect(result.score).toBeGreaterThan(0); + }); + + it('applies exponential backoff between retries', async () => { + const provider = new SequenceProvider('mock', { + errors: [new Error('Transient failure')], + responses: [ + { + output: [{ role: 'assistant', content: 'Add structured logging.' }], + }, + ], + }); + + const startMs = Date.now(); + await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + maxRetries: 1, + }); + const elapsedMs = Date.now() - startMs; + + // First retry has 2^0 * 1000 = 1000ms backoff + expect(elapsedMs).toBeGreaterThanOrEqual(900); + }); + it('returns error result on unrecoverable failure', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Provider failure')], @@ -266,6 +311,32 @@ describe('runTestCase', () => { expect(result.executionError?.message).toContain('Provider failure'); }); + it('surfaces JSON-RPC error objects with readable messages', async () => { + // Simulates @agentclientprotocol/sdk rejecting with a plain JSON-RPC error object + const jsonRpcError = { code: -32600, message: 'Invalid request' }; + const provider: Provider = { + id: 'mock:jsonrpc', + kind: 'mock' as const, + targetName: 'mock', + async invoke(): Promise { + throw jsonRpcError; + }, + }; + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + }); + + expect(result.score).toBe(0); + expect(result.executionStatus).toBe('execution_error'); + expect(result.error).toContain('Invalid request'); + expect(result.error).toContain('code -32600'); + expect(result.error).not.toContain('[object Object]'); + }); + it('surfaces provider raw.error as evaluation error', async () => { const provider = new SequenceProvider('mock', { responses: [