Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions packages/core/src/evaluation/providers/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -352,17 +352,23 @@ export class CliProvider implements Provider {
);
const renderedCommand = renderTemplate(this.config.command, templateValues);

// Use the first request's cwd override (from workspace) if it provides one,
// otherwise fall back to the target's configured cwd.
// All requests in a batch share the same workspace, so the first request's cwd
// is representative of the entire batch.
const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;

if (this.verbose) {
console.log(
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ''} command=${renderedCommand}`,
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ''} command=${renderedCommand}`,
);
}

// Measure wall-clock time for batch (used as fallback if records don't provide duration)
try {
const startTime = Date.now();
const result = await this.runCommand(renderedCommand, {
cwd: this.config.cwd,
cwd: effectiveCwd,
env: process.env,
timeoutMs: this.config.timeoutMs,
signal: controller.signal,
Expand Down Expand Up @@ -402,7 +408,7 @@ export class CliProvider implements Provider {
command: renderedCommand,
stderr: result.stderr,
exitCode: result.exitCode ?? 0,
cwd: this.config.cwd,
cwd: effectiveCwd,
outputFile: outputFilePath,
},
};
Expand All @@ -423,7 +429,7 @@ export class CliProvider implements Provider {
command: renderedCommand,
stderr: result.stderr,
exitCode: result.exitCode ?? 0,
cwd: this.config.cwd,
cwd: effectiveCwd,
outputFile: outputFilePath,
error: errorMessage,
},
Expand All @@ -439,7 +445,7 @@ export class CliProvider implements Provider {
command: renderedCommand,
stderr: result.stderr,
exitCode: result.exitCode ?? 0,
cwd: this.config.cwd,
cwd: effectiveCwd,
outputFile: outputFilePath,
recordId: evalCaseId,
},
Expand Down
32 changes: 32 additions & 0 deletions packages/core/test/evaluation/orchestrator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,38 @@ describe('runTestCase', () => {
expect(provider.callIndex).toBe(1);
});

it('preserves workspace cwd across retry attempts', async () => {
  // Each invoke() call appends the cwd it received, so the first attempt
  // and the retry can be compared afterwards.
  const observedCwds: (string | undefined)[] = [];
  const workspacePath = '/fake/workspace/path';

  // Provider that throws on its first call and succeeds on every later one.
  const flakyProvider: Provider = {
    id: 'mock:cwd-test',
    kind: 'mock' as const,
    targetName: 'cwd-test',
    async invoke(request: ProviderRequest): Promise<ProviderResponse> {
      observedCwds.push(request.cwd);
      const isFirstAttempt = observedCwds.length === 1;
      if (isFirstAttempt) {
        throw new Error('Transient failure');
      }
      return { output: [{ role: 'assistant', content: 'Success on retry' }] };
    },
  };

  const outcome = await runEvalCase({
    evalCase: baseTestCase,
    provider: flakyProvider,
    target: baseTarget,
    evaluators: evaluatorRegistry,
    maxRetries: 1,
    sharedWorkspacePath: workspacePath,
  });

  expect(outcome.score).toBeGreaterThan(0);
  expect(observedCwds).toHaveLength(2);

  // Both the failed attempt and the retry must have seen the workspace path.
  const [firstCwd, retryCwd] = observedCwds;
  expect(firstCwd).toBe(workspacePath);
  expect(retryCwd).toBe(workspacePath);
});

it('retries provider errors up to maxRetries', async () => {
const provider = new SequenceProvider('mock', {
errors: [new Error('Request timeout')],
Expand Down
85 changes: 85 additions & 0 deletions packages/core/test/evaluation/providers/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,91 @@ describe('CliProvider', () => {
await expect(provider.invoke(baseRequest)).rejects.toThrow(/timed out/i);
});

it('uses request.cwd as working directory override in invoke', async () => {
  let observedCwd: string | undefined;

  // Stub runner: remembers the cwd option it was handed and, when the command
  // names a per-case output file, writes that file under os.tmpdir().
  const runner = mock(async (command: string, options): Promise<CommandRunResult> => {
    observedCwd = options?.cwd;
    const outputName = /agentv-case-1-\d+-\w+\.json/.exec(command)?.[0];
    if (outputName !== undefined) {
      const target = path.join(os.tmpdir(), outputName);
      await writeFile(target, 'response', 'utf-8');
      // Tracked in createdFiles — presumably removed by suite teardown.
      createdFiles.push(target);
    }
    return { stdout: '', stderr: '', exitCode: 0, failed: false };
  });

  const provider = new CliProvider('cli-target', { ...baseConfig, cwd: '/config/cwd' }, runner);

  // The cwd carried by the request is expected to win over the configured one.
  const requestWithCwd = { ...baseRequest, cwd: '/workspace/path' };
  await provider.invoke(requestWithCwd);

  expect(observedCwd).toBe('/workspace/path');
});

it('falls back to config.cwd when request.cwd is undefined in invoke', async () => {
  let observedCwd: string | undefined;

  // Stub runner: records the cwd option and writes the per-case output file
  // named in the command under os.tmpdir().
  const runner = mock(async (command: string, options): Promise<CommandRunResult> => {
    observedCwd = options?.cwd;
    const outputName = /agentv-case-1-\d+-\w+\.json/.exec(command)?.[0];
    if (outputName !== undefined) {
      const target = path.join(os.tmpdir(), outputName);
      await writeFile(target, 'response', 'utf-8');
      // Tracked in createdFiles — presumably removed by suite teardown.
      createdFiles.push(target);
    }
    return { stdout: '', stderr: '', exitCode: 0, failed: false };
  });

  const provider = new CliProvider('cli-target', { ...baseConfig, cwd: '/config/cwd' }, runner);

  // baseRequest is passed as-is, with no cwd override added by this test.
  await provider.invoke(baseRequest);

  expect(observedCwd).toBe('/config/cwd');
});

it('uses request.cwd as working directory override in invokeBatch', async () => {
  let observedCwd: string | undefined;

  // Stub runner: records the cwd option and, when the command names a batch
  // output file, writes a single JSONL record for 'case-1' under os.tmpdir().
  const runner = mock(async (command: string, options): Promise<CommandRunResult> => {
    observedCwd = options?.cwd;
    const outputName = /agentv-batch-\d+-\w+\.jsonl/.exec(command)?.[0];
    if (outputName !== undefined) {
      const target = path.join(os.tmpdir(), outputName);
      const record = JSON.stringify({ id: 'case-1', text: 'ok' });
      await writeFile(target, `${record}\n`, 'utf-8');
      // Tracked in createdFiles — presumably removed by suite teardown.
      createdFiles.push(target);
    }
    return { stdout: '', stderr: '', exitCode: 0, failed: false };
  });

  const provider = new CliProvider('cli-target', { ...baseConfig, cwd: '/config/cwd' }, runner);

  // A one-element batch whose only request carries a cwd override.
  const batch = [{ ...baseRequest, cwd: '/workspace/path' }];
  await provider.invokeBatch(batch);

  expect(observedCwd).toBe('/workspace/path');
});

it('falls back to config.cwd when request.cwd is undefined in invokeBatch', async () => {
  let observedCwd: string | undefined;

  // Stub runner: records the cwd option and, when the command names a batch
  // output file, writes a single JSONL record for 'case-1' under os.tmpdir().
  const runner = mock(async (command: string, options): Promise<CommandRunResult> => {
    observedCwd = options?.cwd;
    const outputName = /agentv-batch-\d+-\w+\.jsonl/.exec(command)?.[0];
    if (outputName !== undefined) {
      const target = path.join(os.tmpdir(), outputName);
      const record = JSON.stringify({ id: 'case-1', text: 'ok' });
      await writeFile(target, `${record}\n`, 'utf-8');
      // Tracked in createdFiles — presumably removed by suite teardown.
      createdFiles.push(target);
    }
    return { stdout: '', stderr: '', exitCode: 0, failed: false };
  });

  const provider = new CliProvider('cli-target', { ...baseConfig, cwd: '/config/cwd' }, runner);

  // The batch holds baseRequest unchanged, so the configured cwd is expected.
  await provider.invokeBatch([baseRequest]);

  expect(observedCwd).toBe('/config/cwd');
});

it('supports batch mode by reading JSONL records keyed by id', async () => {
const runner = mock(async (command: string): Promise<CommandRunResult> => {
const match = command.match(/agentv-batch-\d+-\w+\.jsonl/);
Expand Down
Loading