Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 161 additions & 2 deletions apps/cli/ai/eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
} from 'cli/ai/auth';
import { runStudioAgentTurn } from 'cli/ai/runtimes/pi';
import { STUDIO_SITES_ROOT } from 'cli/lib/site-paths';
import type { StopReason, Usage } from '@mariozechner/pi-ai';
import type { AgentSessionEvent } from '@mariozechner/pi-coding-agent';
import type { AiProviderId } from 'cli/ai/providers';

Expand All @@ -28,6 +29,61 @@ interface EvalRunnerInput {
model?: AiModelId;
}

// Tool names that indicate the agent did "real work" (mutated state, ran a
// command, scaffolded files, etc.). Used as a heuristic to detect runs where
// the agent reported success but produced no assistant text and never invoked
// a state-changing tool — typically a "false success" failure mode where the
// model trivially ended a turn without addressing the prompt.
const MUTATING_TOOL_NAMES = new Set( [
'Write',
'Edit',
'Bash',
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes me think we should normalize how we name tools; these camel-case names are only named that way because we initially inherited them from Claude.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like a good idea. The Pi harness is a great direction.

One question I had about this PR was: do we want to keep the full transcript opt-in, or enable it by default?

It can be argued that if you are doing evals, the transcript is always helpful.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I didn't think about evals too much yet (as much as you did haha), so feel free to go in any direction you think is good.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. I will try it both ways and see what feels better.

These evals will help a lot as we add more features, more models, and evolve the system prompt.

'wp_cli',
'site_create',
'site_delete',
'site_import',
'site_export',
'site_pull',
'site_push',
'site_start',
'site_stop',
'scaffold_theme',
'install_taxonomy_scripts',
'preview_create',
'preview_delete',
'preview_update',
'wpcom_request',
] );

// Cap individual text/tool-result strings inside the opt-in transcript so a
// single huge tool output doesn't explode the eval artifact.
const TRANSCRIPT_TEXT_MAX_LENGTH = 4000;

/**
 * Shortens `value` to at most `maxLength` UTF-16 code units and appends a
 * `…[truncated N chars]` marker, where N is the number of code units dropped.
 * Strings that already fit are returned unchanged.
 *
 * @param value - Text to cap.
 * @param maxLength - Maximum number of code units to keep before the marker.
 *   Negative values are treated as 0 and fractional values are floored.
 * @returns The original string, or its kept prefix followed by the marker.
 */
function truncateText( value: string, maxLength = TRANSCRIPT_TEXT_MAX_LENGTH ): string {
	if ( ! ( value.length > maxLength ) ) {
		// Also covers NaN/Infinity limits, which never trigger truncation.
		return value;
	}

	// Clamp so a negative limit is not interpreted by `slice` as an offset
	// from the end of the string, and so the reported count stays an integer.
	let cut = Math.max( 0, Math.floor( maxLength ) );

	// Never cut between the two halves of a surrogate pair: a dangling high
	// surrogate would turn into a replacement character (or an escaped lone
	// surrogate) once the transcript is serialized.
	if ( cut > 0 ) {
		const before = value.charCodeAt( cut - 1 );
		const after = value.charCodeAt( cut );
		const splitsPair =
			before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
		if ( splitsPair ) {
			cut -= 1;
		}
	}

	return `${ value.slice( 0, cut ) }…[truncated ${ value.length - cut } chars]`;
}

// One entry of the opt-in eval transcript. An entry is recorded per agent
// session event; the optional fields are only populated for the event types
// they apply to.
type TranscriptEvent = {
// Position of this entry in the transcript, taken from a running counter.
index: number;
// The `type` discriminator of the session event this entry was built from.
type: AgentSessionEvent[ 'type' ];
// Turn counter value associated with the event.
turnIndex: number;
// Elapsed-time reading (in ms) captured when the event was handled.
elapsedMs: number;
// Text segments extracted from the event, each capped via `truncateText`.
text?: string[];
// Tool calls extracted from the event, when it contains any.
toolCalls?: ToolCallRecord[];
// Outcome of a tool execution; set for `tool_execution_end` events.
toolResult?: {
toolUseId: string;
// `null` when the tool-use id could not be mapped back to a tool name.
toolName: string | null;
isError: boolean;
// Tool output text, capped via `truncateText`; omitted when empty.
text?: string;
};
// Stop reason of the assistant message (assistant `message_end`) or of the
// last assistant message (`agent_end`).
stopReason?: StopReason;
// Error message carried by the assistant message, when present.
errorMessage?: string;
// Set for `compaction_start` / `compaction_end`; `aborted` is only filled
// in for `compaction_end`.
compaction?: { reason: string; aborted?: boolean };
// Set for `auto_retry_start` (attempt + triggering error) and
// `auto_retry_end` (attempt + success flag + final error, if any).
autoRetry?: { attempt: number; success?: boolean; error?: string };
};

function extractToolCalls( event: AgentSessionEvent ) {
if ( event.type !== 'message_end' || event.message.role !== 'assistant' ) {
return [];
Expand Down Expand Up @@ -168,15 +224,32 @@ async function runEval( input: EvalRunnerInput ) {
let numTurns = 0;
let numTurnsResult: number | null = null;
let success = false;
let interrupted = false;
let error: string | null = null;
let timedOut = false;
let resultStopReason: StopReason | null = null;
let resultText = '';
let resultErrorMessage: string | null = null;
let resultUsage: Usage | null = null;
const includeTranscript = process.env.STUDIO_EVAL_INCLUDE_TRANSCRIPT === '1';
const transcript: TranscriptEvent[] = [];
let transcriptIndex = 0;

phaseStartedAt = Date.now();
const session = SessionManager.inMemory( STUDIO_SITES_ROOT );
const queryStartedAt = Date.now();
let turnStart = queryStartedAt;

const handleEvent = ( event: AgentSessionEvent ): void => {
const transcriptEvent: TranscriptEvent | null = includeTranscript
? {
index: ++transcriptIndex,
type: event.type,
turnIndex,
elapsedMs: elapsed(),
}
: null;

if ( event.type === 'message_end' && event.message.role === 'assistant' ) {
const now = Date.now();
turnDurationsMs.push( now - turnStart );
Expand All @@ -185,8 +258,16 @@ async function runEval( input: EvalRunnerInput ) {
phaseTimingsMs.first_assistant_message_ms = now - queryStartedAt;
}
turnStart = now;
if ( transcriptEvent ) {
transcriptEvent.turnIndex = turnIndex;
transcriptEvent.stopReason = event.message.stopReason;
if ( event.message.errorMessage ) {
transcriptEvent.errorMessage = event.message.errorMessage;
}
}
}
for ( const tc of extractToolCalls( event ) ) {
const messageToolCalls = extractToolCalls( event );
for ( const tc of messageToolCalls ) {
toolCalls.push( tc );
toolNameById.set( tc.id, tc.name );
const evt: ToolEvent = {
Expand All @@ -199,7 +280,14 @@ async function runEval( input: EvalRunnerInput ) {
toolEvents.push( evt );
toolEventById.set( tc.id, evt );
}
textSegments.push( ...extractTextSegments( event ) );
if ( transcriptEvent && messageToolCalls.length > 0 ) {
transcriptEvent.toolCalls = messageToolCalls;
}
const messageTextSegments = extractTextSegments( event );
textSegments.push( ...messageTextSegments );
if ( transcriptEvent && messageTextSegments.length > 0 ) {
transcriptEvent.text = messageTextSegments.map( ( segment ) => truncateText( segment ) );
}

if ( event.type === 'tool_execution_end' ) {
const tr = extractToolResult( event );
Expand All @@ -226,19 +314,77 @@ async function runEval( input: EvalRunnerInput ) {
isError: tr.isError,
...( tr.text ? { text: tr.text } : {} ),
} );
if ( transcriptEvent ) {
transcriptEvent.toolResult = {
toolUseId: id,
toolName: toolNameById.get( id ) ?? null,
isError: tr.isError,
...( tr.text ? { text: truncateText( tr.text ) } : {} ),
};
}
}
}

if ( event.type === 'turn_end' ) {
numTurns += 1;
}

if ( event.type === 'compaction_start' || event.type === 'compaction_end' ) {
if ( transcriptEvent ) {
transcriptEvent.compaction = {
reason: event.reason,
...( event.type === 'compaction_end' ? { aborted: event.aborted } : {} ),
};
}
}

if ( event.type === 'auto_retry_start' ) {
if ( transcriptEvent ) {
transcriptEvent.autoRetry = {
attempt: event.attempt,
error: event.errorMessage,
};
}
}
if ( event.type === 'auto_retry_end' ) {
if ( transcriptEvent ) {
transcriptEvent.autoRetry = {
attempt: event.attempt,
success: event.success,
...( event.finalError ? { error: event.finalError } : {} ),
};
}
}

if ( event.type === 'agent_end' ) {
const lastAssistant = findLastAssistant( event.messages );
success =
! lastAssistant ||
( lastAssistant.stopReason !== 'error' && lastAssistant.stopReason !== 'aborted' );
interrupted = lastAssistant?.stopReason === 'aborted';
if ( lastAssistant ) {
resultStopReason = lastAssistant.stopReason;
resultErrorMessage = lastAssistant.errorMessage ?? null;
resultUsage = lastAssistant.usage;
resultText = lastAssistant.content
.filter( ( c ): c is { type: 'text'; text: string } => c.type === 'text' )
.map( ( c ) => c.text )
.join( '\n' )
.trim();
}
numTurnsResult = numTurns;
if ( transcriptEvent ) {
if ( resultStopReason ) {
transcriptEvent.stopReason = resultStopReason;
}
if ( resultErrorMessage ) {
transcriptEvent.errorMessage = resultErrorMessage;
}
}
}

if ( transcriptEvent ) {
transcript.push( transcriptEvent );
}
};

Expand All @@ -265,8 +411,15 @@ async function runEval( input: EvalRunnerInput ) {
}
phaseTimingsMs.total_eval_ms = elapsed();

const hasAnyAssistantText = textSegments.some( ( segment ) => segment.trim().length > 0 );
const hasSuccessfulMutatingTool = toolResults.some(
( tr ) => ! tr.isError && tr.toolName !== null && MUTATING_TOOL_NAMES.has( tr.toolName )
);
const producedNoUsefulOutput = success && ! hasAnyAssistantText && ! hasSuccessfulMutatingTool;

return {
success,
interrupted,
error,
timedOut,
numTurns: numTurnsResult,
Expand All @@ -277,6 +430,12 @@ async function runEval( input: EvalRunnerInput ) {
toolEvents,
firstToolError,
textSegments,
resultStopReason,
resultText,
resultErrorMessage,
resultUsage,
producedNoUsefulOutput,
...( includeTranscript ? { transcript } : {} ),
};
}

Expand Down
Loading