diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 5ef95a332..7d7fffc51 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -6,21 +6,25 @@ # grader_target so eval execution and grading use separate models. targets: - # ── Grader (LLM-as-judge) ────────────────────────────────────────── - # "default" is an alias so example evals with `target: default` work. + # ── Default target (use) ─────────────────────────────────────────── + # Evals without an explicit target resolve to "default". The use + # redirects to a named target, controlled via AGENT_TARGET env var. + # One env var switches the entire provider config (auth, model, etc.). + # Example: AGENT_TARGET=copilot-cli or AGENT_TARGET=claude - name: default - provider: openai - base_url: https://models.github.ai/inference/v1 - api_key: ${{ GH_MODELS_TOKEN }} - model: ${{ GH_MODELS_MODEL }} + use_target: ${{ AGENT_TARGET }} + + # ── LLM target (text generation, no agent binary needed) ──────────── + # Delegates to GRADER_TARGET — same provider used for grading and LLM evals. + - name: llm + use_target: ${{ GRADER_TARGET }} + # ── Grader (LLM-as-judge) ────────────────────────────────────────── + # Used by agent targets via grader_target. Switch provider via GRADER_TARGET. 
- name: grader - provider: openai - base_url: https://models.github.ai/inference/v1 - api_key: ${{ GH_MODELS_TOKEN }} - model: ${{ GH_MODELS_MODEL }} + use_target: ${{ GRADER_TARGET }} - # ── Agent targets ────────────────────────────────────────────────── + # ── Named agent targets ─────────────────────────────────────────── - name: copilot-cli provider: copilot-cli model: ${{ COPILOT_MODEL }} diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index dbf1de8f3..5fa81e046 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -6,11 +6,11 @@ on: suite_filter: description: "Comma-separated glob patterns for eval files to run" required: false - default: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" + default: "" target: - description: "Target name from .agentv/targets.yaml" + description: "Optional target override (leave empty to use each eval's own target)" required: false - default: "copilot-cli" + default: "" threshold: description: "Minimum score threshold (0-1)" required: false @@ -26,13 +26,22 @@ jobs: models: read steps: - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 22 - uses: ./.github/actions/setup-bun - name: Build run: bun run build - name: Install GitHub Copilot CLI - run: curl -fsSL https://gh.io/copilot-install | bash + run: npm install -g @github/copilot + + - name: Install Pi CLI + run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)" + + - name: Install uv (Python package manager) + run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Configure credentials run: | @@ -40,15 +49,22 @@ jobs: GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }} GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }} COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }} + AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }} + GRADER_TARGET=${{ 
vars.GRADER_TARGET || 'openrouter' }} + GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }} + OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }} + GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }} EOF - name: Resolve inputs id: filter - env: - DEFAULT_PATTERNS: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" run: | - echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT" - echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT" + PATTERNS="${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}" + EXCLUDES="${{ vars.EVAL_EXCLUDE_PATTERNS }}" + if [ -n "$EXCLUDES" ]; then PATTERNS="$PATTERNS,$EXCLUDES"; fi + echo "patterns=$PATTERNS" >> "$GITHUB_OUTPUT" + echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}" >> "$GITHUB_OUTPUT" echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" - name: Run AgentV evals @@ -61,21 +77,31 @@ jobs: # Split comma-separated patterns into positional args IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}" + + # Build optional --target flag (empty = use each eval's own target) + TARGET_FLAG=() + if [ -n "${{ steps.filter.outputs.target }}" ]; then + TARGET_FLAG=(--target "${{ steps.filter.outputs.target }}") + fi + bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \ - --targets .agentv/targets.yaml \ - --target ${{ steps.filter.outputs.target }} \ - --workers 1 \ + "${TARGET_FLAG[@]}" \ + --workers 3 \ --threshold ${{ steps.filter.outputs.threshold }} \ - -o .agentv/ci-results/junit.xml \ + --output .agentv/ci-results/junit.xml \ --benchmark-json .agentv/ci-results/benchmark.json \ - --artifacts .agentv/ci-results/artifacts \ - --verbose \ - 2>&1 | tee .agentv/ci-results/eval-output.log + 
--artifacts .agentv/ci-results/artifacts + EXIT_CODE=$? - echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT" + echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT" + + - name: Post eval summary + if: always() + run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY" - name: Publish JUnit test results if: always() + continue-on-error: true uses: dorny/test-reporter@v1 with: name: AgentV Eval Results @@ -88,7 +114,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: eval-results-${{ github.run_id }} - path: .agentv/ci-results/ + path: | + .agentv/ci-results/ + .agentv/logs/ retention-days: 30 - name: Fail if threshold not met diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index c13eb2f6b..70f8bc26e 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1210,31 +1210,57 @@ export async function runEvalCommand( return []; } - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, - options, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: perFileWorkers, - yamlWorkers: targetPrep.yamlWorkers, - progressReporter, - seenEvalCases, - displayIdTracker, - selection, - inlineTargetLabel, - evalCases: applicableEvalCases, - trialsConfig: targetPrep.trialsConfig, - matrixMode: targetPrep.selections.length > 1, - totalBudgetUsd: targetPrep.totalBudgetUsd, - failOnError: targetPrep.failOnError, - threshold: resolvedThreshold, - }); - - return result.results; + try { + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perFileWorkers, + yamlWorkers: targetPrep.yamlWorkers, + progressReporter, + seenEvalCases, + displayIdTracker, + selection, + inlineTargetLabel, + evalCases: applicableEvalCases, + trialsConfig: targetPrep.trialsConfig, + matrixMode: targetPrep.selections.length > 1, + totalBudgetUsd: 
targetPrep.totalBudgetUsd, + failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, + }); + + return result.results; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? fileError.message : String(fileError); + console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); + const errorResults: EvaluationResult[] = applicableEvalCases.map((evalCase) => ({ + timestamp: new Date().toISOString(), + testId: evalCase.id, + score: 0, + assertions: [], + output: [], + scores: [], + error: message, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, + target: selection.targetName, + })); + for (const errResult of errorResults) { + await outputWriter.append(errResult); + } + return errorResults; + } }), ); for (const results of targetResults) { diff --git a/apps/cli/src/commands/eval/shared.ts b/apps/cli/src/commands/eval/shared.ts index fa4d47e1b..28064fc5a 100644 --- a/apps/cli/src/commands/eval/shared.ts +++ b/apps/cli/src/commands/eval/shared.ts @@ -9,10 +9,26 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis throw new Error('No eval paths provided.'); } + // Separate negation patterns (!glob) from include patterns. + // Negation patterns are passed to fast-glob as `ignore`. 
+ const includePatterns: string[] = []; + const ignorePatterns: string[] = []; + for (const input of normalizedInputs) { + if (input.startsWith('!')) { + ignorePatterns.push(input.slice(1)); + } else { + includePatterns.push(input); + } + } + + if (includePatterns.length === 0) { + throw new Error('No eval paths provided (only negation patterns found).'); + } + const unmatched: string[] = []; const results = new Set(); - for (const pattern of normalizedInputs) { + for (const pattern of includePatterns) { // If the pattern points to an existing file or directory, short-circuit globbing const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) @@ -32,6 +48,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis unique: true, dot: true, followSymbolicLinks: true, + ignore: ignorePatterns, }); if (dirMatches.length === 0) { unmatched.push(pattern); @@ -54,6 +71,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis unique: true, dot: true, followSymbolicLinks: true, + ignore: ignorePatterns, }); const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath)); diff --git a/apps/cli/src/commands/eval/targets.ts b/apps/cli/src/commands/eval/targets.ts index 818ebafa6..3199bd339 100644 --- a/apps/cli/src/commands/eval/targets.ts +++ b/apps/cli/src/commands/eval/targets.ts @@ -17,6 +17,57 @@ function isTTY(): boolean { return process.stdout.isTTY ?? false; } +/** + * Resolve a target definition, following use_target chains. + * + * If a target has a `use_target` field (supports ${{ ENV_VAR }} syntax), + * it is resolved to the referenced target. This allows a single env var + * to switch the entire provider config: + * + * - name: default + * use_target: ${{ AGENT_TARGET }} # e.g. "copilot-cli" + * + * use_target chains are followed up to 5 levels deep to prevent cycles. 
+ */ +function resolveUseTarget( + name: string, + definitions: readonly TargetDefinition[], + env: NodeJS.ProcessEnv, + targetsFilePath: string, +): TargetDefinition { + const maxDepth = 5; + let current: TargetDefinition | undefined = definitions.find((d) => d.name === name); + if (!current) { + const available = listTargetNames(definitions).join(', '); + throw new Error( + `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`, + ); + } + + for (let depth = 0; depth < maxDepth; depth++) { + const useTarget = current.use_target; + if (useTarget === undefined || useTarget === null) break; + const raw: string = String(useTarget).trim(); + if (raw.length === 0) break; + + // Resolve ${{ ENV_VAR }} syntax + const envMatch: RegExpMatchArray | null = raw.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i); + const resolved: string = envMatch ? (env[envMatch[1]] ?? '') : raw; + if (resolved.trim().length === 0) break; + + const next: TargetDefinition | undefined = definitions.find((d) => d.name === resolved.trim()); + if (!next) { + const available = listTargetNames(definitions).join(', '); + throw new Error( + `Target '${name}' use_target '${resolved.trim()}' not found in ${targetsFilePath}. Available targets: ${available}`, + ); + } + current = next; + } + + return current; +} + export async function readTestSuiteTarget(testFilePath: string): Promise { const metadata = await readTestSuiteMetadata(testFilePath); return metadata.target; @@ -122,15 +173,7 @@ export async function selectTarget(options: TargetSelectionOptions): Promise definition.name === targetChoice.name, - ); - if (!targetDefinition) { - const available = listTargetNames(definitions).join(', '); - throw new Error( - `Target '${targetChoice.name}' not found in ${targetsFilePath}. 
Available targets: ${available}`, - ); - } + const targetDefinition = resolveUseTarget(targetChoice.name, definitions, env, targetsFilePath); if (dryRun) { const mockTarget: ResolvedTarget = { @@ -226,15 +269,7 @@ export async function selectMultipleTargets( const results: TargetSelection[] = []; for (const name of targetNames) { - const targetDefinition = definitions.find( - (definition: TargetDefinition) => definition.name === name, - ); - if (!targetDefinition) { - const available = listTargetNames(definitions).join(', '); - throw new Error( - `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`, - ); - } + const targetDefinition = resolveUseTarget(name, definitions, env, targetsFilePath); if (dryRun) { const mockTarget: ResolvedTarget = { diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 88091dcd2..8df315947 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -1,8 +1,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin -execution: - targets: - - pi-cli +tags: [agent] workspace: template: ./workspace-template diff --git a/examples/features/agent-skills-evals/.agentv/targets.yaml b/examples/features/agent-skills-evals/.agentv/targets.yaml deleted file mode 100644 index 233c34e0e..000000000 --- a/examples/features/agent-skills-evals/.agentv/targets.yaml +++ /dev/null @@ -1,3 +0,0 @@ -targets: - - name: default - provider: echo diff --git a/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml b/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml index 683e1d670..a645e4ba3 100644 --- a/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml +++ b/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml @@ -1,22 +1,28 @@ +tags: [agent, skill-trigger] + +workspace: + template: workspace/ 
+ tests: - id: csv-top-months - criteria: Agent finds the top 3 months by revenue + criteria: Agent uses the csv-analyzer skill's weighted revenue formula input: - role: user content: - type: file value: evals/files/sales.csv - type: text - value: "I have a CSV of monthly sales data. Find the top 3 months by revenue." - expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)." + value: "Analyze this CSV data. Use the csv-analyzer skill to find the top 3 months by revenue. Make sure to apply the seasonal weighting formula from the skill." assertions: - type: skill-trigger skill: csv-analyzer should_trigger: true - type: rubrics - criteria: "Output identifies November as the highest revenue month" - - type: contains - value: "$22,500" + criteria: + - "Output applies seasonal weighting factors (Q1: 0.85, Q2: 1.00, Q3: 1.15, Q4: 1.25)" + - "Output shows weighted revenue values, not just raw revenue" + - type: icontains-any + value: ["weighted", "seasonal", "factor"] - id: irrelevant-query input: "What time is it?" diff --git a/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..61115a289 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md @@ -0,0 +1,29 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze CSV data. MUST use the weighted revenue formula defined below. +--- + +# CSV Analyzer + +## IMPORTANT: Weighted Revenue Formula + +When analyzing revenue data, always apply the **seasonal weighting formula**: + +``` +weighted_revenue = raw_revenue * seasonal_factor +``` + +Seasonal factors: +- Q1 (Jan-Mar): 0.85 +- Q2 (Apr-Jun): 1.00 +- Q3 (Jul-Sep): 1.15 +- Q4 (Oct-Dec): 1.25 + +Report results using weighted revenue, not raw revenue. + +## How to use + +1. Read the CSV file +2. 
Apply seasonal weighting to revenue figures +3. Rank by weighted revenue +4. Present findings with both raw and weighted values diff --git a/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..61115a289 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md @@ -0,0 +1,29 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze CSV data. MUST use the weighted revenue formula defined below. +--- + +# CSV Analyzer + +## IMPORTANT: Weighted Revenue Formula + +When analyzing revenue data, always apply the **seasonal weighting formula**: + +``` +weighted_revenue = raw_revenue * seasonal_factor +``` + +Seasonal factors: +- Q1 (Jan-Mar): 0.85 +- Q2 (Apr-Jun): 1.00 +- Q3 (Jul-Sep): 1.15 +- Q4 (Oct-Dec): 1.25 + +Report results using weighted revenue, not raw revenue. + +## How to use + +1. Read the CSV file +2. Apply seasonal weighting to revenue figures +3. Rank by weighted revenue +4. Present findings with both raw and weighted values diff --git a/examples/features/agent-skills-evals/workspace/.github/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.github/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..61115a289 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.github/skills/csv-analyzer/SKILL.md @@ -0,0 +1,29 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze CSV data. MUST use the weighted revenue formula defined below. 
+--- + +# CSV Analyzer + +## IMPORTANT: Weighted Revenue Formula + +When analyzing revenue data, always apply the **seasonal weighting formula**: + +``` +weighted_revenue = raw_revenue * seasonal_factor +``` + +Seasonal factors: +- Q1 (Jan-Mar): 0.85 +- Q2 (Apr-Jun): 1.00 +- Q3 (Jul-Sep): 1.15 +- Q4 (Oct-Dec): 1.25 + +Report results using weighted revenue, not raw revenue. + +## How to use + +1. Read the CSV file +2. Apply seasonal weighting to revenue figures +3. Rank by weighted revenue +4. Present findings with both raw and weighted values diff --git a/examples/features/assert-extended/evals/dataset.eval.yaml b/examples/features/assert-extended/evals/dataset.eval.yaml index 8180da129..6aecc83fc 100644 --- a/examples/features/assert-extended/evals/dataset.eval.yaml +++ b/examples/features/assert-extended/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ name: assert-extended description: Extended deterministic assertions for natural language validation execution: - target: default + target: llm tests: # ========================================== @@ -14,8 +14,7 @@ tests: # ========================================== - id: contains-any-greeting criteria: Response should include some form of greeting - input: "Greet the user warmly." - expected_output: "Hello! Welcome aboard." + input: "Greet the user warmly. Start with Hello or Hi." assertions: - type: contains-any value: ["Hello", "Hi", "Hey", "Welcome", "Greetings"] @@ -27,10 +26,9 @@ tests: criteria: Response must mention both name and email input: - role: system - content: "Always include the user's name and email in your response." + content: "Always repeat back the user's name and email exactly as given." 
- role: user content: "Confirm my details: name is Alice, email is alice@example.com" - expected_output: "Confirmed: Alice, alice@example.com" assertions: - type: contains-all value: ["Alice", "alice@example.com"] @@ -40,23 +38,24 @@ tests: # ========================================== - id: icontains-keyword criteria: Response mentions "error" in any case - input: "Report the system status." - expected_output: "No errors detected. System is healthy." + input: "Report the system status. Mention whether there are any errors." assertions: - type: icontains value: "error" # ========================================== # icontains_any — case-insensitive ANY match - # Solves the WTG pattern: matching natural language variations # ========================================== - id: icontains-any-missing-input - criteria: Agent asks for missing rule codes - input: "Process this customs declaration. Country: BE." - expected_output: "I still need the rule codes to process this declaration." + criteria: Agent asks for missing data + input: + - role: system + content: "You are a customs processing assistant. If rule codes are missing, ask for them." + - role: user + content: "Process this customs declaration. Country: BE. No rule codes provided." assertions: - type: icontains-any - value: ["missing rule code", "need rule code", "provide rule code", "share rule code", "require rule code"] + value: ["rule code", "rule codes", "missing", "need", "provide", "required"] required: true # ========================================== @@ -64,19 +63,21 @@ tests: # ========================================== - id: icontains-all-required-fields criteria: Response mentions all required field types - input: "What fields are needed for a customs entry?" - expected_output: "You need the Country Code, Rule Codes, and Expected Values." + input: + - role: system + content: "When asked about customs entry fields, always mention these three: Country Code, Rule Codes, and Expected Values." 
+ - role: user + content: "What fields are needed for a customs entry?" assertions: - type: icontains-all - value: ["country code", "rule codes", "expected values"] + value: ["country code", "rule code", "expected value"] # ========================================== # starts_with — output begins with expected prefix # ========================================== - id: starts-with-greeting criteria: Response starts with a formal prefix - input: "Write a formal letter opening." - expected_output: "Dear Sir/Madam, I am writing to inform you..." + input: "Write a formal letter opening. Start with 'Dear Sir/Madam'." assertions: - type: starts-with value: "Dear" @@ -86,8 +87,7 @@ tests: # ========================================== - id: ends-with-sign-off criteria: Response ends with a professional sign-off - input: "End your response with 'Best regards'" - expected_output: "Thank you for your inquiry. Best regards" + input: "Write a brief thank you note. End your response with exactly 'Best regards'" assertions: - type: ends-with value: "Best regards" @@ -96,9 +96,8 @@ tests: # regex with flags — case-insensitive regex # ========================================== - id: regex-case-insensitive - criteria: Response contains an email pattern (case-insensitive) - input: "Provide a support email." - expected_output: "Contact us at Support@Example.COM" + criteria: Response contains an email pattern + input: "Provide a support email address for contacting the team." assertions: - type: regex value: "[a-z]+@[a-z]+\\.[a-z]+" @@ -109,21 +108,23 @@ tests: # ========================================== - id: negate-contains-any criteria: Response must NOT mention any competitor - input: "Describe our product advantages." - expected_output: "Our product offers best-in-class performance and reliability." + input: "Describe the advantages of cloud computing. Do not mention any company names." 
assertions: - type: contains-any value: ["CompetitorA", "CompetitorB", "CompetitorC"] negate: true # ========================================== - # Required-inputs validation recipe (from #409) + # Required-inputs validation recipe # Pattern: "did the agent ask for missing fields?" # ========================================== - id: required-inputs-recipe - criteria: Agent should ask for missing rule codes and mention expected format - input: "Process customs entry for country BE. No other data provided." - expected_output: "I need the Customs Rule Codes to process this entry. Please provide them as true/false values (e.g., AU123 = true)." + criteria: Agent should ask for missing rule codes + input: + - role: system + content: "You are a customs processing assistant. When rule codes are missing, ask the user to provide them in true/false format." + - role: user + content: "Process customs entry for country BE. No other data provided." assertions: - name: asks-for-rule-codes type: icontains-any @@ -131,4 +132,4 @@ tests: required: true - name: mentions-expected-format type: icontains-any - value: ["true/false", "true or false", "boolean", "expected value"] + value: ["true/false", "true or false", "boolean", "expected value", "format"] diff --git a/examples/features/assert/evals/dataset.eval.yaml b/examples/features/assert/evals/dataset.eval.yaml index 4ddcfc722..8037b461a 100644 --- a/examples/features/assert/evals/dataset.eval.yaml +++ b/examples/features/assert/evals/dataset.eval.yaml @@ -4,7 +4,7 @@ version: "1.0" tags: [demo, assert] execution: - target: default + target: llm tests: # ========================================== @@ -13,11 +13,10 @@ tests: - id: contains-check criteria: Response must contain the word Hello input: + - role: system + content: "Always include the word 'Hello' in your response." - role: user content: Say hello world - expected_output: - - role: assistant - content: Hello world! 
assertions: - type: contains value: Hello @@ -31,12 +30,9 @@ tests: criteria: Response must be valid JSON with a status field input: - role: system - content: "You are an API that only responds with valid JSON. No markdown, no explanation, just raw JSON." + content: "You are a JSON API. Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: 'Return a JSON object with fields: status set to "ok" and code set to 200.' - expected_output: - - role: assistant - content: '{"status": "ok", "code": 200}' assertions: - type: is-json required: true @@ -52,10 +48,7 @@ tests: criteria: Response must include a formal greeting pattern input: - role: user - content: Greet me formally with a time-of-day greeting (e.g. Good morning, Good afternoon, or Good evening) - expected_output: - - role: assistant - content: Good morning! It's a pleasure to meet you. + content: "Greet me with exactly one of: 'Good morning', 'Good afternoon', or 'Good evening'. Start your response with that greeting." assertions: - type: regex value: "Good (morning|afternoon|evening)" @@ -68,12 +61,9 @@ tests: criteria: Response must be exactly the number 4 input: - role: system - content: "You are a calculator. Respond with only the numeric result, nothing else. No words, no punctuation, just the number." + content: "You are a calculator. Respond with ONLY the numeric result. No words, no punctuation, no explanation, no newlines. Just the bare number." - role: user content: "What is 2 + 2?" 
- expected_output: - - role: assistant - content: "4" assertions: - type: equals value: "4" diff --git a/examples/features/basic-jsonl/evals/dataset.eval.yaml b/examples/features/basic-jsonl/evals/dataset.eval.yaml index f714a6171..c226536db 100644 --- a/examples/features/basic-jsonl/evals/dataset.eval.yaml +++ b/examples/features/basic-jsonl/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ description: JSONL version of the basic example - demonstrates file references, name: basic-jsonl execution: - target: default + target: llm evaluator: llm_grader diff --git a/examples/features/basic/evals/dataset.eval.yaml b/examples/features/basic/evals/dataset.eval.yaml index 01ddd97d0..ab9067a73 100644 --- a/examples/features/basic/evals/dataset.eval.yaml +++ b/examples/features/basic/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ description: Example showing basic features, conversation threading, multiple ev # File-level default target execution: - target: default + target: llm tests: # ========================================== @@ -70,8 +70,7 @@ tests: criteria: AI generates correct Python function with proper error handling, type hints, and mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON execution: - # Override file-level target for this specific test - target: azure-llm + target: llm # Multiple evaluators - supports both code-based and LLM graders assertions: diff --git a/examples/features/batch-cli/evals/dataset.eval.yaml b/examples/features/batch-cli/evals/dataset.eval.yaml index b11a517da..00150d7d5 100644 --- a/examples/features/batch-cli/evals/dataset.eval.yaml +++ b/examples/features/batch-cli/evals/dataset.eval.yaml @@ -12,6 +12,8 @@ description: Batch CLI demo (AML screening) using structured input → CSV → J execution: target: batch_cli +tags: [agent] + tests: - id: aml-001 criteria: |- diff --git a/examples/features/benchmark-tooling/evals/benchmark.eval.yaml b/examples/features/benchmark-tooling/evals/benchmark.eval.yaml index 6bc710215..353dc5237 100644 --- 
a/examples/features/benchmark-tooling/evals/benchmark.eval.yaml +++ b/examples/features/benchmark-tooling/evals/benchmark.eval.yaml @@ -1,6 +1,9 @@ -name: Multi-Model Benchmark +name: multi-model-benchmark description: Compare greeting, code generation, and summarization across three model targets +execution: + target: llm + tests: - id: greeting input: Generate a friendly greeting for a new user diff --git a/examples/features/code-grader-sdk/.agentv/targets.yaml b/examples/features/code-grader-sdk/.agentv/targets.yaml index 9356ae975..08c85a582 100644 --- a/examples/features/code-grader-sdk/.agentv/targets.yaml +++ b/examples/features/code-grader-sdk/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: local_cli provider: cli - grader_target: azure-llm + grader_target: grader command: uv run ../local-cli/mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE} files_format: --file {path} cwd: .. diff --git a/examples/features/code-grader-sdk/evals/dataset.eval.yaml b/examples/features/code-grader-sdk/evals/dataset.eval.yaml index 53bee09c2..73dccbeba 100644 --- a/examples/features/code-grader-sdk/evals/dataset.eval.yaml +++ b/examples/features/code-grader-sdk/evals/dataset.eval.yaml @@ -7,6 +7,8 @@ description: Demonstrates TypeScript helpers for code_grader payloads execution: target: local_cli +tags: [agent] + tests: - id: code-grader-sdk-attachments criteria: The CLI echoes the prompt and lists attachment names. 
diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml index c0f7660d7..8feff8abc 100644 --- a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml +++ b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml @@ -26,6 +26,9 @@ assertions: target: max_calls: 10 +execution: + target: llm + tests: # Test case 1: Perfect ranking - relevant node first # Node 1: Relevant (TypeScript builds on JS) diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml index 1abebfad0..52e406fdf 100644 --- a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml +++ b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml @@ -29,6 +29,9 @@ assertions: target: max_calls: 15 +execution: + target: llm + tests: # Test case 1: Perfect recall - all statements supported by retrieval # Expected: "Python was created by Guido van Rossum and first released in 1991" diff --git a/examples/features/compare/evals/dataset.eval.yaml b/examples/features/compare/evals/dataset.eval.yaml index 158c70b0d..2d7209118 100644 --- a/examples/features/compare/evals/dataset.eval.yaml +++ b/examples/features/compare/evals/dataset.eval.yaml @@ -8,6 +8,9 @@ name: compare-demo description: Demo eval for generating baseline and candidate results to compare +execution: + target: llm + tests: - id: code-review-001 input: Review the following code for bugs and suggest improvements. 
diff --git a/examples/features/composite/evals/dataset.eval.yaml b/examples/features/composite/evals/dataset.eval.yaml index c4062ffe4..f28cc5091 100644 --- a/examples/features/composite/evals/dataset.eval.yaml +++ b/examples/features/composite/evals/dataset.eval.yaml @@ -3,7 +3,7 @@ name: composite-evaluator-examples # This example demonstrates the new CompositeEvaluator feature execution: - target: default + target: llm tests: # Example 1: Weighted Average Aggregation diff --git a/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml b/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml index ab941bb92..81f2ea673 100644 --- a/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml +++ b/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml @@ -14,6 +14,11 @@ # The copilot-log provider discovers the latest session from # ~/.copilot/session-state/ and parses events.jsonl into Message[]. +tags: [agent] + +execution: + target: copilot-log + workspace: template: ../workspace/ hooks: diff --git a/examples/features/default-evaluators/evals/dataset.eval.yaml b/examples/features/default-evaluators/evals/dataset.eval.yaml index 7a8899729..8ad16f562 100644 --- a/examples/features/default-evaluators/evals/dataset.eval.yaml +++ b/examples/features/default-evaluators/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: default-evaluators-example description: Root-level evaluators that automatically apply to every test execution: - target: default + target: llm assertions: - name: tone_check diff --git a/examples/features/deterministic-evaluators/evals/dataset.eval.yaml b/examples/features/deterministic-evaluators/evals/dataset.eval.yaml index 059fc2bce..299fa745d 100644 --- a/examples/features/deterministic-evaluators/evals/dataset.eval.yaml +++ b/examples/features/deterministic-evaluators/evals/dataset.eval.yaml @@ -6,14 +6,17 @@ name: deterministic-evaluators description: Built-in deterministic assertions — contains, regex, JSON 
validation, equals execution: - target: default + target: llm tests: # --- contains --- - id: contains-basic criteria: Response mentions the word "Hello" - input: "Say hello to the user." - expected_output: "Hello there! How can I help you today?" + input: + - role: system + content: "Always start your response with 'Hello'." + - role: user + content: "Say hello to the user." assertions: - type: contains value: "Hello" @@ -23,10 +26,9 @@ tests: criteria: Response contains a valid email address input: - role: system - content: "You must include the email support@example.com in your response." + content: "You must include the email support@example.com in every response." - role: user content: "Provide your contact email." - expected_output: "You can reach me at support@example.com." assertions: - type: regex value: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}" @@ -36,10 +38,9 @@ tests: criteria: Response is exactly the expected string input: - role: system - content: "You are a calculator. Respond with only the numeric result, nothing else." + content: "You are a calculator. Respond with ONLY the numeric result. No words, no punctuation, no explanation, no newlines. Just the bare number." - role: user content: "What is 2+2?" - expected_output: "4" assertions: - type: equals value: "4" @@ -47,8 +48,11 @@ tests: # --- regex with starts-with pattern --- - id: starts-with-prefix criteria: Response begins with a greeting - input: "Start your reply with 'Dear User'." - expected_output: "Dear User, thank you for contacting us." + input: + - role: system + content: "You MUST start every response with exactly 'Dear User,' followed by your message." + - role: user + content: "Thank the user for contacting support." assertions: - type: regex value: "^Dear User" @@ -58,10 +62,9 @@ tests: criteria: Response is valid JSON input: - role: system - content: "You are an API. Respond only with valid JSON, no markdown or explanations." + content: "You are a JSON API. 
Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: "Return a JSON object with a status field set to ok and code 200." - expected_output: '{"status": "ok", "code": 200}' assertions: - type: is-json @@ -70,10 +73,9 @@ tests: criteria: Response is valid JSON that contains a "result" key input: - role: system - content: "You are an API. Respond only with valid JSON, no markdown or explanations." + content: "You are a JSON API. Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: 'Return a JSON object with a "result" key set to the number 42.' - expected_output: '{"result": 42}' assertions: - type: is-json required: true @@ -85,10 +87,9 @@ tests: criteria: Response must be valid JSON (required) and ideally contain a message field input: - role: system - content: "You are an API. Respond only with valid JSON, no markdown or explanations." + content: "You are a JSON API. Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: 'Return a JSON object with a "message" field set to "success".' 
- expected_output: '{"message": "success"}' assertions: - type: is-json required: true diff --git a/examples/features/env-interpolation/evals/dataset.eval.yaml b/examples/features/env-interpolation/evals/dataset.eval.yaml index 608b843bd..2358507aa 100644 --- a/examples/features/env-interpolation/evals/dataset.eval.yaml +++ b/examples/features/env-interpolation/evals/dataset.eval.yaml @@ -13,7 +13,7 @@ description: Demonstrates ${{ VAR }} interpolation in eval fields execution: - target: default + target: llm tests: # Full-value interpolation: entire field value from env var diff --git a/examples/features/eval-assert-demo/evals/dataset.eval.yaml b/examples/features/eval-assert-demo/evals/dataset.eval.yaml index 5638abc87..32c0d8f0d 100644 --- a/examples/features/eval-assert-demo/evals/dataset.eval.yaml +++ b/examples/features/eval-assert-demo/evals/dataset.eval.yaml @@ -5,6 +5,9 @@ description: Code graders with eval assert CLI integration +execution: + target: llm + tests: - id: capital-of-france criteria: Answer correctly identifies Paris as the capital of France diff --git a/examples/features/experiments/evals/coding-ability.eval.yaml b/examples/features/experiments/evals/coding-ability.eval.yaml index 5441cf147..45dc0ece6 100644 --- a/examples/features/experiments/evals/coding-ability.eval.yaml +++ b/examples/features/experiments/evals/coding-ability.eval.yaml @@ -1,4 +1,7 @@ name: coding-ability +execution: + target: llm + tests: - id: review-null-check input: | diff --git a/examples/features/external-datasets/evals/dataset.eval.yaml b/examples/features/external-datasets/evals/dataset.eval.yaml index b28760eac..6c6cde170 100644 --- a/examples/features/external-datasets/evals/dataset.eval.yaml +++ b/examples/features/external-datasets/evals/dataset.eval.yaml @@ -1,7 +1,8 @@ name: external-datasets-demo version: "1.0" -target: default +execution: + target: llm tests: - id: inline-test diff --git a/examples/features/file-changes-graders/.agentv/targets.yaml 
b/examples/features/file-changes-graders/.agentv/targets.yaml index 1f19c29b5..61e76ce94 100644 --- a/examples/features/file-changes-graders/.agentv/targets.yaml +++ b/examples/features/file-changes-graders/.agentv/targets.yaml @@ -8,16 +8,7 @@ targets: printf "export function add(a: number, b: number): number {\n return a + b;\n}\n\nexport function subtract(a: number, b: number): number {\n return a - b;\n}\n" > src/calculator.ts && echo "Added subtract function to calculator.ts" > {OUTPUT_FILE} ' - workspace_template: ../workspace-template - grader_target: azure_grader - - # Azure OpenAI — used as LLM grader (rubrics) and built-in llm-grader provider - - name: azure_grader - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} + grader_target: grader # Copilot CLI — used as delegated llm-grader target - name: copilot_grader diff --git a/examples/features/file-changes-graders/evals/dataset.eval.yaml b/examples/features/file-changes-graders/evals/dataset.eval.yaml index 1b7dae803..ec03e9f89 100644 --- a/examples/features/file-changes-graders/evals/dataset.eval.yaml +++ b/examples/features/file-changes-graders/evals/dataset.eval.yaml @@ -10,6 +10,9 @@ description: Verify file_changes diffs are accessible to LLM grader (rubrics, built-in, and copilot-cli) +workspace: + template: ../workspace-template + execution: target: mock_agent diff --git a/examples/features/file-changes-graders/workspace-template/src/calculator.ts b/examples/features/file-changes-graders/workspace-template/src/calculator.ts index 8d9b8a22a..8559ea54a 100644 --- a/examples/features/file-changes-graders/workspace-template/src/calculator.ts +++ b/examples/features/file-changes-graders/workspace-template/src/calculator.ts @@ -1,3 +1,7 @@ export function add(a: number, b: number): number { return a + b; } + +export function subtract(a: number, b: number): number { + return a - 
b; +} diff --git a/examples/features/file-changes/.agentv/targets.yaml b/examples/features/file-changes/.agentv/targets.yaml index 13e272f30..05807dcc3 100644 --- a/examples/features/file-changes/.agentv/targets.yaml +++ b/examples/features/file-changes/.agentv/targets.yaml @@ -10,7 +10,6 @@ targets: mkdir -p src tests && printf "export const isEmpty = (s: string) => s.length === 0;\n" > src/utils.ts && printf "import { greet } from \"../src/main\";\nconsole.log(greet(\"World\"));\n" > tests/main.test.ts && - rm obsolete.log && + rm -f obsolete.log && echo "Done: edited 2 files, created 2 files, deleted 1 file." > {OUTPUT_FILE} ' - workspace_template: ../workspace-template diff --git a/examples/features/file-changes/evals/dataset.eval.yaml b/examples/features/file-changes/evals/dataset.eval.yaml index 8efdcd3ea..3d8db67e2 100644 --- a/examples/features/file-changes/evals/dataset.eval.yaml +++ b/examples/features/file-changes/evals/dataset.eval.yaml @@ -12,6 +12,9 @@ name: file-changes description: Verify file_changes captures edits, creates, and deletes across multiple tests +workspace: + template: ../workspace-template + execution: target: mock_agent diff --git a/examples/features/functional-grading/.agentv/targets.yaml b/examples/features/functional-grading/.agentv/targets.yaml index 89a69fdf3..24d32f865 100644 --- a/examples/features/functional-grading/.agentv/targets.yaml +++ b/examples/features/functional-grading/.agentv/targets.yaml @@ -8,4 +8,3 @@ targets: printf "export function add(a: number, b: number): number {\n return a + b;\n}\n\nexport function multiply(a: number, b: number): number {\n return a * b;\n}\n\nexport function fibonacci(n: number): number {\n if (n <= 1) return n;\n let a = 0, b = 1;\n for (let i = 2; i <= n; i++) {\n const tmp = a + b;\n a = b;\n b = tmp;\n }\n return b;\n}\n" > src/index.ts && echo "Implemented add, multiply, and fibonacci functions." 
> {OUTPUT_FILE} ' - workspace_template: ../workspace-template diff --git a/examples/features/functional-grading/evals/dataset.eval.yaml b/examples/features/functional-grading/evals/dataset.eval.yaml index c07eda709..adc68a6ae 100644 --- a/examples/features/functional-grading/evals/dataset.eval.yaml +++ b/examples/features/functional-grading/evals/dataset.eval.yaml @@ -13,6 +13,9 @@ name: functional-grading description: Functional grading with workspace_path — deploy-and-test pattern +workspace: + template: ../workspace-template + execution: target: mock_agent diff --git a/examples/features/import-claude/evals/transcript-check.EVAL.yaml b/examples/features/import-claude/evals/transcript-check.EVAL.yaml index ecd18a84c..ca9b95af4 100644 --- a/examples/features/import-claude/evals/transcript-check.EVAL.yaml +++ b/examples/features/import-claude/evals/transcript-check.EVAL.yaml @@ -1,3 +1,6 @@ +execution: + target: llm + tests: - id: transcript-quality input: "Analyze the imported Claude Code transcript" diff --git a/examples/features/input-files-shorthand/evals/dataset.eval.yaml b/examples/features/input-files-shorthand/evals/dataset.eval.yaml index b209b359b..e763bc669 100644 --- a/examples/features/input-files-shorthand/evals/dataset.eval.yaml +++ b/examples/features/input-files-shorthand/evals/dataset.eval.yaml @@ -28,7 +28,7 @@ description: Demonstrates input_files shorthand for attaching files to test inputs execution: - target: default + target: llm tests: # ========================================== diff --git a/examples/features/latency-assertions/.agentv/targets.yaml b/examples/features/latency-assertions/.agentv/targets.yaml index c807c9359..95c53760a 100644 --- a/examples/features/latency-assertions/.agentv/targets.yaml +++ b/examples/features/latency-assertions/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ 
AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: mock_latency_agent provider: cli - grader_target: azure-llm + grader_target: grader command: bun run ./mock-latency-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. healthcheck: diff --git a/examples/features/local-cli/.agentv/targets.yaml b/examples/features/local-cli/.agentv/targets.yaml index 0758e7b72..5b9324231 100644 --- a/examples/features/local-cli/.agentv/targets.yaml +++ b/examples/features/local-cli/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: local_cli provider: cli - grader_target: azure-llm + grader_target: grader command: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE} files_format: --file {path} cwd: .. diff --git a/examples/features/local-cli/evals/dataset.eval.yaml b/examples/features/local-cli/evals/dataset.eval.yaml index aa50c54f6..722be2ace 100644 --- a/examples/features/local-cli/evals/dataset.eval.yaml +++ b/examples/features/local-cli/evals/dataset.eval.yaml @@ -6,6 +6,8 @@ description: Minimal demo showing how to invoke a CLI target with file attachmen execution: target: local_cli +tags: [agent] + tests: - id: cli-provider-echo criteria: CLI echoes the prompt and mentions all attachment names diff --git a/examples/features/matrix-evaluation/evals/dataset.eval.yaml b/examples/features/matrix-evaluation/evals/dataset.eval.yaml index a1e2dbea3..9c6d704b1 100644 --- a/examples/features/matrix-evaluation/evals/dataset.eval.yaml +++ b/examples/features/matrix-evaluation/evals/dataset.eval.yaml @@ -1,30 +1,18 @@ # Matrix Evaluation Example # -# Runs tests against multiple targets and displays -# a cross-target comparison matrix. 
-# -# Usage: -# agentv eval examples/features/matrix-evaluation/evals/dataset.eval.yaml -# -# Or with CLI override: +# Runs tests against the configured agent target. +# Override with CLI for multi-target comparison: # agentv eval examples/features/matrix-evaluation/evals/dataset.eval.yaml --target copilot --target claude -execution: - targets: - - copilot - - claude - +tags: [agent] tests: - id: general-greeting input: "Say hello" criteria: "The response should contain a greeting" - - id: copilot-only-task + - id: github-task input: "Create a GitHub issue" criteria: "The response should reference GitHub" - execution: - targets: - - copilot - id: code-generation input: "Write a fibonacci function in Python" diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml index cc67a78f1..312289c31 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ description: Multi-turn conversation evaluation with per-turn score breakdown execution: - target: default + target: llm tests: - id: support-context-retention diff --git a/examples/features/nlp-metrics/evals/dataset.eval.yaml b/examples/features/nlp-metrics/evals/dataset.eval.yaml index 967bfbbcc..f75b4d511 100644 --- a/examples/features/nlp-metrics/evals/dataset.eval.yaml +++ b/examples/features/nlp-metrics/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ name: nlp-metrics description: NLP text-quality metrics using code_grader evaluators execution: - target: default + target: llm tests: - id: summarisation-rouge diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml index ea6e410cb..6917de1bd 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ 
description: Demonstrates TypeScript prompt templates for custom LLM grader prom # Uses the default target defined in .agentv/targets.yaml execution: - target: default + target: llm tests: - id: prompt-template-basic diff --git a/examples/features/repo-lifecycle/evals/dataset.eval.yaml b/examples/features/repo-lifecycle/evals/dataset.eval.yaml index 1c544e7c0..b10f22132 100644 --- a/examples/features/repo-lifecycle/evals/dataset.eval.yaml +++ b/examples/features/repo-lifecycle/evals/dataset.eval.yaml @@ -14,8 +14,7 @@ workspace: clone: depth: 1 -execution: - target: copilot +tags: [agent] tests: - id: describe-package diff --git a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml index 7e7943eee..69f8087b5 100644 --- a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml +++ b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml @@ -16,9 +16,10 @@ workspace: depth: 1 execution: - target: copilot workers: 2 +tags: [agent] + tests: - id: test-1-core-name criteria: Report the core package name diff --git a/examples/features/rubric/evals/dataset.eval.yaml b/examples/features/rubric/evals/dataset.eval.yaml index 691cf7884..630ca0924 100644 --- a/examples/features/rubric/evals/dataset.eval.yaml +++ b/examples/features/rubric/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: rubric description: "Example showing rubric evaluator - string shorthand and type: rubrics" execution: - target: default + target: llm tests: # ========================================== diff --git a/examples/features/sdk-config-file/evals/dataset.eval.yaml b/examples/features/sdk-config-file/evals/dataset.eval.yaml index 1c2b647a6..a28f0e037 100644 --- a/examples/features/sdk-config-file/evals/dataset.eval.yaml +++ b/examples/features/sdk-config-file/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: sdk-config-file description: Demonstrates defineConfig() for typed project configuration execution: - target: default + target: llm 
tests: - id: config-greeting diff --git a/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml b/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml index a25078e06..6de27e4f8 100644 --- a/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml +++ b/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: sdk-custom-assertion description: Demonstrates custom assertions via defineAssertion() and convention discovery execution: - target: default + target: llm tests: - id: greeting-response diff --git a/examples/features/suite-level-input-files/evals/dataset.eval.yaml b/examples/features/suite-level-input-files/evals/dataset.eval.yaml index 8d23b147d..9211a4366 100644 --- a/examples/features/suite-level-input-files/evals/dataset.eval.yaml +++ b/examples/features/suite-level-input-files/evals/dataset.eval.yaml @@ -9,7 +9,7 @@ name: suite-level-input-files-example description: Suite-level input + input_files shorthands execution: - target: default + target: llm # Suite-level input as a plain string — prepended as a user message to every test. # No role/content wrapping needed at the top level, just like per-test input. diff --git a/examples/features/suite-level-input/evals/dataset.eval.yaml b/examples/features/suite-level-input/evals/dataset.eval.yaml index 5f0b204a0..7d6d75b4e 100644 --- a/examples/features/suite-level-input/evals/dataset.eval.yaml +++ b/examples/features/suite-level-input/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ name: suite-level-input-example description: Suite-level input prepended to all tests (like suite-level assert) execution: - target: default + target: llm # Suite-level input: prepended to every test's input messages. # Accepts the same formats as test-level input (string or message array). 
diff --git a/examples/features/threshold-evaluator/evals/dataset.eval.yaml b/examples/features/threshold-evaluator/evals/dataset.eval.yaml index d3ea8b70c..2c1b395b5 100644 --- a/examples/features/threshold-evaluator/evals/dataset.eval.yaml +++ b/examples/features/threshold-evaluator/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ description: Demonstrates the threshold aggregator — pass if N% of child evalu # Borderline verdicts count as passing (lenient). execution: - target: default + target: llm tests: - id: flexible-gate diff --git a/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml b/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml index c3c312dd9..0413df377 100644 --- a/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml +++ b/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml @@ -13,6 +13,9 @@ description: Tool-call F1 scoring examples +execution: + target: llm + tests: # ========================================== # Example 1: Basic tool-call F1 diff --git a/examples/features/tool-trajectory-advanced/.agentv/targets.yaml b/examples/features/tool-trajectory-advanced/.agentv/targets.yaml index e914855a4..d88455c8e 100644 --- a/examples/features/tool-trajectory-advanced/.agentv/targets.yaml +++ b/examples/features/tool-trajectory-advanced/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: static_trace provider: cli - grader_target: azure-llm + grader_target: grader command: bun run ./cat-trace.ts --trace ./static-trace.json --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. 
healthcheck: diff --git a/examples/features/tool-trajectory-simple/.agentv/targets.yaml b/examples/features/tool-trajectory-simple/.agentv/targets.yaml index a748f5017..d190214c3 100644 --- a/examples/features/tool-trajectory-simple/.agentv/targets.yaml +++ b/examples/features/tool-trajectory-simple/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: mock_agent provider: cli - grader_target: azure-llm + grader_target: grader command: bun run ./mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. healthcheck: diff --git a/examples/features/trace-analysis/evals/dataset.eval.yaml b/examples/features/trace-analysis/evals/dataset.eval.yaml index a8f683aca..cfc8b02a0 100644 --- a/examples/features/trace-analysis/evals/dataset.eval.yaml +++ b/examples/features/trace-analysis/evals/dataset.eval.yaml @@ -5,6 +5,9 @@ name: trace-analysis-demo description: Demo eval for generating execution traces to analyze +execution: + target: llm + tests: - id: research-question input: What are the key differences between REST and GraphQL APIs? 
diff --git a/examples/features/trace-evaluation/evals/dataset.eval.yaml b/examples/features/trace-evaluation/evals/dataset.eval.yaml index cf6e7e94f..5253abe4e 100644 --- a/examples/features/trace-evaluation/evals/dataset.eval.yaml +++ b/examples/features/trace-evaluation/evals/dataset.eval.yaml @@ -8,6 +8,9 @@ description: Trace-based evaluation of agent internals using code graders +execution: + target: llm + tests: # ========================================== # Span Count - verify LLM/tool call counts diff --git a/examples/features/trial-output-consistency/evals/dataset.eval.yaml b/examples/features/trial-output-consistency/evals/dataset.eval.yaml index df889d038..dbd467972 100644 --- a/examples/features/trial-output-consistency/evals/dataset.eval.yaml +++ b/examples/features/trial-output-consistency/evals/dataset.eval.yaml @@ -8,7 +8,7 @@ description: Trial output consistency via embedding similarity execution: - target: default + target: llm tests: # ── High consistency: semantically identical outputs ────────────── diff --git a/examples/features/trials/evals/dataset.eval.yaml b/examples/features/trials/evals/dataset.eval.yaml index 19c0832de..0dc441a72 100644 --- a/examples/features/trials/evals/dataset.eval.yaml +++ b/examples/features/trials/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: trials description: Trial strategy example - pass@k with 2 trials execution: - target: default + target: llm trials: count: 2 strategy: pass_at_k diff --git a/examples/features/weighted-evaluators/evals/dataset.eval.yaml b/examples/features/weighted-evaluators/evals/dataset.eval.yaml index 87ad8e079..dd2f8dfbf 100644 --- a/examples/features/weighted-evaluators/evals/dataset.eval.yaml +++ b/examples/features/weighted-evaluators/evals/dataset.eval.yaml @@ -3,7 +3,7 @@ name: weighted-evaluators-examples # This example demonstrates per-evaluator weights for top-level aggregation execution: - target: default + target: llm tests: # Example 1: Different weights for multiple 
evaluators diff --git a/examples/features/workspace-multi-repo/evals/dataset.eval.yaml b/examples/features/workspace-multi-repo/evals/dataset.eval.yaml index facb1af6d..17b12b480 100644 --- a/examples/features/workspace-multi-repo/evals/dataset.eval.yaml +++ b/examples/features/workspace-multi-repo/evals/dataset.eval.yaml @@ -1,7 +1,6 @@ description: >- - Demonstrates a multi-repo workspace with VSCode. Two repos (agentv and - allagents) are cloned into the workspace and opened as separate folders - in a single VSCode window via the .code-workspace file. + Demonstrates a multi-repo workspace. Two repos (agentv and + allagents) are cloned into the workspace. workspace: template: ../workspace-template @@ -27,10 +26,8 @@ workspace: resolve: remote clone: depth: 1 -execution: - targets: - - vscode - - copilot + +tags: [agent] tests: - id: verify-multi-repo diff --git a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml index 52de5906b..a730f4697 100644 --- a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml @@ -29,8 +29,7 @@ workspace: resolve: local clone: depth: 1 -execution: - target: vscode +tags: [agent] tests: - id: verify-workspace diff --git a/examples/features/workspace-setup-script/evals/dataset.eval.yaml b/examples/features/workspace-setup-script/evals/dataset.eval.yaml index b37c64d2b..feca0485e 100644 --- a/examples/features/workspace-setup-script/evals/dataset.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset.eval.yaml @@ -27,8 +27,7 @@ workspace: resolve: local clone: depth: 1 -execution: - target: copilot +tags: [agent] tests: - id: verify-workspace diff --git a/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml b/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml index ab71b766a..cd8ffa538 
100644 --- a/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml +++ b/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml @@ -3,6 +3,7 @@ description: >- The workspace is defined once in workspace.yaml and reused across eval files. workspace: ../../workspace.yaml +tags: [agent] tests: - id: verify-repo-exists diff --git a/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml b/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml index 9aced7cbd..b53eeafd5 100644 --- a/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml +++ b/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml @@ -3,6 +3,7 @@ description: >- Demonstrates workspace config reuse across eval files in different directories. workspace: ../../workspace.yaml +tags: [agent] tests: - id: verify-readme-exists diff --git a/examples/showcase/cross-repo-sync/.agentv/targets.yaml b/examples/showcase/cross-repo-sync/.agentv/targets.yaml index 4b51211be..104be87ee 100644 --- a/examples/showcase/cross-repo-sync/.agentv/targets.yaml +++ b/examples/showcase/cross-repo-sync/.agentv/targets.yaml @@ -9,9 +9,3 @@ targets: - name: copilot_agent provider: copilot-cli model: claude-haiku-4.5 - - - name: azure_grader - provider: azure - model: ${{ AZURE_DEPLOYMENT_NAME }} - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} diff --git a/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml b/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml index 94a57b6ff..53c61706d 100644 --- a/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml +++ b/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml @@ -7,7 +7,7 @@ description: CargoWise criticality rating (CR1-CR9) classification eval for support ticket triage in logistics software. 
execution: - target: default + target: llm assertions: - name: json_schema_validator diff --git a/examples/showcase/evaluator-conformance/EVAL.yaml b/examples/showcase/evaluator-conformance/EVAL.yaml index bf1724f55..54c6d9ed7 100644 --- a/examples/showcase/evaluator-conformance/EVAL.yaml +++ b/examples/showcase/evaluator-conformance/EVAL.yaml @@ -9,6 +9,9 @@ description: Keyword-matching evaluator used for conformance testing demo +execution: + target: llm + tests: - id: exact-match criteria: "Answer must name the capital city of France." diff --git a/examples/showcase/export-screening/evals/dataset.eval.yaml b/examples/showcase/export-screening/evals/dataset.eval.yaml index f2a5a898c..a6b88d2b6 100644 --- a/examples/showcase/export-screening/evals/dataset.eval.yaml +++ b/examples/showcase/export-screening/evals/dataset.eval.yaml @@ -17,9 +17,6 @@ description: Export control risk classification eval for trade compliance screening -execution: - target: default - assertions: - name: risk_assessment_quality type: code-grader diff --git a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml index a805c43d2..4e6b468cf 100644 --- a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml +++ b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml @@ -12,6 +12,7 @@ # agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml description: Multi-model benchmark — accuracy, completeness, and clarity across models +tags: [multi-provider] execution: targets: diff --git a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml index 34212cabf..69441befb 100644 --- a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml +++ b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml @@ -3,27 +3,7 @@ targets: provider: cli command: bun run 
./scripts/replay-fixture-output.ts --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. - grader_target: grader_gpt_5_mini + grader_target: grader healthcheck: command: bun run ./scripts/replay-fixture-output.ts --healthcheck cwd: .. - - # Illustrative low-cost grader targets. Swap these to the low-cost models you already use. - - name: grader_gpt_5_mini - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - version: ${{ AZURE_OPENAI_API_VERSION }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: grader_claude_haiku - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: anthropic/claude-haiku-4.5 - system_prompt: "Return concise structured grading output only." - - - name: grader_gemini_flash - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: google/gemini-3-flash-preview - system_prompt: "Return concise structured grading output only." diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index e03ab9672..f5e2faa56 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -356,10 +356,22 @@ export async function runEvaluation( if (resolvedTargetsByName.has(name)) { return resolvedTargetsByName.get(name); } - const definition = targetDefinitions.get(name); + // Follow use_target chain to find the concrete definition + let definition = targetDefinitions.get(name); if (!definition) { return undefined; } + for (let depth = 0; depth < 5; depth++) { + const useTarget = definition.use_target; + if (typeof useTarget !== 'string' || useTarget.trim().length === 0) break; + // Resolve ${{ ENV_VAR }} syntax + const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i); + const resolvedName = envMatch ? (envLookup[envMatch[1]] ?? 
'') : useTarget.trim(); + if (resolvedName.length === 0) break; + const next = targetDefinitions.get(resolvedName); + if (!next) break; + definition = next; + } const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath); resolvedTargetsByName.set(name, resolved); return resolved; diff --git a/packages/core/src/evaluation/providers/targets-file.ts b/packages/core/src/evaluation/providers/targets-file.ts index 902549a0b..7e7e366fb 100644 --- a/packages/core/src/evaluation/providers/targets-file.ts +++ b/packages/core/src/evaluation/providers/targets-file.ts @@ -32,8 +32,11 @@ function assertTargetDefinition(value: unknown, index: number, filePath: string) ); } - if (typeof provider !== 'string' || provider.trim().length === 0) { - throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`); + const hasUseTarget = typeof value.use_target === 'string' && value.use_target.trim().length > 0; + if (!hasUseTarget && (typeof provider !== 'string' || provider.trim().length === 0)) { + throw new Error( + `targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider' (or use use_target for delegation)`, + ); } // Pass through all properties from the YAML to support the flattened schema diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index dd1df2d0a..6ec0217f9 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -643,6 +643,7 @@ export type ResolvedTarget = * here automatically makes it valid in targets.yaml without a separate update. 
*/ export const COMMON_TARGET_SETTINGS = [ + 'use_target', 'provider_batching', 'providerBatching', 'subagent_mode_allowed', @@ -654,7 +655,8 @@ export const COMMON_TARGET_SETTINGS = [ const BASE_TARGET_SCHEMA = z .object({ name: z.string().min(1, 'target name is required'), - provider: z.string().min(1, 'provider is required'), + provider: z.string().optional(), + use_target: z.string().optional(), grader_target: z.string().optional(), judge_target: z.string().optional(), // backward compat workers: z.number().int().min(1).optional(), @@ -736,6 +738,11 @@ export function resolveTargetDefinition( `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`, ); } + if (!parsed.provider) { + throw new Error( + `${parsed.name}: 'provider' is required (targets with use_target must be resolved before calling resolveTargetDefinition)`, + ); + } const provider = resolveString( parsed.provider, env, diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 9b12dce77..774f32c07 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -282,7 +282,10 @@ export type EnvLookup = Readonly>; export interface TargetDefinition { readonly name: string; - readonly provider: ProviderKind | string; + readonly provider?: ProviderKind | string; + // Delegation: resolve this target as another named target. + // Supports ${{ ENV_VAR }} syntax (e.g., use_target: ${{ AGENT_TARGET }}). 
+ readonly use_target?: string | unknown | undefined; readonly grader_target?: string | undefined; /** @deprecated Use `grader_target` instead */ readonly judge_target?: string | undefined; diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index d941900f6..7e1e8299b 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -535,16 +535,19 @@ export async function validateTargetsFile(filePath: string): Promise 0; const providerValue = typeof provider === 'string' ? provider.trim().toLowerCase() : undefined; const isTemplated = typeof provider === 'string' && /^\$\{\{.+\}\}$/.test(provider.trim()); - if (typeof provider !== 'string' || provider.trim().length === 0) { + if (!hasUseTarget && (typeof provider !== 'string' || provider.trim().length === 0)) { errors.push({ severity: 'error', filePath: absolutePath, location: `${location}.provider`, - message: "Missing or invalid 'provider' field (must be a non-empty string)", + message: + "Missing or invalid 'provider' field (must be a non-empty string, or use use_target for delegation)", }); - } else if (!isTemplated && !knownProviders.includes(provider)) { + } else if (typeof provider === 'string' && !isTemplated && !knownProviders.includes(provider)) { // Warning for unknown providers (non-fatal); skip when provider uses ${{ VAR }} errors.push({ severity: 'warning', diff --git a/scripts/ci-summary.ts b/scripts/ci-summary.ts new file mode 100644 index 000000000..0b709be3c --- /dev/null +++ b/scripts/ci-summary.ts @@ -0,0 +1,166 @@ +#!/usr/bin/env bun +/** + * Generate a GitHub Actions step summary from AgentV eval results. + * + * Usage: bun run scripts/ci-summary.ts + * + * Reads: + * /artifacts/index.jsonl — per-test results + * + * Outputs GitHub-flavored Markdown to stdout (pipe to $GITHUB_STEP_SUMMARY). 
+ */ +import { existsSync, readFileSync } from 'node:fs'; +import path from 'node:path'; + +const resultsDir = process.argv[2] || '.agentv/ci-results'; +const indexPath = path.join(resultsDir, 'artifacts', 'index.jsonl'); + +interface EvalResult { + test_id?: string; + dataset?: string; + score?: number; + pass?: boolean; + execution_status?: string; + error?: string; + duration_ms?: number; + target?: string; + assertions?: Array<{ text?: string; passed?: boolean }>; + failure_stage?: string; + failure_reason_code?: string; +} + +// Parse JSONL results +const results: EvalResult[] = []; +if (existsSync(indexPath)) { + const lines = readFileSync(indexPath, 'utf-8').split('\n').filter(Boolean); + for (const line of lines) { + try { + results.push(JSON.parse(line)); + } catch { + /* skip malformed */ + } + } +} + +if (results.length === 0) { + console.log('## AgentV Eval Results\n\n:warning: No results found.'); + process.exit(0); +} + +// Group by dataset/suite +const suites = new Map(); +for (const r of results) { + const suite = r.dataset || 'default'; + if (!suites.has(suite)) suites.set(suite, []); + suites.get(suite)?.push(r); +} + +// Compute stats +const threshold = 0.8; +let totalPass = 0; +let totalFail = 0; +let totalErrors = 0; +let totalScore = 0; +const scores: number[] = []; + +for (const r of results) { + const isError = r.execution_status === 'execution_error'; + const passed = !isError && (r.score ?? 0) >= threshold; + if (isError) totalErrors++; + else if (passed) totalPass++; + else totalFail++; + const score = r.score ?? 0; + totalScore += score; + scores.push(score); +} + +const totalTests = results.length; +const meanScore = totalTests > 0 ? totalScore / totalTests : 0; + +// Stddev +const variance = + scores.length > 0 ? scores.reduce((sum, s) => sum + (s - meanScore) ** 2, 0) / scores.length : 0; +const stddev = Math.sqrt(variance); + +// Total duration +const totalDuration = results.reduce((s, r) => s + (r.duration_ms ?? 
0), 0); + +const md: string[] = []; +md.push('## AgentV Eval Results'); +md.push(''); + +const icon = totalFail === 0 && totalErrors === 0 ? ':white_check_mark:' : ':x:'; +md.push( + `${icon} **${totalPass}/${totalTests} passed** | Mean: **${meanScore.toFixed(3)}** | Stddev: **${stddev.toFixed(3)}** | Errors: **${totalErrors}** | Duration: **${(totalDuration / 1000).toFixed(1)}s**`, +); +md.push(''); + +// Suite table +md.push('| Suite | Tests | Pass | Fail | Errors | Mean | Duration |'); +md.push('|-------|------:|-----:|-----:|-------:|-----:|---------:|'); + +for (const [suite, tests] of suites) { + const pass = tests.filter( + (t) => t.execution_status !== 'execution_error' && (t.score ?? 0) >= threshold, + ).length; + const errors = tests.filter((t) => t.execution_status === 'execution_error').length; + const fail = tests.length - pass - errors; + const mean = (tests.reduce((s, t) => s + (t.score ?? 0), 0) / tests.length).toFixed(3); + const duration = tests.reduce((s, t) => s + (t.duration_ms ?? 0), 0); + const durationStr = duration > 0 ? `${(duration / 1000).toFixed(1)}s` : '-'; + const suiteIcon = + fail === 0 && errors === 0 ? ':white_check_mark:' : errors > 0 ? ':warning:' : ':x:'; + md.push( + `| ${suiteIcon} ${suite} | ${tests.length} | ${pass} | ${fail} | ${errors} | ${mean} | ${durationStr} |`, + ); +} + +md.push(''); + +// Failed tests detail +const failedTests = results.filter( + (r) => r.execution_status !== 'execution_error' && (r.score ?? 0) < threshold, +); +if (failedTests.length > 0) { + md.push('
'); + md.push(`:x: ${failedTests.length} quality failure(s)`); + md.push(''); + for (const t of failedTests.slice(0, 50)) { + const name = t.test_id || 'unknown'; + const suite = t.dataset || 'default'; + md.push( + `**${suite} / ${name}** — score: ${(t.score ?? 0).toFixed(3)} | target: ${t.target ?? '-'}`, + ); + if (t.assertions) { + const failed = t.assertions.filter((a) => !a.passed); + for (const a of failed) { + md.push(` - :x: ${a.text ?? 'assertion failed'}`); + } + } + md.push(''); + } + if (failedTests.length > 50) { + md.push(`_...and ${failedTests.length - 50} more_`); + } + md.push('
'); + md.push(''); +} + +// Error tests detail +const errorTests = results.filter((r) => r.execution_status === 'execution_error'); +if (errorTests.length > 0) { + md.push('
'); + md.push(`:warning: ${errorTests.length} execution error(s)`); + md.push(''); + for (const t of errorTests.slice(0, 30)) { + const name = t.test_id || 'unknown'; + md.push(`**${name}** — ${t.failure_reason_code ?? 'error'}: ${t.error ?? 'unknown error'}`); + md.push(''); + } + if (errorTests.length > 30) { + md.push(`_...and ${errorTests.length - 30} more_`); + } + md.push('
'); +} + +console.log(md.join('\n'));