From 262f2981f7dfd5de622dad7cc017870aba104163 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 24 Mar 2026 02:38:02 +0000 Subject: [PATCH 1/3] feat: rebase multi-turn conversation example --- examples/features/.agentv/targets.yaml | 14 +- .../multi-turn-conversation/README.md | 33 +++++ .../evals/dataset.eval.baseline.jsonl | 2 + .../evals/dataset.eval.yaml | 125 ++++++++++++++++++ .../graders/context-retention.md | 33 +++++ .../graders/conversation-relevancy.md | 34 +++++ .../graders/role-adherence.md | 35 +++++ .../src/evaluation/evaluators/llm-grader.ts | 13 +- .../core/src/evaluation/evaluators/types.ts | 4 +- .../evaluation/evaluators_variables.test.ts | 37 ++++++ 10 files changed, 318 insertions(+), 12 deletions(-) create mode 100644 examples/features/multi-turn-conversation/README.md create mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl create mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.yaml create mode 100644 examples/features/multi-turn-conversation/graders/context-retention.md create mode 100644 examples/features/multi-turn-conversation/graders/conversation-relevancy.md create mode 100644 examples/features/multi-turn-conversation/graders/role-adherence.md diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index 091b3fe7e..4d23d73f6 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -4,11 +4,9 @@ targets: - name: default - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - # version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: uncomment to override default (2024-12-01-preview) + provider: gemini + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + model: ${{ GEMINI_MODEL_NAME }} - name: openai provider: openai @@ -23,7 +21,7 @@ targets: - name: codex provider: codex - grader_target: azure-llm + grader_target: 
gemini-llm # Uses the Codex CLI (defaults to `codex` on PATH) # executable: ${{ CODEX_CLI_PATH }} # Optional: override executable path # args: # Optional additional CLI arguments @@ -65,13 +63,13 @@ targets: - name: copilot provider: copilot model: gpt-5-mini - grader_target: azure-llm + grader_target: gemini-llm log_format: json # Claude - Anthropic's Claude Agent SDK - name: claude provider: claude - grader_target: azure-llm + grader_target: gemini-llm # model: claude-sonnet-4-20250514 # Optional: override model log_format: json # 'summary' (default) or 'json' for raw event logs # system_prompt: optional override (default instructs agent to include code in response) diff --git a/examples/features/multi-turn-conversation/README.md b/examples/features/multi-turn-conversation/README.md new file mode 100644 index 000000000..ffd39c5d6 --- /dev/null +++ b/examples/features/multi-turn-conversation/README.md @@ -0,0 +1,33 @@ +# Multi-Turn Conversation Evaluation + +Demonstrates evaluating multi-turn conversation quality using composable +`llm-grader` prompt templates with per-turn score breakdowns. + +## What this shows + +1. Multi-turn input with 4+ user/assistant turns where context retention matters +2. Conversation-aware grader prompts that receive the full `{{ input }}` message array +3. Per-turn score breakdown via structured `details` +4. Composability: multiple `llm-grader` evaluators combined with deterministic assertions + +## Grader dimensions + +| Grader | What it evaluates | +|-------|-------------------| +| `context-retention.md` | Does the agent remember information from earlier turns? | +| `conversation-relevancy.md` | Are responses relevant to the current request and conversation? | +| `role-adherence.md` | Does the agent maintain its assigned persona? | + +## Running + +```bash +bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation/evals/dataset.eval.yaml +``` + +## Creating your own conversation evaluator + +1. 
Create a markdown file in `graders/` +2. Use `{{ input }}` to receive the full conversation message array with roles +3. Use `{{ criteria }}` for the test-specific evaluation criteria +4. Instruct the grader to return `details` with per-turn metrics when useful +5. Reference it in your YAML with `type: llm-grader` and `prompt: ../graders/your-grader.md` (matching this example's layout, where the eval YAML lives in `evals/` next to `graders/`) diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl new file mode 100644 index 000000000..241575c1f --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -0,0 +1,2 @@ +{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","eval_set":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system 
instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. | role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]} +{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain 
focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]} diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml new file mode 100644 index 000000000..cc67a78f1 --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml @@ -0,0 +1,125 @@ +# Multi-turn conversation evaluation example +# Demonstrates conversation-level grading with composable llm-grader prompts +# and structured per-turn score breakdowns in the `details` field. + +description: Multi-turn conversation evaluation with per-turn score breakdown + +execution: + target: default + +tests: + - id: support-context-retention + criteria: |- + Agent maintains context across all turns: remembers customer name (Sarah), + order number (#98765), and Friday delivery deadline. Provides relevant, + helpful responses. 
Maintains professional support persona throughout. + + input: + - role: system + content: |- + You are a customer support agent for ShopFast. Be professional, empathetic, + and solution-oriented. Always address customers by name once introduced. + - role: user + content: |- + Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. + My order number is #98765. + - role: assistant + content: |- + Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up + order #98765 for you right away. I can see it was shipped on March 3rd via + standard delivery. It appears there's been a delay at the regional sorting + facility. Would you like me to check on expedited options? + - role: user + content: |- + Yes please. I really need it by Friday - it's a birthday gift for my daughter. + - role: assistant + content: |- + I completely understand the urgency, Sarah. A birthday gift definitely can't + be late! I've flagged order #98765 for priority handling. I can offer you two + options: (1) upgrade to express shipping at no extra cost, estimated delivery + Thursday, or (2) reroute to our nearest pickup location where it could be + available as early as Wednesday. + - role: user + content: |- + The express shipping sounds great. Also, can you add gift wrapping? + + expected_output: + - role: assistant + content: |- + Absolutely, Sarah! I've upgraded order #98765 to express shipping - it should + arrive by Thursday, well before Friday. I've also added complimentary gift + wrapping since this is a special occasion. You'll receive a tracking update + shortly. Is there anything else I can help with for your daughter's birthday? 
+ + assertions: + - name: context_retention + type: llm-grader + prompt: ../graders/context-retention.md + required: true + - name: conversation_relevancy + type: llm-grader + prompt: ../graders/conversation-relevancy.md + weight: 2 + - name: role_adherence + type: llm-grader + prompt: ../graders/role-adherence.md + - type: contains + value: "#98765" + + - id: support-troubleshooting-flow + criteria: |- + Agent tracks problem context (WiFi connectivity), steps already attempted + (router restart), and user's stated technical comfort level (beginner) + across turns. Does not repeat already-tried solutions. Adjusts explanation + complexity to match the user's level. + + input: + - role: system + content: |- + You are a technical support agent for NetConnect ISP. Adapt your + explanations to the customer's technical level. Never suggest steps + the customer has already tried. + - role: user + content: |- + My WiFi keeps dropping every 10 minutes. I already restarted my router + twice today. I'm not very tech-savvy so please keep it simple. + - role: assistant + content: |- + I'm sorry you're dealing with that - WiFi dropping out is really + frustrating! Since you've already restarted the router, let's skip that + step. Could you tell me: does the light on the front of your router stay + solid green, or does it blink or turn a different color when the WiFi drops? + - role: user + content: |- + It blinks orange for a few seconds then goes back to green. + - role: assistant + content: |- + That orange blinking usually means the router is briefly losing its + connection to our network. Let's try something simple: can you find the + small cable plugged into the back of your router labeled "WAN" or + "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. + Sometimes it just needs a better connection. + - role: user + content: |- + OK I did that. What if it happens again? 
+ + expected_output: + - role: assistant + content: |- + Great, let's see if that helps! If the WiFi drops again in the next + 30 minutes, it might be an issue on our end. In that case, I'll schedule + a technician visit at a time that works for you - no charge. Just let me + know if it happens again. + + assertions: + - name: context_retention + type: llm-grader + prompt: ../graders/context-retention.md + required: true + - name: conversation_relevancy + type: llm-grader + prompt: ../graders/conversation-relevancy.md + weight: 2 + - name: role_adherence + type: llm-grader + prompt: ../graders/role-adherence.md diff --git a/examples/features/multi-turn-conversation/graders/context-retention.md b/examples/features/multi-turn-conversation/graders/context-retention.md new file mode 100644 index 000000000..d103fa971 --- /dev/null +++ b/examples/features/multi-turn-conversation/graders/context-retention.md @@ -0,0 +1,33 @@ +You are evaluating whether an AI assistant retains context from earlier turns +in a multi-turn conversation. + +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate all assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. + +For each assistant turn, check whether the assistant correctly references or +builds on information introduced in previous turns, such as names, numbers, +constraints, and preferences. + +Score each assistant turn: +- 1.0 if the turn demonstrates awareness of relevant earlier context +- 0.5 if the turn partially retains context and forgets some details +- 0.0 if the turn ignores or contradicts earlier context + +In your `assertions`, include specific turn-level checks. 
+In your `details`, return: +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn +- `relevant_turns`: count of turns that demonstrated context retention +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation (all turns with roles) ## ]] +{{ input }} + +[[ ## agent response (final turn) ## ]] +{{ output_text }} diff --git a/examples/features/multi-turn-conversation/graders/conversation-relevancy.md b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md new file mode 100644 index 000000000..b051c04c3 --- /dev/null +++ b/examples/features/multi-turn-conversation/graders/conversation-relevancy.md @@ -0,0 +1,34 @@ +You are evaluating whether each assistant response in a multi-turn conversation +is relevant to the user's current request and the broader conversation context. + +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate all assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. + +For each assistant turn, assess: +- Does the response directly address what the user asked? +- Is the response appropriate given the full conversation history? +- Does the assistant avoid tangential or off-topic information? + +Score each assistant turn: +- 1.0 if the response is fully relevant to the user's request and context +- 0.5 if the response is partially relevant but includes unnecessary information +- 0.0 if the response is off-topic or fails to address the user's request + +In your `assertions`, include specific turn-level checks. 
+In your `details`, return: +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn +- `on_topic_turns`: count of turns scored 1.0 +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation (all turns with roles) ## ]] +{{ input }} + +[[ ## agent response (final turn) ## ]] +{{ output_text }} diff --git a/examples/features/multi-turn-conversation/graders/role-adherence.md b/examples/features/multi-turn-conversation/graders/role-adherence.md new file mode 100644 index 000000000..37ba78d8a --- /dev/null +++ b/examples/features/multi-turn-conversation/graders/role-adherence.md @@ -0,0 +1,35 @@ +You are evaluating whether an AI assistant maintains its assigned persona and +role consistently across a multi-turn conversation. + +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate all assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. + +Consider the system prompt, if present, as the role definition. For each +assistant turn, assess: +- Does the assistant stay in character? +- Is the tone consistent with the assigned role? +- Does the assistant avoid behavior inconsistent with its role? + +Score each assistant turn: +- 1.0 if the turn fully adheres to the assigned role and tone +- 0.5 if the turn partially adheres but shows inconsistency +- 0.0 if the turn breaks character or contradicts the assigned role + +In your `assertions`, include specific turn-level checks. 
+In your `details`, return: +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn +- `consistent_turns`: count of turns scored 1.0 +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation (all turns with roles) ## ]] +{{ input }} + +[[ ## agent response (final turn) ## ]] +{{ output_text }} diff --git a/packages/core/src/evaluation/evaluators/llm-grader.ts b/packages/core/src/evaluation/evaluators/llm-grader.ts index b57e75262..0f570e418 100644 --- a/packages/core/src/evaluation/evaluators/llm-grader.ts +++ b/packages/core/src/evaluation/evaluators/llm-grader.ts @@ -109,6 +109,10 @@ const freeformEvaluationSchema = z.object({ ) .describe('Per-aspect evaluation results — one entry per aspect checked') .optional(), + details: z + .record(z.unknown()) + .describe('Optional structured metadata for domain-specific metrics') + .optional(), }); const rubricCheckResultSchema = z.object({ @@ -261,6 +265,7 @@ export class LlmGraderEvaluator implements Evaluator { expectedAspectCount: Math.max(assertions.length, 1), evaluatorRawRequest, graderTarget: graderProvider.targetName, + details: data.details as JsonObject | undefined, tokenUsage, }; } catch (e: unknown) { @@ -784,7 +789,10 @@ export class LlmGraderEvaluator implements Evaluator { expectedAspectCount: Math.max(assertions.length, 1), evaluatorRawRequest, graderTarget, - details, + details: + data.details && Object.keys(data.details).length > 0 + ? 
({ ...details, ...data.details } as JsonObject) + : details, }; } catch { return { @@ -992,7 +1000,8 @@ export function buildOutputSchema(): string { ' "passed": ,', ' "evidence": ""', ' }', - ' ]', + ' ],', + ' "details": {}', '}', ].join('\n'); } diff --git a/packages/core/src/evaluation/evaluators/types.ts b/packages/core/src/evaluation/evaluators/types.ts index a831698dc..9a554bab9 100644 --- a/packages/core/src/evaluation/evaluators/types.ts +++ b/packages/core/src/evaluation/evaluators/types.ts @@ -59,7 +59,7 @@ export interface EvaluationScore { readonly expectedAspectCount: number; readonly evaluatorRawRequest?: JsonObject; readonly scores?: readonly ChildEvaluatorResult[]; - /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */ + /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */ readonly details?: JsonObject; /** Token usage from LLM calls made by this evaluator (optional). */ readonly tokenUsage?: TokenUsage; @@ -76,7 +76,7 @@ export interface ChildEvaluatorResult { readonly assertions: readonly import('../types.js').AssertionEntry[]; readonly evaluatorRawRequest?: JsonObject; readonly scores?: readonly ChildEvaluatorResult[]; - /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */ + /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */ readonly details?: JsonObject; /** Token usage from LLM calls made by this evaluator (optional). 
*/ readonly tokenUsage?: TokenUsage; diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index 69bffb726..f6fbad5e0 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -193,4 +193,41 @@ Expected Messages: {{ expected_output }} // Verify no unreplaced template markers remain expect(request?.question).not.toMatch(/\{\{\s*\w+\s*\}\}/); }); + + it('preserves freeform details returned by the grader', async () => { + const evaluator = new LlmGraderEvaluator({ + resolveGraderProvider: async () => undefined, + }); + + const result = ( + evaluator as unknown as { + parseAgentResult: ( + text: string, + rubrics: undefined, + evaluatorRawRequest: Record<string, unknown>, + details: Record<string, unknown>, + graderTarget?: string, + ) => { details?: Record<string, unknown> }; + } + ).parseAgentResult( + JSON.stringify({ + score: 0.75, + assertions: [{ text: 'Context retained', passed: true }], + details: { + scores_per_turn: [1, 0.5], + total_turns: 2, + }, + }), + undefined, + { userPrompt: 'Prompt' }, + { mode: 'delegate' }, + 'capturing', + ); + + expect(result.details).toEqual({ + mode: 'delegate', + scores_per_turn: [1, 0.5], + total_turns: 2, + }); + }); }); From 6157bde50f9a1e5f884ef0a3c2f4f2336b382c78 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 24 Mar 2026 02:45:42 +0000 Subject: [PATCH 2/3] docs: update env example for feature targets --- .env.example | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 3a6a1f566..9352eab69 100644 --- a/.env.example +++ b/.env.example @@ -18,8 +18,18 @@ OPENAI_MODEL=gpt-5-mini GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here GEMINI_MODEL_NAME=gemini-3-flash-preview +# OpenRouter +OPENROUTER_API_KEY=your-openrouter-api-key-here +OPENROUTER_MODEL=openai/gpt-5.4-mini + # Anthropic ANTHROPIC_API_KEY=your-anthropic-api-key-here +# Codex / agent 
providers +# Quote values if the path contains spaces. +CODEX_WORKSPACE_DIR=./.agentv/codex-workspaces +CODEX_LOG_DIR=./.agentv/logs/codex +PI_CLI_PATH=pi + # CLI provider sample (used by the local_cli target) -CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli \ No newline at end of file +CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli From 3a231f9a55b7ac7f48becd88222d0ac746c6636c Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 24 Mar 2026 02:59:34 +0000 Subject: [PATCH 3/3] chore: restore azure default feature target --- examples/features/.agentv/targets.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index 4d23d73f6..37fc2e370 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -4,9 +4,10 @@ targets: - name: default - provider: gemini - api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} - model: ${{ GEMINI_MODEL_NAME }} + provider: azure + endpoint: ${{ AZURE_OPENAI_ENDPOINT }} + api_key: ${{ AZURE_OPENAI_API_KEY }} + model: ${{ AZURE_DEPLOYMENT_NAME }} - name: openai provider: openai