Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,18 @@ OPENAI_MODEL=gpt-5-mini
GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
GEMINI_MODEL_NAME=gemini-3-flash-preview

# OpenRouter
OPENROUTER_API_KEY=your-openrouter-api-key-here
OPENROUTER_MODEL=openai/gpt-5.4-mini

# Anthropic
ANTHROPIC_API_KEY=your-anthropic-api-key-here

# Codex / agent providers
# Quote values if the path contains spaces.
CODEX_WORKSPACE_DIR=./.agentv/codex-workspaces
CODEX_LOG_DIR=./.agentv/logs/codex
PI_CLI_PATH=pi

# CLI provider sample (used by the local_cli target)
CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
7 changes: 3 additions & 4 deletions examples/features/.agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ targets:
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
# version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: uncomment to override default (2024-12-01-preview)

- name: openai
provider: openai
Expand All @@ -23,7 +22,7 @@ targets:

- name: codex
provider: codex
grader_target: azure-llm
grader_target: gemini-llm
# Uses the Codex CLI (defaults to `codex` on PATH)
# executable: ${{ CODEX_CLI_PATH }} # Optional: override executable path
# args: # Optional additional CLI arguments
Expand Down Expand Up @@ -65,13 +64,13 @@ targets:
- name: copilot
provider: copilot
model: gpt-5-mini
grader_target: azure-llm
grader_target: gemini-llm
log_format: json

# Claude - Anthropic's Claude Agent SDK
- name: claude
provider: claude
grader_target: azure-llm
grader_target: gemini-llm
# model: claude-sonnet-4-20250514 # Optional: override model
log_format: json # 'summary' (default) or 'json' for raw event logs
# system_prompt: optional override (default instructs agent to include code in response)
33 changes: 33 additions & 0 deletions examples/features/multi-turn-conversation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Multi-Turn Conversation Evaluation

Demonstrates evaluating multi-turn conversation quality using composable
`llm-grader` prompt templates with per-turn score breakdowns.

## What this shows

1. Multi-turn input with 4+ user/assistant turns where context retention matters
2. Conversation-aware grader prompts that receive the full `{{ input }}` message array
3. Per-turn score breakdown via structured `details`
4. Composability: multiple `llm-grader` evaluators combined with deterministic assertions

## Grader dimensions

| Grader | What it evaluates |
|-------|-------------------|
| `context-retention.md` | Does the agent remember information from earlier turns? |
| `conversation-relevancy.md` | Are responses relevant to the current request and conversation? |
| `role-adherence.md` | Does the agent maintain its assigned persona? |

## Running

```bash
bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation/evals/dataset.eval.yaml
```

## Creating your own conversation evaluator

1. Create a markdown file in `graders/`
2. Use `{{ input }}` to receive the full conversation message array with roles
3. Use `{{ criteria }}` for the test-specific evaluation criteria
4. Instruct the grader to return `details` with per-turn metrics when useful
5. Reference it in your YAML with `type: llm-grader` and `prompt: ./graders/your-grader.md`
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","eval_set":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. | role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}
{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]}
125 changes: 125 additions & 0 deletions examples/features/multi-turn-conversation/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Multi-turn conversation evaluation example
# Demonstrates conversation-level grading with composable llm-grader prompts
# and structured per-turn score breakdowns in the `details` field.

description: Multi-turn conversation evaluation with per-turn score breakdown

execution:
target: default

tests:
- id: support-context-retention
criteria: |-
Agent maintains context across all turns: remembers customer name (Sarah),
order number (#98765), and Friday delivery deadline. Provides relevant,
helpful responses. Maintains professional support persona throughout.

input:
- role: system
content: |-
You are a customer support agent for ShopFast. Be professional, empathetic,
and solution-oriented. Always address customers by name once introduced.
- role: user
content: |-
Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.
My order number is #98765.
- role: assistant
content: |-
Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up
order #98765 for you right away. I can see it was shipped on March 3rd via
standard delivery. It appears there's been a delay at the regional sorting
facility. Would you like me to check on expedited options?
- role: user
content: |-
Yes please. I really need it by Friday - it's a birthday gift for my daughter.
- role: assistant
content: |-
I completely understand the urgency, Sarah. A birthday gift definitely can't
be late! I've flagged order #98765 for priority handling. I can offer you two
options: (1) upgrade to express shipping at no extra cost, estimated delivery
Thursday, or (2) reroute to our nearest pickup location where it could be
available as early as Wednesday.
- role: user
content: |-
The express shipping sounds great. Also, can you add gift wrapping?

expected_output:
- role: assistant
content: |-
Absolutely, Sarah! I've upgraded order #98765 to express shipping - it should
arrive by Thursday, well before Friday. I've also added complimentary gift
wrapping since this is a special occasion. You'll receive a tracking update
shortly. Is there anything else I can help with for your daughter's birthday?

assertions:
- name: context_retention
type: llm-grader
prompt: ../graders/context-retention.md
required: true
- name: conversation_relevancy
type: llm-grader
prompt: ../graders/conversation-relevancy.md
weight: 2
- name: role_adherence
type: llm-grader
prompt: ../graders/role-adherence.md
- type: contains
value: "#98765"

- id: support-troubleshooting-flow
criteria: |-
Agent tracks problem context (WiFi connectivity), steps already attempted
(router restart), and user's stated technical comfort level (beginner)
across turns. Does not repeat already-tried solutions. Adjusts explanation
complexity to match the user's level.

input:
- role: system
content: |-
You are a technical support agent for NetConnect ISP. Adapt your
explanations to the customer's technical level. Never suggest steps
the customer has already tried.
- role: user
content: |-
My WiFi keeps dropping every 10 minutes. I already restarted my router
twice today. I'm not very tech-savvy so please keep it simple.
- role: assistant
content: |-
I'm sorry you're dealing with that - WiFi dropping out is really
frustrating! Since you've already restarted the router, let's skip that
step. Could you tell me: does the light on the front of your router stay
solid green, or does it blink or turn a different color when the WiFi drops?
- role: user
content: |-
It blinks orange for a few seconds then goes back to green.
- role: assistant
content: |-
That orange blinking usually means the router is briefly losing its
connection to our network. Let's try something simple: can you find the
small cable plugged into the back of your router labeled "WAN" or
"Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly.
Sometimes it just needs a better connection.
- role: user
content: |-
OK I did that. What if it happens again?

expected_output:
- role: assistant
content: |-
Great, let's see if that helps! If the WiFi drops again in the next
30 minutes, it might be an issue on our end. In that case, I'll schedule
a technician visit at a time that works for you - no charge. Just let me
know if it happens again.

assertions:
- name: context_retention
type: llm-grader
prompt: ../graders/context-retention.md
required: true
- name: conversation_relevancy
type: llm-grader
prompt: ../graders/conversation-relevancy.md
weight: 2
- name: role_adherence
type: llm-grader
prompt: ../graders/role-adherence.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
You are evaluating whether an AI assistant retains context from earlier turns
in a multi-turn conversation.

Below you will see the full conversation with role annotations (system, user,
assistant). The conversation history contains prior turns, and the agent
response is the final assistant turn. Evaluate all assistant turns across the
entire conversation. Number assistant turns sequentially starting from Turn 1.

For each assistant turn, check whether the assistant correctly references or
builds on information introduced in previous turns, such as names, numbers,
constraints, and preferences.

Score each assistant turn:
- 1.0 if the turn demonstrates awareness of relevant earlier context
- 0.5 if the turn partially retains context and forgets some details
- 0.0 if the turn ignores or contradicts earlier context

In your `assertions`, include specific turn-level checks.
In your `details`, return:
- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
- `relevant_turns`: count of turns that demonstrated context retention
- `total_turns`: total number of assistant turns evaluated

Your overall `score` should be the average of per-turn scores.

[[ ## criteria ## ]]
{{ criteria }}

[[ ## conversation (all turns with roles) ## ]]
{{ input }}

[[ ## agent response (final turn) ## ]]
{{ output_text }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
You are evaluating whether each assistant response in a multi-turn conversation
is relevant to the user's current request and the broader conversation context.

Below you will see the full conversation with role annotations (system, user,
assistant). The conversation history contains prior turns, and the agent
response is the final assistant turn. Evaluate all assistant turns across the
entire conversation. Number assistant turns sequentially starting from Turn 1.

For each assistant turn, assess:
- Does the response directly address what the user asked?
- Is the response appropriate given the full conversation history?
- Does the assistant avoid tangential or off-topic information?

Score each assistant turn:
- 1.0 if the response is fully relevant to the user's request and context
- 0.5 if the response is partially relevant but includes unnecessary information
- 0.0 if the response is off-topic or fails to address the user's request

In your `assertions`, include specific turn-level checks.
In your `details`, return:
- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
- `on_topic_turns`: count of turns scored 1.0
- `total_turns`: total number of assistant turns evaluated

Your overall `score` should be the average of per-turn scores.

[[ ## criteria ## ]]
{{ criteria }}

[[ ## conversation (all turns with roles) ## ]]
{{ input }}

[[ ## agent response (final turn) ## ]]
{{ output_text }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
You are evaluating whether an AI assistant maintains its assigned persona and
role consistently across a multi-turn conversation.

Below you will see the full conversation with role annotations (system, user,
assistant). The conversation history contains prior turns, and the agent
response is the final assistant turn. Evaluate all assistant turns across the
entire conversation. Number assistant turns sequentially starting from Turn 1.

Consider the system prompt, if present, as the role definition. For each
assistant turn, assess:
- Does the assistant stay in character?
- Is the tone consistent with the assigned role?
- Does the assistant avoid behavior inconsistent with its role?

Score each assistant turn:
- 1.0 if the turn fully adheres to the assigned role and tone
- 0.5 if the turn partially adheres but shows inconsistency
- 0.0 if the turn breaks character or contradicts the assigned role

In your `assertions`, include specific turn-level checks.
In your `details`, return:
- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
- `consistent_turns`: count of turns scored 1.0
- `total_turns`: total number of assistant turns evaluated

Your overall `score` should be the average of per-turn scores.

[[ ## criteria ## ]]
{{ criteria }}

[[ ## conversation (all turns with roles) ## ]]
{{ input }}

[[ ## agent response (final turn) ## ]]
{{ output_text }}
13 changes: 11 additions & 2 deletions packages/core/src/evaluation/evaluators/llm-grader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ const freeformEvaluationSchema = z.object({
)
.describe('Per-aspect evaluation results — one entry per aspect checked')
.optional(),
details: z
.record(z.unknown())
.describe('Optional structured metadata for domain-specific metrics')
.optional(),
});

const rubricCheckResultSchema = z.object({
Expand Down Expand Up @@ -261,6 +265,7 @@ export class LlmGraderEvaluator implements Evaluator {
expectedAspectCount: Math.max(assertions.length, 1),
evaluatorRawRequest,
graderTarget: graderProvider.targetName,
details: data.details as JsonObject | undefined,
tokenUsage,
};
} catch (e: unknown) {
Expand Down Expand Up @@ -784,7 +789,10 @@ export class LlmGraderEvaluator implements Evaluator {
expectedAspectCount: Math.max(assertions.length, 1),
evaluatorRawRequest,
graderTarget,
details,
details:
data.details && Object.keys(data.details).length > 0
? ({ ...details, ...data.details } as JsonObject)
: details,
};
} catch {
return {
Expand Down Expand Up @@ -992,7 +1000,8 @@ export function buildOutputSchema(): string {
' "passed": <boolean>,',
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
' }',
' ]',
' ],',
' "details": {<optional object with domain-specific structured metrics>}',
'}',
].join('\n');
}
Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/evaluation/evaluators/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ export interface EvaluationScore {
readonly expectedAspectCount: number;
readonly evaluatorRawRequest?: JsonObject;
readonly scores?: readonly ChildEvaluatorResult[];
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
Expand All @@ -76,7 +76,7 @@ export interface ChildEvaluatorResult {
readonly assertions: readonly import('../types.js').AssertionEntry[];
readonly evaluatorRawRequest?: JsonObject;
readonly scores?: readonly ChildEvaluatorResult[];
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
Expand Down
Loading