Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,18 @@ OPENAI_MODEL=gpt-5-mini
GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
GEMINI_MODEL_NAME=gemini-3-flash-preview

# OpenRouter
OPENROUTER_API_KEY=your-openrouter-api-key-here
OPENROUTER_MODEL=openai/gpt-5.4-mini

# Anthropic
ANTHROPIC_API_KEY=your-anthropic-api-key-here

# Codex / agent providers
# Quote values if the path contains spaces.
CODEX_WORKSPACE_DIR=./.agentv/codex-workspaces
CODEX_LOG_DIR=./.agentv/logs/codex
PI_CLI_PATH=pi

# CLI provider sample (used by the local_cli target)
CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
7 changes: 3 additions & 4 deletions examples/features/.agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ targets:
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
# version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: uncomment to override default (2024-12-01-preview)

- name: openai
provider: openai
Expand All @@ -23,7 +22,7 @@ targets:

- name: codex
provider: codex
grader_target: azure-llm
grader_target: gemini-llm
# Uses the Codex CLI (defaults to `codex` on PATH)
# executable: ${{ CODEX_CLI_PATH }} # Optional: override executable path
# args: # Optional additional CLI arguments
Expand Down Expand Up @@ -65,13 +64,13 @@ targets:
- name: copilot
provider: copilot
model: gpt-5-mini
grader_target: azure-llm
grader_target: gemini-llm
log_format: json

# Claude - Anthropic's Claude Agent SDK
- name: claude
provider: claude
grader_target: azure-llm
grader_target: gemini-llm
# model: claude-sonnet-4-20250514 # Optional: override model
log_format: json # 'summary' (default) or 'json' for raw event logs
# system_prompt: optional override (default instructs agent to include code in response)
33 changes: 33 additions & 0 deletions examples/features/multi-turn-conversation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Multi-Turn Conversation Evaluation

Demonstrates evaluating multi-turn conversation quality using composable
`llm-grader` prompt templates with per-turn score breakdowns.

## What this shows

1. Multi-turn input with 4+ user/assistant turns where context retention matters
2. Conversation-aware grader prompts that receive the full `{{ input }}` message array
3. Per-turn score breakdown via structured `details`
4. Composability: multiple `llm-grader` evaluators combined with deterministic assertions

## Grader dimensions

| Grader | What it evaluates |
|-------|-------------------|
| `context-retention.md` | Does the agent remember information from earlier turns? |
| `conversation-relevancy.md` | Are responses relevant to the current request and conversation? |
| `role-adherence.md` | Does the agent maintain its assigned persona? |

## Running

```bash
bun apps/cli/src/cli.ts eval examples/features/multi-turn-conversation/evals/dataset.eval.yaml
```

## Creating your own conversation evaluator

1. Create a markdown file in `graders/`
2. Use `{{ input }}` to receive the full conversation message array with roles
3. Use `{{ criteria }}` for the test-specific evaluation criteria
4. Instruct the grader to return `details` with per-turn metrics when useful
5. Reference it in your YAML with `type: llm-grader` and `prompt: ./graders/your-grader.md`
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","eval_set":"dataset","score":0.8,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"The assistant addresses Sarah by name and references order #98765 while handling the shipping issue."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Each assistant turn directly addresses the user's immediate request","passed":true,"evidence":"The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a professional and empathetic support persona","passed":true,"evidence":"The tone remains consistent with the system instructions across all turns."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","assertions":[{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}],"assertions":[{"text":"Turn 1 retains customer identity and order context","passed":true,"evidence":"context_retention: The assistant addresses Sarah by name and references order #98765 while handling the shipping issue. | conversation_relevancy: The responses stay on topic, move the support flow forward, and answer the shipping questions without tangents. | role_adherence: The tone remains consistent with the system instructions across all turns."},{"text":"Turn 2 retains delivery urgency and prior support context","passed":true},{"text":"Final turn retains shipping choice and gift-wrapping request","passed":true},{"text":"Each assistant turn directly addresses the user's immediate request","passed":true},{"text":"Assistant maintains a professional and empathetic support persona","passed":true},{"text":"Output contains #98765","passed":false,"evidence":"The final response omitted the order number."}]}
{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"context_retention","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom."}],"details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3}},{"name":"conversation_relevancy","type":"llm-grader","score":1,"weight":2,"verdict":"pass","assertions":[{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true,"evidence":"The responses remain focused on diagnosing the WiFi issue and next-step guidance."}],"details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3}},{"name":"role_adherence","type":"llm-grader","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true,"evidence":"The tone stays patient and the instructions remain simple across the conversation."}],"details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3}}],"assertions":[{"text":"Assistant remembers the router restart and avoids repeating it","passed":true,"evidence":"context_retention: The troubleshooting flow explicitly skips the already-attempted restart and builds on the orange light symptom. | conversation_relevancy: The responses remain focused on diagnosing the WiFi issue and next-step guidance. | role_adherence: The tone stays patient and the instructions remain simple across the conversation."},{"text":"Assistant keeps each response targeted to the current troubleshooting step","passed":true},{"text":"Assistant maintains a beginner-friendly technical support persona","passed":true}]}
125 changes: 125 additions & 0 deletions examples/features/multi-turn-conversation/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Multi-turn conversation evaluation example
# Demonstrates conversation-level grading with composable llm-grader prompts
# and structured per-turn score breakdowns in the `details` field.

description: Multi-turn conversation evaluation with per-turn score breakdown

execution:
target: default

tests:
- id: support-context-retention
criteria: |-
Agent maintains context across all turns: remembers customer name (Sarah),
order number (#98765), and Friday delivery deadline. Provides relevant,
helpful responses. Maintains professional support persona throughout.

input:
- role: system
content: |-
You are a customer support agent for ShopFast. Be professional, empathetic,
and solution-oriented. Always address customers by name once introduced.
- role: user
content: |-
Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.
My order number is #98765.
- role: assistant
content: |-
Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up
order #98765 for you right away. I can see it was shipped on March 3rd via
standard delivery. It appears there's been a delay at the regional sorting
facility. Would you like me to check on expedited options?
- role: user
content: |-
Yes please. I really need it by Friday - it's a birthday gift for my daughter.
- role: assistant
content: |-
I completely understand the urgency, Sarah. A birthday gift definitely can't
be late! I've flagged order #98765 for priority handling. I can offer you two
options: (1) upgrade to express shipping at no extra cost, estimated delivery
Thursday, or (2) reroute to our nearest pickup location where it could be
available as early as Wednesday.
- role: user
content: |-
The express shipping sounds great. Also, can you add gift wrapping?

expected_output:
- role: assistant
content: |-
Absolutely, Sarah! I've upgraded order #98765 to express shipping - it should
arrive by Thursday, well before Friday. I've also added complimentary gift
wrapping since this is a special occasion. You'll receive a tracking update
shortly. Is there anything else I can help with for your daughter's birthday?

assertions:
- name: context_retention
type: llm-grader
prompt: ../graders/context-retention.md
required: true
- name: conversation_relevancy
type: llm-grader
prompt: ../graders/conversation-relevancy.md
weight: 2
- name: role_adherence
type: llm-grader
prompt: ../graders/role-adherence.md
- type: contains
value: "#98765"

- id: support-troubleshooting-flow
criteria: |-
Agent tracks problem context (WiFi connectivity), steps already attempted
(router restart), and user's stated technical comfort level (beginner)
across turns. Does not repeat already-tried solutions. Adjusts explanation
complexity to match the user's level.

input:
- role: system
content: |-
You are a technical support agent for NetConnect ISP. Adapt your
explanations to the customer's technical level. Never suggest steps
the customer has already tried.
- role: user
content: |-
My WiFi keeps dropping every 10 minutes. I already restarted my router
twice today. I'm not very tech-savvy so please keep it simple.
- role: assistant
content: |-
I'm sorry you're dealing with that - WiFi dropping out is really
frustrating! Since you've already restarted the router, let's skip that
step. Could you tell me: does the light on the front of your router stay
solid green, or does it blink or turn a different color when the WiFi drops?
- role: user
content: |-
It blinks orange for a few seconds then goes back to green.
- role: assistant
content: |-
That orange blinking usually means the router is briefly losing its
connection to our network. Let's try something simple: can you find the
small cable plugged into the back of your router labeled "WAN" or
"Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly.
Sometimes it just needs a better connection.
- role: user
content: |-
OK I did that. What if it happens again?

expected_output:
- role: assistant
content: |-
Great, let's see if that helps! If the WiFi drops again in the next
30 minutes, it might be an issue on our end. In that case, I'll schedule
a technician visit at a time that works for you - no charge. Just let me
know if it happens again.

assertions:
- name: context_retention
type: llm-grader
prompt: ../graders/context-retention.md
required: true
- name: conversation_relevancy
type: llm-grader
prompt: ../graders/conversation-relevancy.md
weight: 2
- name: role_adherence
type: llm-grader
prompt: ../graders/role-adherence.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
You are evaluating whether an AI assistant retains context from earlier turns
in a multi-turn conversation.

Below you will see the full conversation with role annotations (system, user,
assistant). The conversation history contains prior turns, and the agent
response is the final assistant turn. Evaluate all assistant turns across the
entire conversation. Number assistant turns sequentially starting from Turn 1.

For each assistant turn, check whether the assistant correctly references or
builds on information introduced in previous turns, such as names, numbers,
constraints, and preferences.

Score each assistant turn:
- 1.0 if the turn demonstrates awareness of relevant earlier context
- 0.5 if the turn partially retains context and forgets some details
- 0.0 if the turn ignores or contradicts earlier context

In your `assertions`, include specific turn-level checks.
In your `details`, return:
- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
- `relevant_turns`: count of turns that demonstrated context retention
- `total_turns`: total number of assistant turns evaluated

Your overall `score` should be the average of per-turn scores.

[[ ## criteria ## ]]
{{ criteria }}

[[ ## conversation (all turns with roles) ## ]]
{{ input }}

[[ ## agent response (final turn) ## ]]
{{ output_text }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
You are evaluating whether each assistant response in a multi-turn conversation
is relevant to the user's current request and the broader conversation context.

Below you will see the full conversation with role annotations (system, user,
assistant). The conversation history contains prior turns, and the agent
response is the final assistant turn. Evaluate all assistant turns across the
entire conversation. Number assistant turns sequentially starting from Turn 1.

For each assistant turn, assess:
- Does the response directly address what the user asked?
- Is the response appropriate given the full conversation history?
- Does the assistant avoid tangential or off-topic information?

Score each assistant turn:
- 1.0 if the response is fully relevant to the user's request and context
- 0.5 if the response is partially relevant but includes unnecessary information
- 0.0 if the response is off-topic or fails to address the user's request

In your `assertions`, include specific turn-level checks.
In your `details`, return:
- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
- `on_topic_turns`: count of turns scored 1.0
- `total_turns`: total number of assistant turns evaluated

Your overall `score` should be the average of per-turn scores.

[[ ## criteria ## ]]
{{ criteria }}

[[ ## conversation (all turns with roles) ## ]]
{{ input }}

[[ ## agent response (final turn) ## ]]
{{ output_text }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
You are evaluating whether an AI assistant maintains its assigned persona and
role consistently across a multi-turn conversation.

Below you will see the full conversation with role annotations (system, user,
assistant). The conversation history contains prior turns, and the agent
response is the final assistant turn. Evaluate all assistant turns across the
entire conversation. Number assistant turns sequentially starting from Turn 1.

Consider the system prompt, if present, as the role definition. For each
assistant turn, assess:
- Does the assistant stay in character?
- Is the tone consistent with the assigned role?
- Does the assistant avoid behavior inconsistent with its role?

Score each assistant turn:
- 1.0 if the turn fully adheres to the assigned role and tone
- 0.5 if the turn partially adheres but shows inconsistency
- 0.0 if the turn breaks character or contradicts the assigned role

In your `assertions`, include specific turn-level checks.
In your `details`, return:
- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
- `consistent_turns`: count of turns scored 1.0
- `total_turns`: total number of assistant turns evaluated

Your overall `score` should be the average of per-turn scores.

[[ ## criteria ## ]]
{{ criteria }}

[[ ## conversation (all turns with roles) ## ]]
{{ input }}

[[ ## agent response (final turn) ## ]]
{{ output_text }}
13 changes: 11 additions & 2 deletions packages/core/src/evaluation/evaluators/llm-grader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ const freeformEvaluationSchema = z.object({
)
.describe('Per-aspect evaluation results — one entry per aspect checked')
.optional(),
details: z
.record(z.unknown())
.describe('Optional structured metadata for domain-specific metrics')
.optional(),
});

const rubricCheckResultSchema = z.object({
Expand Down Expand Up @@ -261,6 +265,7 @@ export class LlmGraderEvaluator implements Evaluator {
expectedAspectCount: Math.max(assertions.length, 1),
evaluatorRawRequest,
graderTarget: graderProvider.targetName,
details: data.details as JsonObject | undefined,
tokenUsage,
};
} catch (e: unknown) {
Expand Down Expand Up @@ -784,7 +789,10 @@ export class LlmGraderEvaluator implements Evaluator {
expectedAspectCount: Math.max(assertions.length, 1),
evaluatorRawRequest,
graderTarget,
details,
details:
data.details && Object.keys(data.details).length > 0
? ({ ...details, ...data.details } as JsonObject)
: details,
};
} catch {
return {
Expand Down Expand Up @@ -992,7 +1000,8 @@ export function buildOutputSchema(): string {
' "passed": <boolean>,',
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
' }',
' ]',
' ],',
' "details": {<optional object with domain-specific structured metrics>}',
'}',
].join('\n');
}
Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/evaluation/evaluators/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ export interface EvaluationScore {
readonly expectedAspectCount: number;
readonly evaluatorRawRequest?: JsonObject;
readonly scores?: readonly ChildEvaluatorResult[];
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
Expand All @@ -76,7 +76,7 @@ export interface ChildEvaluatorResult {
readonly assertions: readonly import('../types.js').AssertionEntry[];
readonly evaluatorRawRequest?: JsonObject;
readonly scores?: readonly ChildEvaluatorResult[];
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
/** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
readonly details?: JsonObject;
/** Token usage from LLM calls made by this evaluator (optional). */
readonly tokenUsage?: TokenUsage;
Expand Down
Loading