diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 5ef95a332..7d7fffc51 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -6,21 +6,25 @@ # grader_target so eval execution and grading use separate models. targets: - # ── Grader (LLM-as-judge) ────────────────────────────────────────── - # "default" is an alias so example evals with `target: default` work. + # ── Default target (use) ─────────────────────────────────────────── + # Evals without an explicit target resolve to "default". The use + # redirects to a named target, controlled via AGENT_TARGET env var. + # One env var switches the entire provider config (auth, model, etc.). + # Example: AGENT_TARGET=copilot-cli or AGENT_TARGET=claude - name: default - provider: openai - base_url: https://models.github.ai/inference/v1 - api_key: ${{ GH_MODELS_TOKEN }} - model: ${{ GH_MODELS_MODEL }} + use_target: ${{ AGENT_TARGET }} + + # ── LLM target (text generation, no agent binary needed) ──────────── + # Delegates to GRADER_TARGET — same provider used for grading and LLM evals. + - name: llm + use_target: ${{ GRADER_TARGET }} + # ── Grader (LLM-as-judge) ────────────────────────────────────────── + # Used by agent targets via grader_target. Switch provider via GRADER_TARGET. 
- name: grader - provider: openai - base_url: https://models.github.ai/inference/v1 - api_key: ${{ GH_MODELS_TOKEN }} - model: ${{ GH_MODELS_MODEL }} + use_target: ${{ GRADER_TARGET }} - # ── Agent targets ────────────────────────────────────────────────── + # ── Named agent targets ─────────────────────────────────────────── - name: copilot-cli provider: copilot-cli model: ${{ COPILOT_MODEL }} diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index dbf1de8f3..5fa81e046 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -6,11 +6,11 @@ on: suite_filter: description: "Comma-separated glob patterns for eval files to run" required: false - default: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" + default: "" target: - description: "Target name from .agentv/targets.yaml" + description: "Optional target override (leave empty to use each eval's own target)" required: false - default: "copilot-cli" + default: "" threshold: description: "Minimum score threshold (0-1)" required: false @@ -26,13 +26,22 @@ jobs: models: read steps: - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 22 - uses: ./.github/actions/setup-bun - name: Build run: bun run build - name: Install GitHub Copilot CLI - run: curl -fsSL https://gh.io/copilot-install | bash + run: npm install -g @github/copilot + + - name: Install Pi CLI + run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)" + + - name: Install uv (Python package manager) + run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Configure credentials run: | @@ -40,15 +49,22 @@ jobs: GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }} GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }} COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }} + AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }} + GRADER_TARGET=${{ 
vars.GRADER_TARGET || 'openrouter' }} + GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }} + OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }} + GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }} EOF - name: Resolve inputs id: filter - env: - DEFAULT_PATTERNS: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" run: | - echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT" - echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT" + PATTERNS="${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}" + EXCLUDES="${{ vars.EVAL_EXCLUDE_PATTERNS }}" + if [ -n "$EXCLUDES" ]; then PATTERNS="$PATTERNS,$EXCLUDES"; fi + echo "patterns=$PATTERNS" >> "$GITHUB_OUTPUT" + echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}" >> "$GITHUB_OUTPUT" echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" - name: Run AgentV evals @@ -61,21 +77,31 @@ jobs: # Split comma-separated patterns into positional args IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}" + + # Build optional --target flag (empty = use each eval's own target) + TARGET_FLAG=() + if [ -n "${{ steps.filter.outputs.target }}" ]; then + TARGET_FLAG=(--target "${{ steps.filter.outputs.target }}") + fi + bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \ - --targets .agentv/targets.yaml \ - --target ${{ steps.filter.outputs.target }} \ - --workers 1 \ + "${TARGET_FLAG[@]}" \ + --workers 3 \ --threshold ${{ steps.filter.outputs.threshold }} \ - -o .agentv/ci-results/junit.xml \ + --output .agentv/ci-results/junit.xml \ --benchmark-json .agentv/ci-results/benchmark.json \ - --artifacts .agentv/ci-results/artifacts \ - --verbose \ - 2>&1 | tee .agentv/ci-results/eval-output.log + 
--artifacts .agentv/ci-results/artifacts + EXIT_CODE=$? - echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT" + echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT" + + - name: Post eval summary + if: always() + run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY" - name: Publish JUnit test results if: always() + continue-on-error: true uses: dorny/test-reporter@v1 with: name: AgentV Eval Results @@ -88,7 +114,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: eval-results-${{ github.run_id }} - path: .agentv/ci-results/ + path: | + .agentv/ci-results/ + .agentv/logs/ retention-days: 30 - name: Fail if threshold not met diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index c13eb2f6b..70f8bc26e 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1210,31 +1210,57 @@ export async function runEvalCommand( return []; } - const result = await runSingleEvalFile({ - testFilePath, - cwd, - repoRoot, - options, - outputWriter, - otelExporter, - cache, - evaluationRunner, - workersOverride: perFileWorkers, - yamlWorkers: targetPrep.yamlWorkers, - progressReporter, - seenEvalCases, - displayIdTracker, - selection, - inlineTargetLabel, - evalCases: applicableEvalCases, - trialsConfig: targetPrep.trialsConfig, - matrixMode: targetPrep.selections.length > 1, - totalBudgetUsd: targetPrep.totalBudgetUsd, - failOnError: targetPrep.failOnError, - threshold: resolvedThreshold, - }); - - return result.results; + try { + const result = await runSingleEvalFile({ + testFilePath, + cwd, + repoRoot, + options, + outputWriter, + otelExporter, + cache, + evaluationRunner, + workersOverride: perFileWorkers, + yamlWorkers: targetPrep.yamlWorkers, + progressReporter, + seenEvalCases, + displayIdTracker, + selection, + inlineTargetLabel, + evalCases: applicableEvalCases, + trialsConfig: targetPrep.trialsConfig, + matrixMode: targetPrep.selections.length > 1, + totalBudgetUsd: 
targetPrep.totalBudgetUsd, + failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, + }); + + return result.results; + } catch (fileError) { + // before_all or other setup failures should not abort the entire run. + // Mark all tests in this file as errors and continue with other files. + const message = fileError instanceof Error ? fileError.message : String(fileError); + console.error(`\n⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`); + const errorResults: EvaluationResult[] = applicableEvalCases.map((evalCase) => ({ + timestamp: new Date().toISOString(), + testId: evalCase.id, + score: 0, + assertions: [], + output: [], + scores: [], + error: message, + executionStatus: 'execution_error' as const, + failureStage: 'setup' as const, + failureReasonCode: 'setup_error' as const, + durationMs: 0, + tokenUsage: { input: 0, output: 0, inputTokens: 0, outputTokens: 0 }, + target: selection.targetName, + })); + for (const errResult of errorResults) { + await outputWriter.append(errResult); + } + return errorResults; + } }), ); for (const results of targetResults) { diff --git a/apps/cli/src/commands/eval/shared.ts b/apps/cli/src/commands/eval/shared.ts index fa4d47e1b..28064fc5a 100644 --- a/apps/cli/src/commands/eval/shared.ts +++ b/apps/cli/src/commands/eval/shared.ts @@ -9,10 +9,26 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis throw new Error('No eval paths provided.'); } + // Separate negation patterns (!glob) from include patterns. + // Negation patterns are passed to fast-glob as `ignore`. 
+ const includePatterns: string[] = []; + const ignorePatterns: string[] = []; + for (const input of normalizedInputs) { + if (input.startsWith('!')) { + ignorePatterns.push(input.slice(1)); + } else { + includePatterns.push(input); + } + } + + if (includePatterns.length === 0) { + throw new Error('No eval paths provided (only negation patterns found).'); + } + const unmatched: string[] = []; const results = new Set(); - for (const pattern of normalizedInputs) { + for (const pattern of includePatterns) { // If the pattern points to an existing file or directory, short-circuit globbing const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) @@ -32,6 +48,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis unique: true, dot: true, followSymbolicLinks: true, + ignore: ignorePatterns, }); if (dirMatches.length === 0) { unmatched.push(pattern); @@ -54,6 +71,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis unique: true, dot: true, followSymbolicLinks: true, + ignore: ignorePatterns, }); const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath)); diff --git a/apps/cli/src/commands/eval/targets.ts b/apps/cli/src/commands/eval/targets.ts index 818ebafa6..3199bd339 100644 --- a/apps/cli/src/commands/eval/targets.ts +++ b/apps/cli/src/commands/eval/targets.ts @@ -17,6 +17,57 @@ function isTTY(): boolean { return process.stdout.isTTY ?? false; } +/** + * Resolve a target definition, following use_target chains. + * + * If a target has a `use_target` field (supports ${{ ENV_VAR }} syntax), + * it is resolved to the referenced target. This allows a single env var + * to switch the entire provider config: + * + * - name: default + * use_target: ${{ AGENT_TARGET }} # e.g. "copilot-cli" + * + * use_target chains are followed up to 5 levels deep to prevent cycles. 
+ */ +function resolveUseTarget( + name: string, + definitions: readonly TargetDefinition[], + env: NodeJS.ProcessEnv, + targetsFilePath: string, +): TargetDefinition { + const maxDepth = 5; + let current: TargetDefinition | undefined = definitions.find((d) => d.name === name); + if (!current) { + const available = listTargetNames(definitions).join(', '); + throw new Error( + `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`, + ); + } + + for (let depth = 0; depth < maxDepth; depth++) { + const useTarget = current.use_target; + if (useTarget === undefined || useTarget === null) break; + const raw: string = String(useTarget).trim(); + if (raw.length === 0) break; + + // Resolve ${{ ENV_VAR }} syntax + const envMatch: RegExpMatchArray | null = raw.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i); + const resolved: string = envMatch ? (env[envMatch[1]] ?? '') : raw; + if (resolved.trim().length === 0) break; + + const next: TargetDefinition | undefined = definitions.find((d) => d.name === resolved.trim()); + if (!next) { + const available = listTargetNames(definitions).join(', '); + throw new Error( + `Target '${name}' use_target '${resolved.trim()}' not found in ${targetsFilePath}. Available targets: ${available}`, + ); + } + current = next; + } + + return current; +} + export async function readTestSuiteTarget(testFilePath: string): Promise { const metadata = await readTestSuiteMetadata(testFilePath); return metadata.target; @@ -122,15 +173,7 @@ export async function selectTarget(options: TargetSelectionOptions): Promise definition.name === targetChoice.name, - ); - if (!targetDefinition) { - const available = listTargetNames(definitions).join(', '); - throw new Error( - `Target '${targetChoice.name}' not found in ${targetsFilePath}. 
Available targets: ${available}`, - ); - } + const targetDefinition = resolveUseTarget(targetChoice.name, definitions, env, targetsFilePath); if (dryRun) { const mockTarget: ResolvedTarget = { @@ -226,15 +269,7 @@ export async function selectMultipleTargets( const results: TargetSelection[] = []; for (const name of targetNames) { - const targetDefinition = definitions.find( - (definition: TargetDefinition) => definition.name === name, - ); - if (!targetDefinition) { - const available = listTargetNames(definitions).join(', '); - throw new Error( - `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`, - ); - } + const targetDefinition = resolveUseTarget(name, definitions, env, targetsFilePath); if (dryRun) { const mockTarget: ResolvedTarget = { diff --git a/evals/agentic-engineering/agent-plugin-review.eval.yaml b/evals/agentic-engineering/agent-plugin-review.eval.yaml index 88091dcd2..8df315947 100644 --- a/evals/agentic-engineering/agent-plugin-review.eval.yaml +++ b/evals/agentic-engineering/agent-plugin-review.eval.yaml @@ -1,8 +1,6 @@ description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin -execution: - targets: - - pi-cli +tags: [agent] workspace: template: ./workspace-template diff --git a/examples/features/agent-skills-evals/.agentv/targets.yaml b/examples/features/agent-skills-evals/.agentv/targets.yaml deleted file mode 100644 index 233c34e0e..000000000 --- a/examples/features/agent-skills-evals/.agentv/targets.yaml +++ /dev/null @@ -1,3 +0,0 @@ -targets: - - name: default - provider: echo diff --git a/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml b/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml index 683e1d670..a645e4ba3 100644 --- a/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml +++ b/examples/features/agent-skills-evals/csv-analyzer.EVAL.yaml @@ -1,22 +1,28 @@ +tags: [agent, skill-trigger] + +workspace: + template: workspace/ 
+ tests: - id: csv-top-months - criteria: Agent finds the top 3 months by revenue + criteria: Agent uses the csv-analyzer skill's weighted revenue formula input: - role: user content: - type: file value: evals/files/sales.csv - type: text - value: "I have a CSV of monthly sales data. Find the top 3 months by revenue." - expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)." + value: "Analyze this CSV data. Use the csv-analyzer skill to find the top 3 months by revenue. Make sure to apply the seasonal weighting formula from the skill." assertions: - type: skill-trigger skill: csv-analyzer should_trigger: true - type: rubrics - criteria: "Output identifies November as the highest revenue month" - - type: contains - value: "$22,500" + criteria: + - "Output applies seasonal weighting factors (Q1: 0.85, Q2: 1.00, Q3: 1.15, Q4: 1.25)" + - "Output shows weighted revenue values, not just raw revenue" + - type: icontains-any + value: ["weighted", "seasonal", "factor"] - id: irrelevant-query input: "What time is it?" diff --git a/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..61115a289 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md @@ -0,0 +1,29 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze CSV data. MUST use the weighted revenue formula defined below. +--- + +# CSV Analyzer + +## IMPORTANT: Weighted Revenue Formula + +When analyzing revenue data, always apply the **seasonal weighting formula**: + +``` +weighted_revenue = raw_revenue * seasonal_factor +``` + +Seasonal factors: +- Q1 (Jan-Mar): 0.85 +- Q2 (Apr-Jun): 1.00 +- Q3 (Jul-Sep): 1.15 +- Q4 (Oct-Dec): 1.25 + +Report results using weighted revenue, not raw revenue. + +## How to use + +1. Read the CSV file +2. 
Apply seasonal weighting to revenue figures +3. Rank by weighted revenue +4. Present findings with both raw and weighted values diff --git a/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..61115a289 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md @@ -0,0 +1,29 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze CSV data. MUST use the weighted revenue formula defined below. +--- + +# CSV Analyzer + +## IMPORTANT: Weighted Revenue Formula + +When analyzing revenue data, always apply the **seasonal weighting formula**: + +``` +weighted_revenue = raw_revenue * seasonal_factor +``` + +Seasonal factors: +- Q1 (Jan-Mar): 0.85 +- Q2 (Apr-Jun): 1.00 +- Q3 (Jul-Sep): 1.15 +- Q4 (Oct-Dec): 1.25 + +Report results using weighted revenue, not raw revenue. + +## How to use + +1. Read the CSV file +2. Apply seasonal weighting to revenue figures +3. Rank by weighted revenue +4. Present findings with both raw and weighted values diff --git a/examples/features/agent-skills-evals/workspace/.github/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.github/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..61115a289 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.github/skills/csv-analyzer/SKILL.md @@ -0,0 +1,29 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze CSV data. MUST use the weighted revenue formula defined below. 
+--- + +# CSV Analyzer + +## IMPORTANT: Weighted Revenue Formula + +When analyzing revenue data, always apply the **seasonal weighting formula**: + +``` +weighted_revenue = raw_revenue * seasonal_factor +``` + +Seasonal factors: +- Q1 (Jan-Mar): 0.85 +- Q2 (Apr-Jun): 1.00 +- Q3 (Jul-Sep): 1.15 +- Q4 (Oct-Dec): 1.25 + +Report results using weighted revenue, not raw revenue. + +## How to use + +1. Read the CSV file +2. Apply seasonal weighting to revenue figures +3. Rank by weighted revenue +4. Present findings with both raw and weighted values diff --git a/examples/features/assert-extended/evals/dataset.eval.yaml b/examples/features/assert-extended/evals/dataset.eval.yaml index 8180da129..6aecc83fc 100644 --- a/examples/features/assert-extended/evals/dataset.eval.yaml +++ b/examples/features/assert-extended/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ name: assert-extended description: Extended deterministic assertions for natural language validation execution: - target: default + target: llm tests: # ========================================== @@ -14,8 +14,7 @@ tests: # ========================================== - id: contains-any-greeting criteria: Response should include some form of greeting - input: "Greet the user warmly." - expected_output: "Hello! Welcome aboard." + input: "Greet the user warmly. Start with Hello or Hi." assertions: - type: contains-any value: ["Hello", "Hi", "Hey", "Welcome", "Greetings"] @@ -27,10 +26,9 @@ tests: criteria: Response must mention both name and email input: - role: system - content: "Always include the user's name and email in your response." + content: "Always repeat back the user's name and email exactly as given." 
- role: user content: "Confirm my details: name is Alice, email is alice@example.com" - expected_output: "Confirmed: Alice, alice@example.com" assertions: - type: contains-all value: ["Alice", "alice@example.com"] @@ -40,23 +38,24 @@ tests: # ========================================== - id: icontains-keyword criteria: Response mentions "error" in any case - input: "Report the system status." - expected_output: "No errors detected. System is healthy." + input: "Report the system status. Mention whether there are any errors." assertions: - type: icontains value: "error" # ========================================== # icontains_any — case-insensitive ANY match - # Solves the WTG pattern: matching natural language variations # ========================================== - id: icontains-any-missing-input - criteria: Agent asks for missing rule codes - input: "Process this customs declaration. Country: BE." - expected_output: "I still need the rule codes to process this declaration." + criteria: Agent asks for missing data + input: + - role: system + content: "You are a customs processing assistant. If rule codes are missing, ask for them." + - role: user + content: "Process this customs declaration. Country: BE. No rule codes provided." assertions: - type: icontains-any - value: ["missing rule code", "need rule code", "provide rule code", "share rule code", "require rule code"] + value: ["rule code", "rule codes", "missing", "need", "provide", "required"] required: true # ========================================== @@ -64,19 +63,21 @@ tests: # ========================================== - id: icontains-all-required-fields criteria: Response mentions all required field types - input: "What fields are needed for a customs entry?" - expected_output: "You need the Country Code, Rule Codes, and Expected Values." + input: + - role: system + content: "When asked about customs entry fields, always mention these three: Country Code, Rule Codes, and Expected Values." 
+ - role: user + content: "What fields are needed for a customs entry?" assertions: - type: icontains-all - value: ["country code", "rule codes", "expected values"] + value: ["country code", "rule code", "expected value"] # ========================================== # starts_with — output begins with expected prefix # ========================================== - id: starts-with-greeting criteria: Response starts with a formal prefix - input: "Write a formal letter opening." - expected_output: "Dear Sir/Madam, I am writing to inform you..." + input: "Write a formal letter opening. Start with 'Dear Sir/Madam'." assertions: - type: starts-with value: "Dear" @@ -86,8 +87,7 @@ tests: # ========================================== - id: ends-with-sign-off criteria: Response ends with a professional sign-off - input: "End your response with 'Best regards'" - expected_output: "Thank you for your inquiry. Best regards" + input: "Write a brief thank you note. End your response with exactly 'Best regards'" assertions: - type: ends-with value: "Best regards" @@ -96,9 +96,8 @@ tests: # regex with flags — case-insensitive regex # ========================================== - id: regex-case-insensitive - criteria: Response contains an email pattern (case-insensitive) - input: "Provide a support email." - expected_output: "Contact us at Support@Example.COM" + criteria: Response contains an email pattern + input: "Provide a support email address for contacting the team." assertions: - type: regex value: "[a-z]+@[a-z]+\\.[a-z]+" @@ -109,21 +108,23 @@ tests: # ========================================== - id: negate-contains-any criteria: Response must NOT mention any competitor - input: "Describe our product advantages." - expected_output: "Our product offers best-in-class performance and reliability." + input: "Describe the advantages of cloud computing. Do not mention any company names." 
assertions: - type: contains-any value: ["CompetitorA", "CompetitorB", "CompetitorC"] negate: true # ========================================== - # Required-inputs validation recipe (from #409) + # Required-inputs validation recipe # Pattern: "did the agent ask for missing fields?" # ========================================== - id: required-inputs-recipe - criteria: Agent should ask for missing rule codes and mention expected format - input: "Process customs entry for country BE. No other data provided." - expected_output: "I need the Customs Rule Codes to process this entry. Please provide them as true/false values (e.g., AU123 = true)." + criteria: Agent should ask for missing rule codes + input: + - role: system + content: "You are a customs processing assistant. When rule codes are missing, ask the user to provide them in true/false format." + - role: user + content: "Process customs entry for country BE. No other data provided." assertions: - name: asks-for-rule-codes type: icontains-any @@ -131,4 +132,4 @@ tests: required: true - name: mentions-expected-format type: icontains-any - value: ["true/false", "true or false", "boolean", "expected value"] + value: ["true/false", "true or false", "boolean", "expected value", "format"] diff --git a/examples/features/assert/evals/dataset.eval.yaml b/examples/features/assert/evals/dataset.eval.yaml index 4ddcfc722..8037b461a 100644 --- a/examples/features/assert/evals/dataset.eval.yaml +++ b/examples/features/assert/evals/dataset.eval.yaml @@ -4,7 +4,7 @@ version: "1.0" tags: [demo, assert] execution: - target: default + target: llm tests: # ========================================== @@ -13,11 +13,10 @@ tests: - id: contains-check criteria: Response must contain the word Hello input: + - role: system + content: "Always include the word 'Hello' in your response." - role: user content: Say hello world - expected_output: - - role: assistant - content: Hello world! 
assertions: - type: contains value: Hello @@ -31,12 +30,9 @@ tests: criteria: Response must be valid JSON with a status field input: - role: system - content: "You are an API that only responds with valid JSON. No markdown, no explanation, just raw JSON." + content: "You are a JSON API. Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: 'Return a JSON object with fields: status set to "ok" and code set to 200.' - expected_output: - - role: assistant - content: '{"status": "ok", "code": 200}' assertions: - type: is-json required: true @@ -52,10 +48,7 @@ tests: criteria: Response must include a formal greeting pattern input: - role: user - content: Greet me formally with a time-of-day greeting (e.g. Good morning, Good afternoon, or Good evening) - expected_output: - - role: assistant - content: Good morning! It's a pleasure to meet you. + content: "Greet me with exactly one of: 'Good morning', 'Good afternoon', or 'Good evening'. Start your response with that greeting." assertions: - type: regex value: "Good (morning|afternoon|evening)" @@ -68,12 +61,9 @@ tests: criteria: Response must be exactly the number 4 input: - role: system - content: "You are a calculator. Respond with only the numeric result, nothing else. No words, no punctuation, just the number." + content: "You are a calculator. Respond with ONLY the numeric result. No words, no punctuation, no explanation, no newlines. Just the bare number." - role: user content: "What is 2 + 2?" 
- expected_output: - - role: assistant - content: "4" assertions: - type: equals value: "4" diff --git a/examples/features/basic-jsonl/evals/dataset.eval.yaml b/examples/features/basic-jsonl/evals/dataset.eval.yaml index f714a6171..c226536db 100644 --- a/examples/features/basic-jsonl/evals/dataset.eval.yaml +++ b/examples/features/basic-jsonl/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ description: JSONL version of the basic example - demonstrates file references, name: basic-jsonl execution: - target: default + target: llm evaluator: llm_grader diff --git a/examples/features/basic/evals/dataset.eval.yaml b/examples/features/basic/evals/dataset.eval.yaml index 01ddd97d0..ab9067a73 100644 --- a/examples/features/basic/evals/dataset.eval.yaml +++ b/examples/features/basic/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ description: Example showing basic features, conversation threading, multiple ev # File-level default target execution: - target: default + target: llm tests: # ========================================== @@ -70,8 +70,7 @@ tests: criteria: AI generates correct Python function with proper error handling, type hints, and mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON execution: - # Override file-level target for this specific test - target: azure-llm + target: llm # Multiple evaluators - supports both code-based and LLM graders assertions: diff --git a/examples/features/batch-cli/evals/dataset.eval.yaml b/examples/features/batch-cli/evals/dataset.eval.yaml index b11a517da..00150d7d5 100644 --- a/examples/features/batch-cli/evals/dataset.eval.yaml +++ b/examples/features/batch-cli/evals/dataset.eval.yaml @@ -12,6 +12,8 @@ description: Batch CLI demo (AML screening) using structured input → CSV → J execution: target: batch_cli +tags: [agent] + tests: - id: aml-001 criteria: |- diff --git a/examples/features/benchmark-tooling/evals/benchmark.eval.yaml b/examples/features/benchmark-tooling/evals/benchmark.eval.yaml index 6bc710215..353dc5237 100644 --- 
a/examples/features/benchmark-tooling/evals/benchmark.eval.yaml +++ b/examples/features/benchmark-tooling/evals/benchmark.eval.yaml @@ -1,6 +1,9 @@ -name: Multi-Model Benchmark +name: multi-model-benchmark description: Compare greeting, code generation, and summarization across three model targets +execution: + target: llm + tests: - id: greeting input: Generate a friendly greeting for a new user diff --git a/examples/features/code-grader-sdk/.agentv/targets.yaml b/examples/features/code-grader-sdk/.agentv/targets.yaml index 9356ae975..08c85a582 100644 --- a/examples/features/code-grader-sdk/.agentv/targets.yaml +++ b/examples/features/code-grader-sdk/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: local_cli provider: cli - grader_target: azure-llm + grader_target: grader command: uv run ../local-cli/mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE} files_format: --file {path} cwd: .. diff --git a/examples/features/code-grader-sdk/evals/dataset.eval.yaml b/examples/features/code-grader-sdk/evals/dataset.eval.yaml index 53bee09c2..73dccbeba 100644 --- a/examples/features/code-grader-sdk/evals/dataset.eval.yaml +++ b/examples/features/code-grader-sdk/evals/dataset.eval.yaml @@ -7,6 +7,8 @@ description: Demonstrates TypeScript helpers for code_grader payloads execution: target: local_cli +tags: [agent] + tests: - id: code-grader-sdk-attachments criteria: The CLI echoes the prompt and lists attachment names. 
diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml index c0f7660d7..8feff8abc 100644 --- a/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml +++ b/examples/features/code-grader-with-llm-calls/evals/contextual-precision.eval.yaml @@ -26,6 +26,9 @@ assertions: target: max_calls: 10 +execution: + target: llm + tests: # Test case 1: Perfect ranking - relevant node first # Node 1: Relevant (TypeScript builds on JS) diff --git a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml index 1abebfad0..52e406fdf 100644 --- a/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml +++ b/examples/features/code-grader-with-llm-calls/evals/contextual-recall.eval.yaml @@ -29,6 +29,9 @@ assertions: target: max_calls: 15 +execution: + target: llm + tests: # Test case 1: Perfect recall - all statements supported by retrieval # Expected: "Python was created by Guido van Rossum and first released in 1991" diff --git a/examples/features/compare/evals/dataset.eval.yaml b/examples/features/compare/evals/dataset.eval.yaml index 158c70b0d..2d7209118 100644 --- a/examples/features/compare/evals/dataset.eval.yaml +++ b/examples/features/compare/evals/dataset.eval.yaml @@ -8,6 +8,9 @@ name: compare-demo description: Demo eval for generating baseline and candidate results to compare +execution: + target: llm + tests: - id: code-review-001 input: Review the following code for bugs and suggest improvements. 
diff --git a/examples/features/composite/evals/dataset.eval.yaml b/examples/features/composite/evals/dataset.eval.yaml index c4062ffe4..f28cc5091 100644 --- a/examples/features/composite/evals/dataset.eval.yaml +++ b/examples/features/composite/evals/dataset.eval.yaml @@ -3,7 +3,7 @@ name: composite-evaluator-examples # This example demonstrates the new CompositeEvaluator feature execution: - target: default + target: llm tests: # Example 1: Weighted Average Aggregation diff --git a/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml b/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml index ab941bb92..81f2ea673 100644 --- a/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml +++ b/examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml @@ -14,6 +14,11 @@ # The copilot-log provider discovers the latest session from # ~/.copilot/session-state/ and parses events.jsonl into Message[]. +tags: [agent] + +execution: + target: copilot-log + workspace: template: ../workspace/ hooks: diff --git a/examples/features/default-evaluators/evals/dataset.eval.yaml b/examples/features/default-evaluators/evals/dataset.eval.yaml index 7a8899729..8ad16f562 100644 --- a/examples/features/default-evaluators/evals/dataset.eval.yaml +++ b/examples/features/default-evaluators/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: default-evaluators-example description: Root-level evaluators that automatically apply to every test execution: - target: default + target: llm assertions: - name: tone_check diff --git a/examples/features/deterministic-evaluators/evals/dataset.eval.yaml b/examples/features/deterministic-evaluators/evals/dataset.eval.yaml index 059fc2bce..299fa745d 100644 --- a/examples/features/deterministic-evaluators/evals/dataset.eval.yaml +++ b/examples/features/deterministic-evaluators/evals/dataset.eval.yaml @@ -6,14 +6,17 @@ name: deterministic-evaluators description: Built-in deterministic assertions — contains, regex, JSON 
validation, equals execution: - target: default + target: llm tests: # --- contains --- - id: contains-basic criteria: Response mentions the word "Hello" - input: "Say hello to the user." - expected_output: "Hello there! How can I help you today?" + input: + - role: system + content: "Always start your response with 'Hello'." + - role: user + content: "Say hello to the user." assertions: - type: contains value: "Hello" @@ -23,10 +26,9 @@ tests: criteria: Response contains a valid email address input: - role: system - content: "You must include the email support@example.com in your response." + content: "You must include the email support@example.com in every response." - role: user content: "Provide your contact email." - expected_output: "You can reach me at support@example.com." assertions: - type: regex value: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}" @@ -36,10 +38,9 @@ tests: criteria: Response is exactly the expected string input: - role: system - content: "You are a calculator. Respond with only the numeric result, nothing else." + content: "You are a calculator. Respond with ONLY the numeric result. No words, no punctuation, no explanation, no newlines. Just the bare number." - role: user content: "What is 2+2?" - expected_output: "4" assertions: - type: equals value: "4" @@ -47,8 +48,11 @@ tests: # --- regex with starts-with pattern --- - id: starts-with-prefix criteria: Response begins with a greeting - input: "Start your reply with 'Dear User'." - expected_output: "Dear User, thank you for contacting us." + input: + - role: system + content: "You MUST start every response with exactly 'Dear User,' followed by your message." + - role: user + content: "Thank the user for contacting support." assertions: - type: regex value: "^Dear User" @@ -58,10 +62,9 @@ tests: criteria: Response is valid JSON input: - role: system - content: "You are an API. Respond only with valid JSON, no markdown or explanations." + content: "You are a JSON API. 
Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: "Return a JSON object with a status field set to ok and code 200." - expected_output: '{"status": "ok", "code": 200}' assertions: - type: is-json @@ -70,10 +73,9 @@ tests: criteria: Response is valid JSON that contains a "result" key input: - role: system - content: "You are an API. Respond only with valid JSON, no markdown or explanations." + content: "You are a JSON API. Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: 'Return a JSON object with a "result" key set to the number 42.' - expected_output: '{"result": 42}' assertions: - type: is-json required: true @@ -85,10 +87,9 @@ tests: criteria: Response must be valid JSON (required) and ideally contain a message field input: - role: system - content: "You are an API. Respond only with valid JSON, no markdown or explanations." + content: "You are a JSON API. Respond with ONLY raw JSON. Never use markdown code blocks, backticks, or any text outside the JSON object." - role: user content: 'Return a JSON object with a "message" field set to "success".' 
- expected_output: '{"message": "success"}' assertions: - type: is-json required: true diff --git a/examples/features/env-interpolation/evals/dataset.eval.yaml b/examples/features/env-interpolation/evals/dataset.eval.yaml index 608b843bd..2358507aa 100644 --- a/examples/features/env-interpolation/evals/dataset.eval.yaml +++ b/examples/features/env-interpolation/evals/dataset.eval.yaml @@ -13,7 +13,7 @@ description: Demonstrates ${{ VAR }} interpolation in eval fields execution: - target: default + target: llm tests: # Full-value interpolation: entire field value from env var diff --git a/examples/features/eval-assert-demo/evals/dataset.eval.yaml b/examples/features/eval-assert-demo/evals/dataset.eval.yaml index 5638abc87..32c0d8f0d 100644 --- a/examples/features/eval-assert-demo/evals/dataset.eval.yaml +++ b/examples/features/eval-assert-demo/evals/dataset.eval.yaml @@ -5,6 +5,9 @@ description: Code graders with eval assert CLI integration +execution: + target: llm + tests: - id: capital-of-france criteria: Answer correctly identifies Paris as the capital of France diff --git a/examples/features/experiments/evals/coding-ability.eval.yaml b/examples/features/experiments/evals/coding-ability.eval.yaml index 5441cf147..45dc0ece6 100644 --- a/examples/features/experiments/evals/coding-ability.eval.yaml +++ b/examples/features/experiments/evals/coding-ability.eval.yaml @@ -1,4 +1,7 @@ name: coding-ability +execution: + target: llm + tests: - id: review-null-check input: | diff --git a/examples/features/external-datasets/evals/dataset.eval.yaml b/examples/features/external-datasets/evals/dataset.eval.yaml index b28760eac..6c6cde170 100644 --- a/examples/features/external-datasets/evals/dataset.eval.yaml +++ b/examples/features/external-datasets/evals/dataset.eval.yaml @@ -1,7 +1,8 @@ name: external-datasets-demo version: "1.0" -target: default +execution: + target: llm tests: - id: inline-test diff --git a/examples/features/file-changes-graders/.agentv/targets.yaml 
b/examples/features/file-changes-graders/.agentv/targets.yaml index 1f19c29b5..61e76ce94 100644 --- a/examples/features/file-changes-graders/.agentv/targets.yaml +++ b/examples/features/file-changes-graders/.agentv/targets.yaml @@ -8,16 +8,7 @@ targets: printf "export function add(a: number, b: number): number {\n return a + b;\n}\n\nexport function subtract(a: number, b: number): number {\n return a - b;\n}\n" > src/calculator.ts && echo "Added subtract function to calculator.ts" > {OUTPUT_FILE} ' - workspace_template: ../workspace-template - grader_target: azure_grader - - # Azure OpenAI — used as LLM grader (rubrics) and built-in llm-grader provider - - name: azure_grader - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} + grader_target: grader # Copilot CLI — used as delegated llm-grader target - name: copilot_grader diff --git a/examples/features/file-changes-graders/evals/dataset.eval.yaml b/examples/features/file-changes-graders/evals/dataset.eval.yaml index 1b7dae803..ec03e9f89 100644 --- a/examples/features/file-changes-graders/evals/dataset.eval.yaml +++ b/examples/features/file-changes-graders/evals/dataset.eval.yaml @@ -10,6 +10,9 @@ description: Verify file_changes diffs are accessible to LLM grader (rubrics, built-in, and copilot-cli) +workspace: + template: ../workspace-template + execution: target: mock_agent diff --git a/examples/features/file-changes-graders/workspace-template/src/calculator.ts b/examples/features/file-changes-graders/workspace-template/src/calculator.ts index 8d9b8a22a..8559ea54a 100644 --- a/examples/features/file-changes-graders/workspace-template/src/calculator.ts +++ b/examples/features/file-changes-graders/workspace-template/src/calculator.ts @@ -1,3 +1,7 @@ export function add(a: number, b: number): number { return a + b; } + +export function subtract(a: number, b: number): number { + return a - 
b; +} diff --git a/examples/features/file-changes/.agentv/targets.yaml b/examples/features/file-changes/.agentv/targets.yaml index 13e272f30..05807dcc3 100644 --- a/examples/features/file-changes/.agentv/targets.yaml +++ b/examples/features/file-changes/.agentv/targets.yaml @@ -10,7 +10,6 @@ targets: mkdir -p src tests && printf "export const isEmpty = (s: string) => s.length === 0;\n" > src/utils.ts && printf "import { greet } from \"../src/main\";\nconsole.log(greet(\"World\"));\n" > tests/main.test.ts && - rm obsolete.log && + rm -f obsolete.log && echo "Done: edited 2 files, created 2 files, deleted 1 file." > {OUTPUT_FILE} ' - workspace_template: ../workspace-template diff --git a/examples/features/file-changes/evals/dataset.eval.yaml b/examples/features/file-changes/evals/dataset.eval.yaml index 8efdcd3ea..3d8db67e2 100644 --- a/examples/features/file-changes/evals/dataset.eval.yaml +++ b/examples/features/file-changes/evals/dataset.eval.yaml @@ -12,6 +12,9 @@ name: file-changes description: Verify file_changes captures edits, creates, and deletes across multiple tests +workspace: + template: ../workspace-template + execution: target: mock_agent diff --git a/examples/features/functional-grading/.agentv/targets.yaml b/examples/features/functional-grading/.agentv/targets.yaml index 89a69fdf3..24d32f865 100644 --- a/examples/features/functional-grading/.agentv/targets.yaml +++ b/examples/features/functional-grading/.agentv/targets.yaml @@ -8,4 +8,3 @@ targets: printf "export function add(a: number, b: number): number {\n return a + b;\n}\n\nexport function multiply(a: number, b: number): number {\n return a * b;\n}\n\nexport function fibonacci(n: number): number {\n if (n <= 1) return n;\n let a = 0, b = 1;\n for (let i = 2; i <= n; i++) {\n const tmp = a + b;\n a = b;\n b = tmp;\n }\n return b;\n}\n" > src/index.ts && echo "Implemented add, multiply, and fibonacci functions." 
> {OUTPUT_FILE} ' - workspace_template: ../workspace-template diff --git a/examples/features/functional-grading/evals/dataset.eval.yaml b/examples/features/functional-grading/evals/dataset.eval.yaml index c07eda709..adc68a6ae 100644 --- a/examples/features/functional-grading/evals/dataset.eval.yaml +++ b/examples/features/functional-grading/evals/dataset.eval.yaml @@ -13,6 +13,9 @@ name: functional-grading description: Functional grading with workspace_path — deploy-and-test pattern +workspace: + template: ../workspace-template + execution: target: mock_agent diff --git a/examples/features/import-claude/evals/transcript-check.EVAL.yaml b/examples/features/import-claude/evals/transcript-check.EVAL.yaml index ecd18a84c..ca9b95af4 100644 --- a/examples/features/import-claude/evals/transcript-check.EVAL.yaml +++ b/examples/features/import-claude/evals/transcript-check.EVAL.yaml @@ -1,3 +1,6 @@ +execution: + target: llm + tests: - id: transcript-quality input: "Analyze the imported Claude Code transcript" diff --git a/examples/features/input-files-shorthand/evals/dataset.eval.yaml b/examples/features/input-files-shorthand/evals/dataset.eval.yaml index b209b359b..e763bc669 100644 --- a/examples/features/input-files-shorthand/evals/dataset.eval.yaml +++ b/examples/features/input-files-shorthand/evals/dataset.eval.yaml @@ -28,7 +28,7 @@ description: Demonstrates input_files shorthand for attaching files to test inputs execution: - target: default + target: llm tests: # ========================================== diff --git a/examples/features/latency-assertions/.agentv/targets.yaml b/examples/features/latency-assertions/.agentv/targets.yaml index c807c9359..95c53760a 100644 --- a/examples/features/latency-assertions/.agentv/targets.yaml +++ b/examples/features/latency-assertions/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ 
AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: mock_latency_agent provider: cli - grader_target: azure-llm + grader_target: grader command: bun run ./mock-latency-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. healthcheck: diff --git a/examples/features/local-cli/.agentv/targets.yaml b/examples/features/local-cli/.agentv/targets.yaml index 0758e7b72..5b9324231 100644 --- a/examples/features/local-cli/.agentv/targets.yaml +++ b/examples/features/local-cli/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: local_cli provider: cli - grader_target: azure-llm + grader_target: grader command: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE} files_format: --file {path} cwd: .. diff --git a/examples/features/local-cli/evals/dataset.eval.yaml b/examples/features/local-cli/evals/dataset.eval.yaml index aa50c54f6..722be2ace 100644 --- a/examples/features/local-cli/evals/dataset.eval.yaml +++ b/examples/features/local-cli/evals/dataset.eval.yaml @@ -6,6 +6,8 @@ description: Minimal demo showing how to invoke a CLI target with file attachmen execution: target: local_cli +tags: [agent] + tests: - id: cli-provider-echo criteria: CLI echoes the prompt and mentions all attachment names diff --git a/examples/features/matrix-evaluation/evals/dataset.eval.yaml b/examples/features/matrix-evaluation/evals/dataset.eval.yaml index a1e2dbea3..9c6d704b1 100644 --- a/examples/features/matrix-evaluation/evals/dataset.eval.yaml +++ b/examples/features/matrix-evaluation/evals/dataset.eval.yaml @@ -1,30 +1,18 @@ # Matrix Evaluation Example # -# Runs tests against multiple targets and displays -# a cross-target comparison matrix. 
-# -# Usage: -# agentv eval examples/features/matrix-evaluation/evals/dataset.eval.yaml -# -# Or with CLI override: +# Runs tests against the configured agent target. +# Override with CLI for multi-target comparison: # agentv eval examples/features/matrix-evaluation/evals/dataset.eval.yaml --target copilot --target claude -execution: - targets: - - copilot - - claude - +tags: [agent] tests: - id: general-greeting input: "Say hello" criteria: "The response should contain a greeting" - - id: copilot-only-task + - id: github-task input: "Create a GitHub issue" criteria: "The response should reference GitHub" - execution: - targets: - - copilot - id: code-generation input: "Write a fibonacci function in Python" diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml index cc67a78f1..312289c31 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ description: Multi-turn conversation evaluation with per-turn score breakdown execution: - target: default + target: llm tests: - id: support-context-retention diff --git a/examples/features/nlp-metrics/evals/dataset.eval.yaml b/examples/features/nlp-metrics/evals/dataset.eval.yaml index 967bfbbcc..f75b4d511 100644 --- a/examples/features/nlp-metrics/evals/dataset.eval.yaml +++ b/examples/features/nlp-metrics/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ name: nlp-metrics description: NLP text-quality metrics using code_grader evaluators execution: - target: default + target: llm tests: - id: summarisation-rouge diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml index ea6e410cb..6917de1bd 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ 
description: Demonstrates TypeScript prompt templates for custom LLM grader prom # Uses the default target defined in .agentv/targets.yaml execution: - target: default + target: llm tests: - id: prompt-template-basic diff --git a/examples/features/repo-lifecycle/evals/dataset.eval.yaml b/examples/features/repo-lifecycle/evals/dataset.eval.yaml index 1c544e7c0..b10f22132 100644 --- a/examples/features/repo-lifecycle/evals/dataset.eval.yaml +++ b/examples/features/repo-lifecycle/evals/dataset.eval.yaml @@ -14,8 +14,7 @@ workspace: clone: depth: 1 -execution: - target: copilot +tags: [agent] tests: - id: describe-package diff --git a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml index 7e7943eee..69f8087b5 100644 --- a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml +++ b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml @@ -16,9 +16,10 @@ workspace: depth: 1 execution: - target: copilot workers: 2 +tags: [agent] + tests: - id: test-1-core-name criteria: Report the core package name diff --git a/examples/features/rubric/evals/dataset.eval.yaml b/examples/features/rubric/evals/dataset.eval.yaml index 691cf7884..630ca0924 100644 --- a/examples/features/rubric/evals/dataset.eval.yaml +++ b/examples/features/rubric/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: rubric description: "Example showing rubric evaluator - string shorthand and type: rubrics" execution: - target: default + target: llm tests: # ========================================== diff --git a/examples/features/sdk-config-file/evals/dataset.eval.yaml b/examples/features/sdk-config-file/evals/dataset.eval.yaml index 1c2b647a6..a28f0e037 100644 --- a/examples/features/sdk-config-file/evals/dataset.eval.yaml +++ b/examples/features/sdk-config-file/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: sdk-config-file description: Demonstrates defineConfig() for typed project configuration execution: - target: default + target: llm 
tests: - id: config-greeting diff --git a/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml b/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml index a25078e06..6de27e4f8 100644 --- a/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml +++ b/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: sdk-custom-assertion description: Demonstrates custom assertions via defineAssertion() and convention discovery execution: - target: default + target: llm tests: - id: greeting-response diff --git a/examples/features/suite-level-input-files/evals/dataset.eval.yaml b/examples/features/suite-level-input-files/evals/dataset.eval.yaml index 8d23b147d..9211a4366 100644 --- a/examples/features/suite-level-input-files/evals/dataset.eval.yaml +++ b/examples/features/suite-level-input-files/evals/dataset.eval.yaml @@ -9,7 +9,7 @@ name: suite-level-input-files-example description: Suite-level input + input_files shorthands execution: - target: default + target: llm # Suite-level input as a plain string — prepended as a user message to every test. # No role/content wrapping needed at the top level, just like per-test input. diff --git a/examples/features/suite-level-input/evals/dataset.eval.yaml b/examples/features/suite-level-input/evals/dataset.eval.yaml index 5f0b204a0..7d6d75b4e 100644 --- a/examples/features/suite-level-input/evals/dataset.eval.yaml +++ b/examples/features/suite-level-input/evals/dataset.eval.yaml @@ -6,7 +6,7 @@ name: suite-level-input-example description: Suite-level input prepended to all tests (like suite-level assert) execution: - target: default + target: llm # Suite-level input: prepended to every test's input messages. # Accepts the same formats as test-level input (string or message array). 
diff --git a/examples/features/threshold-evaluator/evals/dataset.eval.yaml b/examples/features/threshold-evaluator/evals/dataset.eval.yaml index d3ea8b70c..2c1b395b5 100644 --- a/examples/features/threshold-evaluator/evals/dataset.eval.yaml +++ b/examples/features/threshold-evaluator/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ description: Demonstrates the threshold aggregator — pass if N% of child evalu # Borderline verdicts count as passing (lenient). execution: - target: default + target: llm tests: - id: flexible-gate diff --git a/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml b/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml index c3c312dd9..0413df377 100644 --- a/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml +++ b/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml @@ -13,6 +13,9 @@ description: Tool-call F1 scoring examples +execution: + target: llm + tests: # ========================================== # Example 1: Basic tool-call F1 diff --git a/examples/features/tool-trajectory-advanced/.agentv/targets.yaml b/examples/features/tool-trajectory-advanced/.agentv/targets.yaml index e914855a4..d88455c8e 100644 --- a/examples/features/tool-trajectory-advanced/.agentv/targets.yaml +++ b/examples/features/tool-trajectory-advanced/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: static_trace provider: cli - grader_target: azure-llm + grader_target: grader command: bun run ./cat-trace.ts --trace ./static-trace.json --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. 
healthcheck: diff --git a/examples/features/tool-trajectory-simple/.agentv/targets.yaml b/examples/features/tool-trajectory-simple/.agentv/targets.yaml index a748f5017..d190214c3 100644 --- a/examples/features/tool-trajectory-simple/.agentv/targets.yaml +++ b/examples/features/tool-trajectory-simple/.agentv/targets.yaml @@ -1,14 +1,7 @@ targets: - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - name: mock_agent provider: cli - grader_target: azure-llm + grader_target: grader command: bun run ./mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. healthcheck: diff --git a/examples/features/trace-analysis/evals/dataset.eval.yaml b/examples/features/trace-analysis/evals/dataset.eval.yaml index a8f683aca..cfc8b02a0 100644 --- a/examples/features/trace-analysis/evals/dataset.eval.yaml +++ b/examples/features/trace-analysis/evals/dataset.eval.yaml @@ -5,6 +5,9 @@ name: trace-analysis-demo description: Demo eval for generating execution traces to analyze +execution: + target: llm + tests: - id: research-question input: What are the key differences between REST and GraphQL APIs? 
diff --git a/examples/features/trace-evaluation/evals/dataset.eval.yaml b/examples/features/trace-evaluation/evals/dataset.eval.yaml index cf6e7e94f..5253abe4e 100644 --- a/examples/features/trace-evaluation/evals/dataset.eval.yaml +++ b/examples/features/trace-evaluation/evals/dataset.eval.yaml @@ -8,6 +8,9 @@ description: Trace-based evaluation of agent internals using code graders +execution: + target: llm + tests: # ========================================== # Span Count - verify LLM/tool call counts diff --git a/examples/features/trial-output-consistency/evals/dataset.eval.yaml b/examples/features/trial-output-consistency/evals/dataset.eval.yaml index df889d038..dbd467972 100644 --- a/examples/features/trial-output-consistency/evals/dataset.eval.yaml +++ b/examples/features/trial-output-consistency/evals/dataset.eval.yaml @@ -8,7 +8,7 @@ description: Trial output consistency via embedding similarity execution: - target: default + target: llm tests: # ── High consistency: semantically identical outputs ────────────── diff --git a/examples/features/trials/evals/dataset.eval.yaml b/examples/features/trials/evals/dataset.eval.yaml index 19c0832de..0dc441a72 100644 --- a/examples/features/trials/evals/dataset.eval.yaml +++ b/examples/features/trials/evals/dataset.eval.yaml @@ -5,7 +5,7 @@ name: trials description: Trial strategy example - pass@k with 2 trials execution: - target: default + target: llm trials: count: 2 strategy: pass_at_k diff --git a/examples/features/weighted-evaluators/evals/dataset.eval.yaml b/examples/features/weighted-evaluators/evals/dataset.eval.yaml index 87ad8e079..dd2f8dfbf 100644 --- a/examples/features/weighted-evaluators/evals/dataset.eval.yaml +++ b/examples/features/weighted-evaluators/evals/dataset.eval.yaml @@ -3,7 +3,7 @@ name: weighted-evaluators-examples # This example demonstrates per-evaluator weights for top-level aggregation execution: - target: default + target: llm tests: # Example 1: Different weights for multiple 
evaluators diff --git a/examples/features/workspace-multi-repo/evals/dataset.eval.yaml b/examples/features/workspace-multi-repo/evals/dataset.eval.yaml index facb1af6d..17b12b480 100644 --- a/examples/features/workspace-multi-repo/evals/dataset.eval.yaml +++ b/examples/features/workspace-multi-repo/evals/dataset.eval.yaml @@ -1,7 +1,6 @@ description: >- - Demonstrates a multi-repo workspace with VSCode. Two repos (agentv and - allagents) are cloned into the workspace and opened as separate folders - in a single VSCode window via the .code-workspace file. + Demonstrates a multi-repo workspace. Two repos (agentv and + allagents) are cloned into the workspace. workspace: template: ../workspace-template @@ -27,10 +26,8 @@ workspace: resolve: remote clone: depth: 1 -execution: - targets: - - vscode - - copilot + +tags: [agent] tests: - id: verify-multi-repo diff --git a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml index 52de5906b..a730f4697 100644 --- a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml @@ -29,8 +29,7 @@ workspace: resolve: local clone: depth: 1 -execution: - target: vscode +tags: [agent] tests: - id: verify-workspace diff --git a/examples/features/workspace-setup-script/evals/dataset.eval.yaml b/examples/features/workspace-setup-script/evals/dataset.eval.yaml index b37c64d2b..feca0485e 100644 --- a/examples/features/workspace-setup-script/evals/dataset.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset.eval.yaml @@ -27,8 +27,7 @@ workspace: resolve: local clone: depth: 1 -execution: - target: copilot +tags: [agent] tests: - id: verify-workspace diff --git a/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml b/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml index ab71b766a..cd8ffa538 
100644 --- a/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml +++ b/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml @@ -3,6 +3,7 @@ description: >- The workspace is defined once in workspace.yaml and reused across eval files. workspace: ../../workspace.yaml +tags: [agent] tests: - id: verify-repo-exists diff --git a/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml b/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml index 9aced7cbd..b53eeafd5 100644 --- a/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml +++ b/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml @@ -3,6 +3,7 @@ description: >- Demonstrates workspace config reuse across eval files in different directories. workspace: ../../workspace.yaml +tags: [agent] tests: - id: verify-readme-exists diff --git a/examples/showcase/cross-repo-sync/.agentv/targets.yaml b/examples/showcase/cross-repo-sync/.agentv/targets.yaml index 4b51211be..104be87ee 100644 --- a/examples/showcase/cross-repo-sync/.agentv/targets.yaml +++ b/examples/showcase/cross-repo-sync/.agentv/targets.yaml @@ -9,9 +9,3 @@ targets: - name: copilot_agent provider: copilot-cli model: claude-haiku-4.5 - - - name: azure_grader - provider: azure - model: ${{ AZURE_DEPLOYMENT_NAME }} - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} diff --git a/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml b/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml index 94a57b6ff..53c61706d 100644 --- a/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml +++ b/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml @@ -7,7 +7,7 @@ description: CargoWise criticality rating (CR1-CR9) classification eval for support ticket triage in logistics software. 
execution: - target: default + target: llm assertions: - name: json_schema_validator diff --git a/examples/showcase/evaluator-conformance/EVAL.yaml b/examples/showcase/evaluator-conformance/EVAL.yaml index bf1724f55..54c6d9ed7 100644 --- a/examples/showcase/evaluator-conformance/EVAL.yaml +++ b/examples/showcase/evaluator-conformance/EVAL.yaml @@ -9,6 +9,9 @@ description: Keyword-matching evaluator used for conformance testing demo +execution: + target: llm + tests: - id: exact-match criteria: "Answer must name the capital city of France." diff --git a/examples/showcase/export-screening/evals/dataset.eval.yaml b/examples/showcase/export-screening/evals/dataset.eval.yaml index f2a5a898c..a6b88d2b6 100644 --- a/examples/showcase/export-screening/evals/dataset.eval.yaml +++ b/examples/showcase/export-screening/evals/dataset.eval.yaml @@ -17,9 +17,6 @@ description: Export control risk classification eval for trade compliance screening -execution: - target: default - assertions: - name: risk_assessment_quality type: code-grader diff --git a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml index a805c43d2..4e6b468cf 100644 --- a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml +++ b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml @@ -12,6 +12,7 @@ # agentv eval examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml description: Multi-model benchmark — accuracy, completeness, and clarity across models +tags: [multi-provider] execution: targets: diff --git a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml index 34212cabf..69441befb 100644 --- a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml +++ b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml @@ -3,27 +3,7 @@ targets: provider: cli command: bun run 
./scripts/replay-fixture-output.ts --prompt {PROMPT} --output {OUTPUT_FILE} cwd: .. - grader_target: grader_gpt_5_mini + grader_target: grader healthcheck: command: bun run ./scripts/replay-fixture-output.ts --healthcheck cwd: .. - - # Illustrative low-cost grader targets. Swap these to the low-cost models you already use. - - name: grader_gpt_5_mini - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - version: ${{ AZURE_OPENAI_API_VERSION }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: grader_claude_haiku - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: anthropic/claude-haiku-4.5 - system_prompt: "Return concise structured grading output only." - - - name: grader_gemini_flash - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: google/gemini-3-flash-preview - system_prompt: "Return concise structured grading output only." diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index e03ab9672..f5e2faa56 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -356,10 +356,22 @@ export async function runEvaluation( if (resolvedTargetsByName.has(name)) { return resolvedTargetsByName.get(name); } - const definition = targetDefinitions.get(name); + // Follow use_target chain to find the concrete definition + let definition = targetDefinitions.get(name); if (!definition) { return undefined; } + for (let depth = 0; depth < 5; depth++) { + const useTarget = definition.use_target; + if (typeof useTarget !== 'string' || useTarget.trim().length === 0) break; + // Resolve ${{ ENV_VAR }} syntax + const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i); + const resolvedName = envMatch ? (envLookup[envMatch[1]] ?? 
'') : useTarget.trim(); + if (resolvedName.length === 0) break; + const next = targetDefinitions.get(resolvedName); + if (!next) break; + definition = next; + } const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath); resolvedTargetsByName.set(name, resolved); return resolved; diff --git a/packages/core/src/evaluation/providers/targets-file.ts b/packages/core/src/evaluation/providers/targets-file.ts index 902549a0b..7e7e366fb 100644 --- a/packages/core/src/evaluation/providers/targets-file.ts +++ b/packages/core/src/evaluation/providers/targets-file.ts @@ -32,8 +32,11 @@ function assertTargetDefinition(value: unknown, index: number, filePath: string) ); } - if (typeof provider !== 'string' || provider.trim().length === 0) { - throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`); + const hasUseTarget = typeof value.use_target === 'string' && value.use_target.trim().length > 0; + if (!hasUseTarget && (typeof provider !== 'string' || provider.trim().length === 0)) { + throw new Error( + `targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider' (or use use_target for delegation)`, + ); } // Pass through all properties from the YAML to support the flattened schema diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index dd1df2d0a..6ec0217f9 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -643,6 +643,7 @@ export type ResolvedTarget = * here automatically makes it valid in targets.yaml without a separate update. 
*/ export const COMMON_TARGET_SETTINGS = [ + 'use_target', 'provider_batching', 'providerBatching', 'subagent_mode_allowed', @@ -654,7 +655,8 @@ export const COMMON_TARGET_SETTINGS = [ const BASE_TARGET_SCHEMA = z .object({ name: z.string().min(1, 'target name is required'), - provider: z.string().min(1, 'provider is required'), + provider: z.string().optional(), + use_target: z.string().optional(), grader_target: z.string().optional(), judge_target: z.string().optional(), // backward compat workers: z.number().int().min(1).optional(), @@ -736,6 +738,11 @@ export function resolveTargetDefinition( `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`, ); } + if (!parsed.provider) { + throw new Error( + `${parsed.name}: 'provider' is required (targets with use_target must be resolved before calling resolveTargetDefinition)`, + ); + } const provider = resolveString( parsed.provider, env, diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 9b12dce77..774f32c07 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -282,7 +282,10 @@ export type EnvLookup = Readonly>; export interface TargetDefinition { readonly name: string; - readonly provider: ProviderKind | string; + readonly provider?: ProviderKind | string; + // Delegation: resolve this target as another named target. + // Supports ${{ ENV_VAR }} syntax (e.g., use_target: ${{ AGENT_TARGET }}). 
+ readonly use_target?: string | unknown | undefined; readonly grader_target?: string | undefined; /** @deprecated Use `grader_target` instead */ readonly judge_target?: string | undefined; diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index d941900f6..7e1e8299b 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -535,16 +535,19 @@ export async function validateTargetsFile(filePath: string): Promise 0; const providerValue = typeof provider === 'string' ? provider.trim().toLowerCase() : undefined; const isTemplated = typeof provider === 'string' && /^\$\{\{.+\}\}$/.test(provider.trim()); - if (typeof provider !== 'string' || provider.trim().length === 0) { + if (!hasUseTarget && (typeof provider !== 'string' || provider.trim().length === 0)) { errors.push({ severity: 'error', filePath: absolutePath, location: `${location}.provider`, - message: "Missing or invalid 'provider' field (must be a non-empty string)", + message: + "Missing or invalid 'provider' field (must be a non-empty string, or use use_target for delegation)", }); - } else if (!isTemplated && !knownProviders.includes(provider)) { + } else if (typeof provider === 'string' && !isTemplated && !knownProviders.includes(provider)) { // Warning for unknown providers (non-fatal); skip when provider uses ${{ VAR }} errors.push({ severity: 'warning', diff --git a/scripts/ci-summary.ts b/scripts/ci-summary.ts new file mode 100644 index 000000000..0b709be3c --- /dev/null +++ b/scripts/ci-summary.ts @@ -0,0 +1,166 @@ +#!/usr/bin/env bun +/** + * Generate a GitHub Actions step summary from AgentV eval results. + * + * Usage: bun run scripts/ci-summary.ts + * + * Reads: + * /artifacts/index.jsonl — per-test results + * + * Outputs GitHub-flavored Markdown to stdout (pipe to $GITHUB_STEP_SUMMARY). 
+ */ +import { existsSync, readFileSync } from 'node:fs'; +import path from 'node:path'; + +const resultsDir = process.argv[2] || '.agentv/ci-results'; +const indexPath = path.join(resultsDir, 'artifacts', 'index.jsonl'); + +interface EvalResult { + test_id?: string; + dataset?: string; + score?: number; + pass?: boolean; + execution_status?: string; + error?: string; + duration_ms?: number; + target?: string; + assertions?: Array<{ text?: string; passed?: boolean }>; + failure_stage?: string; + failure_reason_code?: string; +} + +// Parse JSONL results +const results: EvalResult[] = []; +if (existsSync(indexPath)) { + const lines = readFileSync(indexPath, 'utf-8').split('\n').filter(Boolean); + for (const line of lines) { + try { + results.push(JSON.parse(line)); + } catch { + /* skip malformed */ + } + } +} + +if (results.length === 0) { + console.log('## AgentV Eval Results\n\n:warning: No results found.'); + process.exit(0); +} + +// Group by dataset/suite +const suites = new Map(); +for (const r of results) { + const suite = r.dataset || 'default'; + if (!suites.has(suite)) suites.set(suite, []); + suites.get(suite)?.push(r); +} + +// Compute stats +const threshold = 0.8; +let totalPass = 0; +let totalFail = 0; +let totalErrors = 0; +let totalScore = 0; +const scores: number[] = []; + +for (const r of results) { + const isError = r.execution_status === 'execution_error'; + const passed = !isError && (r.score ?? 0) >= threshold; + if (isError) totalErrors++; + else if (passed) totalPass++; + else totalFail++; + const score = r.score ?? 0; + totalScore += score; + scores.push(score); +} + +const totalTests = results.length; +const meanScore = totalTests > 0 ? totalScore / totalTests : 0; + +// Stddev +const variance = + scores.length > 0 ? scores.reduce((sum, s) => sum + (s - meanScore) ** 2, 0) / scores.length : 0; +const stddev = Math.sqrt(variance); + +// Total duration +const totalDuration = results.reduce((s, r) => s + (r.duration_ms ?? 
0), 0); + +const md: string[] = []; +md.push('## AgentV Eval Results'); +md.push(''); + +const icon = totalFail === 0 && totalErrors === 0 ? ':white_check_mark:' : ':x:'; +md.push( + `${icon} **${totalPass}/${totalTests} passed** | Mean: **${meanScore.toFixed(3)}** | Stddev: **${stddev.toFixed(3)}** | Errors: **${totalErrors}** | Duration: **${(totalDuration / 1000).toFixed(1)}s**`, +); +md.push(''); + +// Suite table +md.push('| Suite | Tests | Pass | Fail | Errors | Mean | Duration |'); +md.push('|-------|------:|-----:|-----:|-------:|-----:|---------:|'); + +for (const [suite, tests] of suites) { + const pass = tests.filter( + (t) => t.execution_status !== 'execution_error' && (t.score ?? 0) >= threshold, + ).length; + const errors = tests.filter((t) => t.execution_status === 'execution_error').length; + const fail = tests.length - pass - errors; + const mean = (tests.reduce((s, t) => s + (t.score ?? 0), 0) / tests.length).toFixed(3); + const duration = tests.reduce((s, t) => s + (t.duration_ms ?? 0), 0); + const durationStr = duration > 0 ? `${(duration / 1000).toFixed(1)}s` : '-'; + const suiteIcon = + fail === 0 && errors === 0 ? ':white_check_mark:' : errors > 0 ? ':warning:' : ':x:'; + md.push( + `| ${suiteIcon} ${suite} | ${tests.length} | ${pass} | ${fail} | ${errors} | ${mean} | ${durationStr} |`, + ); +} + +md.push(''); + +// Failed tests detail +const failedTests = results.filter( + (r) => r.execution_status !== 'execution_error' && (r.score ?? 0) < threshold, +); +if (failedTests.length > 0) { + md.push('
'); + md.push(`:x: ${failedTests.length} quality failure(s)`); + md.push(''); + for (const t of failedTests.slice(0, 50)) { + const name = t.test_id || 'unknown'; + const suite = t.dataset || 'default'; + md.push( + `**${suite} / ${name}** — score: ${(t.score ?? 0).toFixed(3)} | target: ${t.target ?? '-'}`, + ); + if (t.assertions) { + const failed = t.assertions.filter((a) => !a.passed); + for (const a of failed) { + md.push(` - :x: ${a.text ?? 'assertion failed'}`); + } + } + md.push(''); + } + if (failedTests.length > 50) { + md.push(`_...and ${failedTests.length - 50} more_`); + } + md.push('
'); + md.push(''); +} + +// Error tests detail +const errorTests = results.filter((r) => r.execution_status === 'execution_error'); +if (errorTests.length > 0) { + md.push('
'); + md.push(`:warning: ${errorTests.length} execution error(s)`); + md.push(''); + for (const t of errorTests.slice(0, 30)) { + const name = t.test_id || 'unknown'; + md.push(`**${name}** — ${t.failure_reason_code ?? 'error'}: ${t.error ?? 'unknown error'}`); + md.push(''); + } + if (errorTests.length > 30) { + md.push(`_...and ${errorTests.length - 30} more_`); + } + md.push('
'); +} + +console.log(md.join('\n'));