# Execution Metrics Evaluator Demo
# Demonstrates the built-in execution_metrics evaluator for declarative threshold-based checks.
#
# The execution_metrics evaluator allows you to set limits on:
# - max_tool_calls: Maximum number of tool invocations
# - max_llm_calls: Maximum number of LLM calls (assistant messages)
# - max_tokens: Maximum total tokens (input + output)
# - max_cost_usd: Maximum cost in USD
# - max_duration_ms: Maximum execution duration in milliseconds
# - target_exploration_ratio: Target ratio of read-only tool calls (with tolerance)
#
# Only the thresholds you specify are checked; omitted ones are ignored.
# The score is proportional: hits / (hits + misses), where a hit is a
# satisfied threshold and a miss is a violated one.
#
# Run with:
# bun agentv eval examples/features/execution-metrics/evals/dataset.yaml --dry-run
description: Demonstrates the built-in execution_metrics evaluator

# Mock agent that returns realistic execution metrics
execution:
  target: mock_metrics_agent
evalcases:
  # ==========================================
  # Example 1: Simple threshold check - PASS
  # Check that an efficient agent stays within limits
  # ==========================================
  - id: simple-thresholds-pass
    expected_outcome: |-
      Agent responds efficiently within all specified limits.
    input:
      - role: user
        content: Hello, this is a simple question.
    execution:
      evaluators:
        - name: efficiency-check
          type: execution_metrics
          max_tool_calls: 10
          max_tokens: 2000
          max_duration_ms: 10000
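  # A worked score for the evaluator above (assuming the proportional
  # hits / (hits + misses) scoring described in the header): a run with
  # 4 tool calls (hit), 2500 tokens (miss), and a 3000 ms duration (hit)
  # would score 2 / 3.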
  # ==========================================
  # Example 2: Multiple thresholds - PASS
  # Comprehensive check with all threshold types
  # ==========================================
  - id: comprehensive-thresholds
    expected_outcome: |-
      Agent performs a task within all efficiency constraints.
    input:
      - role: user
        content: Hello, give me a simple response.
    execution:
      evaluators:
        - name: full-efficiency-check
          type: execution_metrics
          max_tool_calls: 15
          max_llm_calls: 5
          max_tokens: 3000
          max_cost_usd: 0.1
          max_duration_ms: 30000
  # ==========================================
  # Example 3: Research task with tool trajectory + metrics
  # Combines multiple evaluator types
  # ==========================================
  - id: research-with-metrics
    expected_outcome: |-
      Agent performs research and uses tools efficiently.
      Metrics reflect reasonable token usage and tool calls.
    input:
      - role: user
        content: Research and analyze the topic of machine learning.
    execution:
      evaluators:
        # Check tool trajectory
        - name: trajectory-check
          type: tool_trajectory
          mode: any_order
          minimums:
            search: 1
        # Check execution efficiency
        - name: metrics-check
          type: execution_metrics
          max_tool_calls: 20
          max_tokens: 5000
  # ==========================================
  # Example 4: Exploration ratio check
  # Validates the balance between read-only and write operations
  # ==========================================
  - id: exploration-ratio-check
    expected_outcome: |-
      Agent maintains a good balance of exploration (reading) vs
      action (writing/editing) tool calls.
    input:
      - role: user
        content: Read the documentation and make small improvements.
    execution:
      evaluators:
        - name: exploration-balance
          type: execution_metrics
          target_exploration_ratio: 0.6 # 60% of tool calls should be read-only
          exploration_tolerance: 0.2 # Allow +/- 20% variance
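  # A worked check for the evaluator above (assuming the tolerance is an
  # absolute band around the target): read-only ratios in [0.4, 0.8] pass,
  # so a run with 7 read-only calls out of 10 total (ratio 0.7) would count
  # as a hit.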
  # ==========================================
  # Example 5: Strict cost budget
  # Useful for cost-sensitive applications
  # ==========================================
  - id: cost-budget-check
    expected_outcome: |-
      Agent completes the task within the specified cost budget.
    input:
      - role: user
        content: Generate a brief summary.
    execution:
      evaluators:
        - name: cost-check
          type: execution_metrics
          max_cost_usd: 0.05
          weight: 2.0 # Double weight for cost compliance
  # ==========================================
  # Example 6: Combining with code_judge
  # Use execution_metrics for thresholds, code_judge for custom logic
  # ==========================================
  - id: hybrid-evaluation
    expected_outcome: |-
      Agent passes both declarative thresholds and custom evaluation.
    input:
      - role: user
        content: Process the data efficiently.
    execution:
      evaluators:
        # Declarative threshold checks
        - name: metric-thresholds
          type: execution_metrics
          max_tool_calls: 10
          max_duration_ms: 5000
        # Custom evaluation logic (for more complex checks)
        - name: custom-check
          type: code_judge
          script: ["bun", "run", "../scripts/check-metrics-present.ts"]