trampolines/examples/adversarial_code_review.py at main · hud-evals/trampolines · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Adversarial code review -- two agents in the same environment.

One agent writes/fixes code. The other finds bugs and edge cases.
They bounce infinitely until the code is solid (or timeout).

    python -m trampolines run examples/adversarial_code_review.py

Both agents share the same HUD environment (bash + edit tools),
so the reviewer can actually RUN the code to find real bugs.
"""

from __future__ import annotations

import os

import hud
from hud.agents import create_agent
from hud.tools.coding import EditTool, ShellTool
from hud.tools.filesystem import GlobTool, GrepTool, ListTool, ReadTool

base = os.getcwd()
env = hud.Environment("adversarial-review")
env.add_tool(ShellTool(cwd=base))
env.add_tool(EditTool())
env.add_tool(ReadTool(base_path=base))
env.add_tool(GrepTool(base_path=base))
env.add_tool(GlobTool(base_path=base))
env.add_tool(ListTool(base_path=base))

MODEL = "claude-sonnet-4-20250514"
coder = create_agent(MODEL, verbose=True)
reviewer = create_agent(MODEL, verbose=True)


@env.scenario("code")
async def code_scenario(task: str):
    yield f"""You are a developer. Complete this task using the available tools.
Write clean, tested code.

Task: {task}

When done, submit a summary of what you implemented."""
    yield 1.0


@env.scenario("review")
async def review_scenario(task: str):
    yield f"""You are a ruthless code reviewer and QA engineer.
The codebase was just modified. Your job:

1. Read the code that was changed
2. Run it and try to break it
3. Find bugs, edge cases, missing error handling
4. Write failing test cases that expose problems

If you find issues, describe them clearly with reproduction steps.
If the code is solid and all edge cases are handled, say "LGTM".

Context: {task}"""
    yield 1.0


async def write(task: str) -> object:
    async with hud.eval(env("code", task=task), trace=False, quiet=True) as ctx:
        result = await coder.run(ctx, max_steps=20)
    return review(f"Original task: {task}\n\nCoder's summary: {result.content}")


async def review(context: str) -> str | object:
    async with hud.eval(env("review", task=context), trace=False, quiet=True) as ctx:
        result = await reviewer.run(ctx, max_steps=20)
    feedback = result.content or ""
    if "lgtm" in feedback.lower():
        return f"Code approved.\n\nFinal review:\n{feedback}"
    return write(f"{context}\n\nReviewer feedback:\n{feedback}")