-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathlocal_test.py
More file actions
69 lines (55 loc) · 1.88 KB
/
local_test.py
File metadata and controls
69 lines (55 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Local testing for UI-CUBE deterministic tasks."""
import asyncio
import os
import hud
from hud import Environment
from hud.agents import create_agent
from prompts import SYSTEM_PROMPT
from hud.agents.gemini_cua import GeminiCUAAgent
from hud.agents import OpenAIAgent
from hud.agents import OperatorAgent
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# URL the environment connects to. Taken from the HUD_DEV_URL environment
# variable when set, otherwise the localhost default below.
# NOTE(review): presumably the MCP endpoint of a locally running dev server — confirm.
DEV_URL = os.getenv("HUD_DEV_URL", "http://localhost:8765/mcp")

# Shared environment handle; test_sample() calls it to build tasks.
env = Environment("ui-cube")
env.connect_url(DEV_URL)

# Alternative model identifiers; replace the active assignment below to use one.
# model = "gpt-5.1"
# model = "grok-4-1-fast"
# model = "z-ai/glm-4.5v"
# model = "claude-sonnet-4-5"
# model = "gemini-3-pro-preview"
# Model name handed to create_agent() in test_sample().
model = "claude-sonnet-4-5"
# Passed as max_steps to agent.run() in test_sample().
max_steps = 30
async def test_sample(task_id: str = "combo-box-tasks--1") -> None:
    """Run the configured agent against one deterministic UI-CUBE task.

    Builds the ``"deterministic"`` task for ``task_id`` from the module-level
    ``env``, runs an agent created for the module-level ``model`` inside
    ``hud.eval`` for at most ``max_steps`` steps, and prints the resulting
    reward together with a success flag (reward equal to ``1.0``).

    Args:
        task_id: Identifier of the deterministic task to run.
    """
    print(f"\n=== Test: {task_id} ===")

    task = env("deterministic", task_id=task_id)
    # Lazy %-style arguments: the prompt is only interpolated when INFO
    # logging is actually enabled.
    logger.info("SYSTEM_MESSAGE: %s", SYSTEM_PROMPT)

    async with hud.eval(task) as ctx:
        agent = create_agent(
            model=model,
            system_prompt=SYSTEM_PROMPT,
            # NOTE(review): spelling corrected from "gemnini_computer", which
            # presumably matched no tool; confirm "gemini_computer" is the
            # tool name exposed by the environment.
            disallowed_tools=["hud-logs", "gemini_computer"],
        )
        # Alternative agents, kept for switching providers by hand:
        # agent = OperatorAgent.create(
        #     model="computer-use-preview",
        #     system_prompt=SYSTEM_PROMPT,
        #     disallowed_tools=["hud-logs"],
        #     validate_api_key=False,
        # )
        # agent = GeminiCUAAgent.create(
        #     model="gemini-2.5-computer-use-preview-10-2025",
        #     system_prompt=SYSTEM_PROMPT,
        #     disallowed_tools=["hud-logs"],
        #     validate_api_key=False,
        # )
        await agent.run(ctx, max_steps=max_steps)

    # NOTE(review): reported after the eval block exits, on the assumption
    # that hud.eval finalizes ctx.reward when the context closes — confirm.
    print(f"Reward: {ctx.reward}")
    print(f"Success: {ctx.reward == 1.0}")
async def main():
    """Print the run banner, then execute the single task chosen for this run."""
    banner_rule = "=" * 40
    print("UI-CUBE Local Test")
    print(banner_rule)

    # Call test_sample() with no argument to run its default task instead.
    selected_task = "navigation-search-interaction--16"
    await test_sample(selected_task)
# Script entry point: drive the async main() only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())