#!/usr/bin/env python3
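"""Entry point for the MASArena benchmark runner.

Parses CLI options and dispatches to BenchmarkRunner. Illustrative
invocations (flags as defined in main() below; file paths are placeholders):

    python main.py --benchmark math --agent-system single_agent --limit 10
    python main.py --benchmark math --async-run --concurrency 5
    python main.py --use-mcp-tools --mcp-config-file path/to/mcp_servers.json
"""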
import argparse
import asyncio
import datetime
import json
import logging
import sys
from pathlib import Path

from mas_arena.benchmark_runner import BenchmarkRunner

logger = logging.getLogger(__name__)


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Run benchmarks for multi-agent systems")

    # Import available agent systems and benchmarks
    from mas_arena.agents import AVAILABLE_AGENT_SYSTEMS
    from mas_arena.evaluators import BENCHMARKS

    parser.add_argument(
        "--benchmark",
        type=str,
        default="math",
        choices=list(BENCHMARKS.keys()),
        help="Benchmark to run (default: math)",
    )
    parser.add_argument(
        "--data", type=str, default=None, help="Path to benchmark data (default: data/{benchmark}_test.jsonl)"
    )
    parser.add_argument(
        "--limit", type=int, default=None, help="Maximum number of problems to process (default: None)"
    )
    parser.add_argument(
        "--agent-system",
        type=str,
        default="single_agent",
        choices=list(AVAILABLE_AGENT_SYSTEMS.keys()),
        help="Agent system to use (default: single_agent)",
    )
    parser.add_argument(
        "--verbose", action=argparse.BooleanOptionalAction, default=True,
        help="Print progress information (default: True)"
    )
    parser.add_argument(
        "--results-dir", type=str, default="results", help="Directory to store results (default: results)"
    )
    parser.add_argument(
        "--use-mcp-tools", action="store_true", default=False,
        help="Enable integration of MCP tools (default: False)"
    )
    parser.add_argument(
        "--mcp-config-file", type=str, default=None,
        help="Path to MCP servers configuration JSON file"
    )
    parser.add_argument(
        "--use-tools", action="store_true", default=False,
        help="Enable integration of tools (default: False)"
    )
    parser.add_argument(
        "--async-run", action="store_true", help="Run the benchmark asynchronously."
    )
    parser.add_argument(
        "--concurrency", type=int, default=10, help="Concurrency level for async run."
    )
    parser.add_argument(
        "--data-id", type=str, default=None,
        help="Data ID to use (default: None)"
    )

    # Parse arguments
    args = parser.parse_args()

    # Build agent configuration for MCP tool integration
    agent_config = {}
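    # Keys set on agent_config below (use_mcp_tools, mcp_servers,
    # mcp_config_file, mock_mcp, use_tools) are forwarded to the agent
    # system only when the config ends up non-empty.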
    if args.use_mcp_tools:
        agent_config["use_mcp_tools"] = True
        if not args.mcp_config_file:
            parser.error("--use-mcp-tools requires --mcp-config-file")
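        # The file is read as JSON and passed through unmodified; a minimal
        # config might look like the following (assumed shape, for
        # illustration only; the schema is defined by the agent system):
        #   {"my_server": {"command": "uvx", "args": ["some-mcp-server"]}}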
        try:
            with open(args.mcp_config_file, "r") as f:
                agent_config["mcp_servers"] = json.load(f)
            # Store the config file path for reference
            agent_config["mcp_config_file"] = args.mcp_config_file
            # Enable mock mode if "mock" appears in the config file name
            if "mock" in args.mcp_config_file.lower():
                agent_config["mock_mcp"] = True
                print(f"Using mock MCP tools (config: {args.mcp_config_file})")
        except Exception as e:
            print(f"Failed to load MCP config file: {e}", file=sys.stderr)
            return 1
    if args.use_tools:
        agent_config["use_tools"] = True

    # Create the results directory if needed
    Path(args.results_dir).mkdir(parents=True, exist_ok=True)

    # Print header
    print("\n" + "=" * 80)
    print(f"Multi-Agent Benchmark Runner ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')})")
    print("=" * 80)
    print(f"Benchmark: {args.benchmark}")
    print(f"Agent System: {args.agent_system}")
    print(f"Data: {args.data or 'default'}")
    print(f"Limit: {args.limit or 'all'}")
    print("=" * 80 + "\n")
    # Create benchmark runner
    runner = BenchmarkRunner(results_dir=args.results_dir)

    # Check for concurrency support
    benchmark_config = BENCHMARKS.get(args.benchmark, {})
    evaluator_class = benchmark_config.get("evaluator")
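    # Evaluators are assumed to opt out of async execution via a
    # SUPPORTS_CONCURRENCY class attribute; when absent, support is assumed.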
    supports_concurrency = evaluator_class and getattr(evaluator_class, "SUPPORTS_CONCURRENCY", True)
    run_async = args.async_run and supports_concurrency

    if args.async_run and not supports_concurrency:
        if args.verbose:
            print(f"Warning: {args.benchmark} benchmark does not support concurrency. Running synchronously.\n")
    # Run benchmark
    try:
        if run_async:
            summary = asyncio.run(runner.arun(
                benchmark_name=args.benchmark,
                data_path=args.data,
                limit=args.limit,
                agent_system=args.agent_system,
                agent_config=agent_config if agent_config else None,
                verbose=args.verbose,
                concurrency=args.concurrency,
            ))
        else:
            summary = runner.run(
                benchmark_name=args.benchmark,
                data_path=args.data,
                limit=args.limit,
                agent_system=args.agent_system,
                agent_config=agent_config if agent_config else None,
                verbose=args.verbose,
            )
        logger.info(f"Benchmark summary: {summary}")
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

if __name__ == "__main__":
    sys.exit(main())