-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_evaluation.sh
More file actions
executable file
·80 lines (69 loc) · 2.13 KB
/
run_evaluation.sh
File metadata and controls
executable file
·80 lines (69 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# ARC Challenge Evaluation Runner
#
# Interactive launcher that asks which evaluation mode to run and then starts
# the matching Python entry point:
#   1) terminal mode -> python3 evaluate_first_20.py
#   2) GUI mode      -> python3 evaluate_gui.py
#   3) quick test    -> inline Python that scores only the first 2 tasks
#
# Usage: ./run_evaluation.sh
#   Run it from the directory that contains evaluate_first_20.py and
#   evaluate_gui.py: both are referenced by relative path / plain import.
#
# Exit status:
#   0      the selected evaluation finished successfully
#   1      the menu choice was invalid (or stdin hit EOF before a choice)
#   other  exit status of the python3 command that failed

set -euo pipefail

# Python source for the quick test. It is kept in a quoted here-document so the
# shell performs no expansion on it ($, backticks and backslashes reach Python
# untouched). `read -d ''` returns non-zero when it reaches the end of the
# here-document, hence the `|| true` under `set -e`.
IFS= read -r -d '' QUICK_TEST_PY <<'PY' || true
from evaluate_first_20 import load_data, grids_equal
from arc_solver.solver import solve_task

challenges, solutions = load_data()
task_ids = list(challenges.keys())[:2]
total_score = 0

print('Task ID | Status | Score | Accuracy | Details')
print('-' * 60)

for task_id in task_ids:
    try:
        task = challenges[task_id]
        result = solve_task(task)
        prediction = result['attempt_1'][0]
        gold_solution = solutions[task_id][0]
        is_correct, details, accuracy = grids_equal(prediction, gold_solution)
        score = 1 if is_correct else 0
        status = '✅ PASS' if is_correct else '❌ FAIL'
        total_score += score
    except Exception as e:
        # A crashing task counts as a miss instead of aborting the whole run.
        score = 0
        status = '⚠️ ERROR'
        accuracy = 0.0
        details = str(e)[:30]
    print(f'{task_id:15} | {status:6} | {score:5} | {accuracy*100:6.1f}% | {details[:30]}')

# Use the number of tasks actually evaluated so the percentage stays correct
# (and no division by zero occurs) when fewer than 2 tasks are available.
total = len(task_ids)
percent = total_score / total * 100 if total else 0.0
print(f'\nQuick Test Score: {total_score}/{total} = {percent:.0f}%')
PY
readonly QUICK_TEST_PY

# Print the banner and the list of available evaluation modes.
print_menu() {
  echo "🏆 ARC Challenge Evaluation System"
  echo "=================================="
  echo ""
  echo "Choose evaluation mode:"
  echo "1) Terminal mode (detailed logging, real-time)"
  echo "2) GUI mode (visual interface)"
  echo "3) Quick test (first 2 tasks only)"
  echo ""
}

# Mode 1: run the terminal evaluation script. Returns python3's exit status.
run_terminal_mode() {
  echo ""
  echo "🚀 Starting terminal evaluation..."
  echo "📝 Logs will be saved to evaluation_logs/"
  echo ""
  python3 evaluate_first_20.py
}

# Mode 2: run the GUI evaluation script. Returns python3's exit status.
run_gui_mode() {
  echo ""
  echo "🖥️ Starting GUI evaluation..."
  echo "💡 Click 'Start Evaluation' in the GUI window"
  echo ""
  python3 evaluate_gui.py
}

# Mode 3: run the inline quick test. Returns python3's exit status.
run_quick_test() {
  echo ""
  echo "⚡ Quick test mode (first 2 tasks)..."
  echo ""
  python3 -c "$QUICK_TEST_PY"
}

# Prompt for a mode, dispatch to it, and report the outcome.
main() {
  print_menu

  local choice=""
  # -r keeps backslashes literal; on EOF fall through to the invalid-choice arm.
  read -r -p "Enter choice (1-3): " choice || choice=""

  local status=0
  case "$choice" in
    1)
      run_terminal_mode || status=$?
      ;;
    2)
      run_gui_mode || status=$?
      ;;
    3)
      run_quick_test || status=$?
      ;;
    *)
      echo "Invalid choice. Exiting." >&2
      exit 1
      ;;
  esac

  echo ""
  # Only claim success when the Python process actually succeeded; otherwise
  # propagate its exit status so callers and CI can detect the failure.
  if (( status != 0 )); then
    echo "❌ Evaluation failed (exit code ${status})." >&2
    exit "$status"
  fi
  echo "✨ Evaluation complete!"
}

main "$@"