diff --git a/.github/workflows/validate-tasks.yml b/.github/workflows/validate-tasks.yml
new file mode 100644
index 000000000..2b4190b78
--- /dev/null
+++ b/.github/workflows/validate-tasks.yml
@@ -0,0 +1,163 @@
name: Validate Harbor Tasks

on:
  pull_request:
    paths:
      - 'data/**'
  push:
    branches: [main]
    paths:
      - 'data/**'
  workflow_dispatch:
    inputs:
      verify_solutions:
        description: 'Run solution verification (Docker required)'
        type: boolean
        default: false
      force_verify:
        description: 'Re-verify all tasks (ignore checksum cache)'
        type: boolean
        default: false

jobs:
  # Computes the set of changed task directories once, shared by all tiers.
  detect-changed-tasks:
    runs-on: ubuntu-latest
    outputs:
      changed_tasks: ${{ steps.detect.outputs.changed_tasks }}
      any_changed: ${{ steps.detect.outputs.any_changed }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Detect changed task directories
        id: detect
        run: |
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            BASE=${{ github.event.pull_request.base.sha }}
            HEAD=${{ github.sha }}
          else
            # For push events, compare with previous commit
            BASE=${{ github.event.before }}
            HEAD=${{ github.sha }}
          fi

          # FIX: github.event.before is the all-zero SHA on branch creation
          # and after force pushes, which would make `git diff` fail. Fall
          # back to the parent commit, or the empty tree for an initial
          # commit (4b82... is the well-known empty-tree object).
          if ! git rev-parse -q --verify "$BASE^{commit}" >/dev/null 2>&1; then
            BASE=$(git rev-parse -q --verify HEAD~1 || git hash-object -t tree /dev/null)
          fi

          # Find changed task directories
          CHANGED=$(git diff --name-only "$BASE" "$HEAD" -- 'data/' \
            | cut -d'/' -f1-2 \
            | sort -u \
            | tr '\n' ' ')

          if [ -z "$CHANGED" ]; then
            echo "any_changed=false" >> "$GITHUB_OUTPUT"
            echo "changed_tasks=" >> "$GITHUB_OUTPUT"
          else
            echo "any_changed=true" >> "$GITHUB_OUTPUT"
            echo "changed_tasks=$CHANGED" >> "$GITHUB_OUTPUT"
          fi
          echo "Changed tasks: $CHANGED"

  # ── Tier 1: Schema lint (fast, always runs) ──────────────────────
  lint:
    runs-on: ubuntu-latest
    needs: detect-changed-tasks
    if: needs.detect-changed-tasks.outputs.any_changed == 'true' || github.event_name == 'workflow_dispatch'
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Lint changed tasks
        run: |
          TASKS="${{ needs.detect-changed-tasks.outputs.changed_tasks }}"
          if [ -z "$TASKS" ]; then
            echo "Linting all tasks..."
            python scripts/validate_harbor_task.py data/
          else
            echo "Linting changed tasks: $TASKS"
            python scripts/validate_harbor_task.py $TASKS
          fi

  # ── Tier 2: Docker build (medium, runs on PRs) ──────────────────
  docker-build:
    runs-on: ubuntu-latest
    needs: [detect-changed-tasks, lint]
    if: needs.detect-changed-tasks.outputs.any_changed == 'true'
    steps:
      - uses: actions/checkout@v4

      - name: Build Docker images for changed tasks
        run: |
          TASKS="${{ needs.detect-changed-tasks.outputs.changed_tasks }}"
          FAILED=0

          for TASK_DIR in $TASKS; do
            TASK_ID=$(basename "$TASK_DIR")
            DOCKERFILE="$TASK_DIR/environment/Dockerfile"
            # Sanitize for Docker tag (only lowercase alnum and single dashes)
            SAFE_TAG=$(echo "$TASK_ID" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g; s/-\+/-/g; s/^-//; s/-$//')
            TAG="hci-${SAFE_TAG:0:120}"

            if [ ! -f "$DOCKERFILE" ]; then
              echo "::error::$TASK_ID: missing Dockerfile"
              FAILED=$((FAILED + 1))
              continue
            fi

            echo "Building $TASK_ID..."
            if docker build -t "$TAG" "$TASK_DIR/environment/" 2>&1; then
              echo "::notice::$TASK_ID: build OK"
              docker rmi -f "$TAG" 2>/dev/null || true
            else
              echo "::error::$TASK_ID: Docker build failed"
              FAILED=$((FAILED + 1))
            fi
          done

          if [ "$FAILED" -gt 0 ]; then
            echo "$FAILED task(s) failed Docker build"
            exit 1
          fi

  # ── Tier 3: Solution verification (heavy, opt-in or on push to main) ─
  verify-solutions:
    runs-on: ubuntu-latest
    needs: [detect-changed-tasks, lint]
    if: >-
      (github.event_name == 'workflow_dispatch' && github.event.inputs.verify_solutions == 'true') ||
      (github.event_name == 'push' && github.ref == 'refs/heads/main')
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Verify solutions
        run: |
          FORCE_FLAG=""
          if [ "${{ github.event.inputs.force_verify }}" = "true" ]; then
            FORCE_FLAG="--force"
          fi

          TASKS="${{ needs.detect-changed-tasks.outputs.changed_tasks }}"
          if [ -z "$TASKS" ]; then
            python scripts/verify_solutions.py $FORCE_FLAG --jobs 2 data/
          else
            python scripts/verify_solutions.py $FORCE_FLAG --jobs 2 $TASKS
          fi

      - name: Commit updated checksums
        if: success()
        run: |
          # FIX: stage first, then inspect the index. The previous
          # `git diff --quiet .validated-solutions.json` never detects a
          # newly created (still untracked) checksum file, so the very
          # first verification run would silently commit nothing.
          git add .validated-solutions.json 2>/dev/null || true
          if git diff --cached --quiet; then
            echo "No checksum changes to commit"
          else
            git config user.name "github-actions[bot]"
            git config user.email "github-actions[bot]@users.noreply.github.com"
            git commit -m "ci: update validated solution checksums"
            git push
          fi
diff --git a/scripts/validate_harbor_task.py b/scripts/validate_harbor_task.py
new file mode 100755
index 000000000..062a32152
--- /dev/null
+++ b/scripts/validate_harbor_task.py
@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""
Validate that task directories conform to the harbor specification.
"""
Usage:
    # Validate all tasks in data/
    python scripts/validate_harbor_task.py data/

    # Validate specific task directories
    python scripts/validate_harbor_task.py data/task_abc123 data/task_def456

    # Validate with verbose output
    python scripts/validate_harbor_task.py -v data/

Exit codes:
    0 - All tasks valid
    1 - Validation errors found
"""

import argparse
import os
import sys

# tomllib is stdlib from Python 3.11; fall back to tomli, then to
# plain-text section checks if neither parser is available.
try:
    import tomllib
except ImportError:
    try:
        import tomli as tomllib
    except ImportError:
        tomllib = None


def validate_task(task_dir: str, verbose: bool = False) -> list[str]:
    """Validate a single task directory against the harbor layout.

    Args:
        task_dir: Path to one task directory (contains task.toml etc.).
        verbose: When True, print each passing check to stdout.

    Returns:
        A list of human-readable error strings; empty means the task is valid.
    """
    errors = []
    task_id = os.path.basename(task_dir)

    def err(msg: str):
        errors.append(f"{task_id}: {msg}")

    def info(msg: str):
        if verbose:
            print(f"  [ok] {msg}")

    # ── Required files ──────────────────────────────────────────────
    required_files = {
        "task.toml": "task configuration",
        "instruction.md": "task instructions",
        "environment/Dockerfile": "Docker environment",
        "solution/solve.sh": "solution script",
        "tests/test.sh": "test script",
    }

    for rel_path, desc in required_files.items():
        full_path = os.path.join(task_dir, rel_path)
        if not os.path.isfile(full_path):
            err(f"missing required file: {rel_path} ({desc})")
        else:
            info(f"{rel_path} exists")

    # ── task.toml validation ────────────────────────────────────────
    toml_path = os.path.join(task_dir, "task.toml")
    if os.path.isfile(toml_path):
        if tomllib is None:
            # Python < 3.11 and no tomli installed — do basic text checks
            with open(toml_path, encoding="utf-8") as f:
                content = f.read()
            if not content.strip():
                err("task.toml is empty")
            else:
                # Check for required sections
                for section in ["[verifier]", "[agent]", "[environment]"]:
                    if section not in content:
                        err(f"task.toml missing section: {section}")
                    else:
                        info(f"task.toml has {section}")
        else:
            try:
                with open(toml_path, "rb") as f:
                    toml_data = tomllib.load(f)
            except Exception as e:
                err(f"task.toml parse error: {e}")
                toml_data = None

            if toml_data is not None:
                # Required sections
                for section in ["verifier", "agent", "environment"]:
                    if section not in toml_data:
                        err(f"task.toml missing section: [{section}]")
                    else:
                        info(f"task.toml has [{section}]")

                # Verifier checks
                verifier = toml_data.get("verifier", {})
                if "timeout_sec" not in verifier:
                    err("task.toml [verifier] missing timeout_sec")
                elif not isinstance(verifier["timeout_sec"], (int, float)):
                    err("task.toml [verifier].timeout_sec must be a number")
                elif verifier["timeout_sec"] <= 0:
                    err("task.toml [verifier].timeout_sec must be positive")
                else:
                    info(f"verifier.timeout_sec = {verifier['timeout_sec']}")

                # Agent checks
                agent = toml_data.get("agent", {})
                if "timeout_sec" not in agent:
                    err("task.toml [agent] missing timeout_sec")
                elif not isinstance(agent["timeout_sec"], (int, float)):
                    err("task.toml [agent].timeout_sec must be a number")
                elif agent["timeout_sec"] <= 0:
                    err("task.toml [agent].timeout_sec must be positive")
                else:
                    info(f"agent.timeout_sec = {agent['timeout_sec']}")

                # Environment checks
                env = toml_data.get("environment", {})
                if "cpus" not in env:
                    err("task.toml [environment] missing cpus")
                if "memory" not in env and "memory_mb" not in env:
                    err("task.toml [environment] missing memory or memory_mb")

    # ── instruction.md validation ───────────────────────────────────
    instr_path = os.path.join(task_dir, "instruction.md")
    if os.path.isfile(instr_path):
        size = os.path.getsize(instr_path)
        if size == 0:
            err("instruction.md is empty")
        elif size < 20:
            err(f"instruction.md suspiciously short ({size} bytes)")
        else:
            info(f"instruction.md is {size} bytes")

    # ── Dockerfile validation ───────────────────────────────────────
    dockerfile_path = os.path.join(task_dir, "environment", "Dockerfile")
    if os.path.isfile(dockerfile_path):
        with open(dockerfile_path, encoding="utf-8") as f:
            dockerfile = f.read()
        if not dockerfile.strip():
            err("Dockerfile is empty")
        else:
            lines = [l.strip() for l in dockerfile.splitlines() if l.strip() and not l.strip().startswith("#")]
            if not lines:
                err("Dockerfile has no instructions (only comments/blanks)")
            else:
                # FIX: ARG instructions are allowed before FROM (Docker
                # >= 17.05), and instruction keywords are case-insensitive,
                # so skip leading ARGs and compare case-insensitively
                # instead of requiring lines[0] to literally start "FROM ".
                first = 0
                while first < len(lines) and lines[first].upper().startswith("ARG "):
                    first += 1
                if first >= len(lines) or not lines[first].upper().startswith("FROM "):
                    err(f"Dockerfile first instruction must be FROM, got: {lines[0][:60]}")
                else:
                    info(f"Dockerfile starts with {lines[first][:60]}")

    # ── solve.sh validation ─────────────────────────────────────────
    solve_path = os.path.join(task_dir, "solution", "solve.sh")
    if os.path.isfile(solve_path):
        with open(solve_path, encoding="utf-8") as f:
            solve_content = f.read()
        if not solve_content.strip():
            err("solution/solve.sh is empty")
        elif not solve_content.startswith("#!/"):
            err("solution/solve.sh missing shebang (#!/bin/bash or similar)")
        else:
            info("solution/solve.sh has shebang")

    # ── test.sh validation ──────────────────────────────────────────
    test_path = os.path.join(task_dir, "tests", "test.sh")
    if os.path.isfile(test_path):
        with open(test_path, encoding="utf-8") as f:
            # Read just the first 1KB to check the header (test.sh can be huge with embedded tarballs)
            test_header = f.read(1024)
        if not test_header.strip():
            err("tests/test.sh is empty")
        elif not test_header.startswith("#!/"):
            err("tests/test.sh missing shebang")
        else:
            info("tests/test.sh has shebang")

    return errors


def find_task_dirs(path: str) -> list[str]:
    """Find all task directories under a path.

    A task directory is identified by containing a task.toml file.

    FIX: a nonexistent or non-directory *path* previously escaped as an
    unhandled OSError from os.listdir; now it is reported to stderr and
    yields no tasks, so main() exits with a clean error message instead
    of a traceback.
    """
    tasks = []
    if os.path.isfile(os.path.join(path, "task.toml")):
        # path itself is a task directory
        tasks.append(path)
    elif os.path.isdir(path):
        # Scan one level deep (data/<task>/task.toml)
        for entry in sorted(os.listdir(path)):
            candidate = os.path.join(path, entry)
            if os.path.isdir(candidate) and os.path.isfile(os.path.join(candidate, "task.toml")):
                tasks.append(candidate)
    else:
        print(f"warning: {path} is not a directory, skipping", file=sys.stderr)
    return tasks


def main():
    """CLI entry point: validate every task found under the given paths.

    Exits 0 when all tasks pass, 1 when any task has errors (or no task
    directories were found at all).
    """
    parser = argparse.ArgumentParser(description="Validate harbor task directories")
    parser.add_argument("paths", nargs="+", help="Task directories or parent directories to scan")
    parser.add_argument("-v", "--verbose", action="store_true", help="Show passing checks too")
    parser.add_argument("--json", action="store_true", help="Output results as JSON")
    args = parser.parse_args()

    all_task_dirs = []
    for p in args.paths:
        all_task_dirs.extend(find_task_dirs(p))

    if not all_task_dirs:
        print("No task directories found.", file=sys.stderr)
        sys.exit(1)

    all_errors = {}
    passed = 0
    failed = 0

    for task_dir in all_task_dirs:
        task_id = os.path.basename(task_dir)
        if args.verbose:
            print(f"\nValidating {task_id}...")
        errors = validate_task(task_dir, verbose=args.verbose)
        if errors:
            all_errors[task_id] = errors
            failed += 1
        else:
            passed += 1

    if args.json:
        import json
        result = {
            "passed": passed,
            "failed": failed,
            "total": passed + failed,
            "errors": all_errors,
        }
        print(json.dumps(result, indent=2))
    else:
        if all_errors:
            print(f"\n{'='*60}")
            print(f"VALIDATION FAILED: {failed}/{passed+failed} tasks have errors\n")
            for task_id, errors in all_errors.items():
                for e in errors:
                    print(f"  ERROR: {e}")
                print()
        else:
            print(f"\nAll {passed} tasks passed validation.")

    sys.exit(1 if all_errors else 0)


if __name__ == "__main__":
    main()
# diff --git a/scripts/verify_solutions.py b/scripts/verify_solutions.py
# new file mode 100755
# index 000000000..d546a5dbb
# @@ -0,0 +1,292 @@
#!/usr/bin/env python3
"""
Build Docker images, run solutions, and verify they pass tests.

Tracks which tasks have been successfully verified via a checksum file,
so each task only needs to be verified once (re-verified if changed).

Usage:
    # Verify all unverified/changed tasks
    python scripts/verify_solutions.py data/

    # Verify specific tasks (ignores checksum cache)
    python scripts/verify_solutions.py --force data/task_abc123

    # Dry run — show what would be verified without running anything
    python scripts/verify_solutions.py --dry-run data/

    # Set concurrency for parallel Docker builds
    python scripts/verify_solutions.py --jobs 4 data/
"""

import argparse
import hashlib
import json
import os
import re
import subprocess
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

# Repo-root file mapping task_id -> content hash of its last verified state.
CHECKSUM_FILE = ".validated-solutions.json"


def compute_task_hash(task_dir: str) -> str:
    """Compute a stable hash of all files in a task directory.

    This is used to detect when a task has changed and needs
    re-verification. Both relative paths and file contents are folded in,
    in sorted order, so the digest is deterministic.
    """
    h = hashlib.sha256()
    task_path = Path(task_dir)
    for filepath in sorted(task_path.rglob("*")):
        if filepath.is_file():
            rel = filepath.relative_to(task_path)
            h.update(str(rel).encode())
            h.update(filepath.read_bytes())
    return h.hexdigest()[:16]


def load_checksums(repo_root: str) -> dict:
    """Load the validated solutions checksum file (empty dict if absent)."""
    path = os.path.join(repo_root, CHECKSUM_FILE)
    if os.path.isfile(path):
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_checksums(repo_root: str, checksums: dict):
    """Save the validated solutions checksum file (sorted, newline-terminated)."""
    path = os.path.join(repo_root, CHECKSUM_FILE)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(checksums, f, indent=2, sort_keys=True)
        f.write("\n")


def find_task_dirs(path: str) -> list[str]:
    """Find all task directories under a path (those containing task.toml).

    FIX: a nonexistent or non-directory *path* no longer raises an
    unhandled OSError from os.listdir; it is reported and skipped,
    matching validate_harbor_task.py's behaviour.
    """
    tasks = []
    if os.path.isfile(os.path.join(path, "task.toml")):
        tasks.append(os.path.abspath(path))
    elif os.path.isdir(path):
        for entry in sorted(os.listdir(path)):
            candidate = os.path.join(path, entry)
            if os.path.isdir(candidate) and os.path.isfile(os.path.join(candidate, "task.toml")):
                tasks.append(os.path.abspath(candidate))
    else:
        print(f"warning: {path} is not a directory, skipping", file=sys.stderr)
    return tasks


def verify_task(task_dir: str, timeout: int = 600, platform: str = "") -> tuple[str, bool, str]:
    """Build a task's Docker image, run the solution, and check the test.

    Args:
        task_dir: Absolute path to the task directory.
        timeout: Seconds allowed for the in-container solution+test run
            (the image build has its own fixed 300 s budget).
        platform: Optional docker --platform value (e.g. linux/amd64).

    Returns:
        (task_id, success, message).
    """
    task_id = os.path.basename(task_dir)
    # Docker tags: [a-z0-9] with single separators, no slashes (local image)
    safe_tag = re.sub(r'[^a-z0-9]', '-', task_id.lower())
    safe_tag = re.sub(r'-+', '-', safe_tag).strip('-')[:120]
    image_tag = f"hci-{safe_tag}"

    try:
        # Step 1: Build Docker image
        env_dir = os.path.join(task_dir, "environment")
        build_cmd = ["docker", "build"]
        if platform:
            build_cmd += ["--platform", platform]
        build_cmd += ["-t", image_tag, env_dir]
        result = subprocess.run(
            build_cmd,
            capture_output=True, text=True, timeout=300,
        )
        if result.returncode != 0:
            return (task_id, False, f"Docker build failed:\n{result.stderr[-500:]}")

        # Step 2: Run solution inside the container
        solve_sh = os.path.join(task_dir, "solution", "solve.sh")
        test_sh = os.path.join(task_dir, "tests", "test.sh")

        # Create a runner script that:
        #   1. Runs the solution
        #   2. Runs the test
        #   3. Checks the reward
        runner = """#!/bin/bash
set -euo pipefail

# Create required directories
mkdir -p /logs/verifier

# Run solution
echo "=== Running solution ==="
bash /solution/solve.sh
echo "=== Solution complete ==="

# Run tests
echo "=== Running tests ==="
bash /tests/test.sh
echo "=== Tests complete ==="

# Check reward
if [ -f /logs/verifier/reward.txt ]; then
    reward=$(cat /logs/verifier/reward.txt | tr -d '[:space:]')
    if [ "$reward" = "1" ] || [ "$reward" = "1.0" ]; then
        echo "REWARD_CHECK_PASSED"
        exit 0
    else
        echo "REWARD_CHECK_FAILED: reward=$reward"
        exit 1
    fi
else
    echo "REWARD_CHECK_FAILED: /logs/verifier/reward.txt not found"
    exit 1
fi
"""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f:
            f.write(runner)
            runner_path = f.name

        try:
            # Mount solution, tests, and runner into the container
            solution_dir = os.path.join(task_dir, "solution")
            tests_dir = os.path.join(task_dir, "tests")
            run_cmd = [
                "docker", "run", "--rm",
            ]
            if platform:
                run_cmd += ["--platform", platform]
            run_cmd += [
                "-v", f"{solution_dir}:/solution:ro",
                "-v", f"{tests_dir}:/tests:ro",
                "-v", f"{runner_path}:/runner.sh:ro",
                image_tag,
                "bash", "/runner.sh",
            ]
            result = subprocess.run(
                run_cmd,
                capture_output=True, text=True, timeout=timeout,
            )
        finally:
            os.unlink(runner_path)

        if result.returncode != 0:
            output = (result.stdout + result.stderr)[-1000:]
            return (task_id, False, f"Solution verification failed:\n{output}")

        if "REWARD_CHECK_PASSED" in result.stdout:
            return (task_id, True, "Solution verified successfully")
        else:
            return (task_id, False, f"Unexpected output:\n{result.stdout[-500:]}")

    except subprocess.TimeoutExpired:
        return (task_id, False, f"Timed out after {timeout}s")
    except Exception as e:
        return (task_id, False, f"Error: {e}")
    finally:
        # Clean up Docker image.
        # FIX: an exception here (e.g. TimeoutExpired from a stuck docker
        # daemon) would propagate out of the finally block and *replace*
        # the function's return value, crashing the worker future — so
        # failures during best-effort cleanup are swallowed deliberately.
        try:
            subprocess.run(
                ["docker", "rmi", "-f", image_tag],
                capture_output=True, timeout=30,
            )
        except (subprocess.TimeoutExpired, OSError):
            pass


def main():
    """CLI entry point: hash tasks, verify changed ones, persist checksums.

    Exits 0 when everything passes (or nothing needed verification),
    1 when any verification fails or no task directories were found.
    """
    parser = argparse.ArgumentParser(description="Verify harbor task solutions")
    parser.add_argument("paths", nargs="+", help="Task directories or parent directories to scan")
    parser.add_argument("--force", action="store_true", help="Re-verify even if checksum matches")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be verified")
    parser.add_argument("--jobs", type=int, default=2, help="Parallel verification jobs (default: 2)")
    parser.add_argument("--timeout", type=int, default=600, help="Per-task timeout in seconds (default: 600)")
    parser.add_argument("--platform", type=str, default="", help="Docker platform (e.g. linux/amd64)")
    parser.add_argument("--json", action="store_true", help="Output results as JSON")
    args = parser.parse_args()

    # Determine repo root (parent of scripts/)
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Find all task directories
    all_task_dirs = []
    for p in args.paths:
        all_task_dirs.extend(find_task_dirs(p))

    if not all_task_dirs:
        print("No task directories found.", file=sys.stderr)
        sys.exit(1)

    # Load existing checksums
    checksums = load_checksums(repo_root)

    # Determine which tasks need verification
    tasks_to_verify = []
    tasks_already_verified = []

    for task_dir in all_task_dirs:
        task_id = os.path.basename(task_dir)
        current_hash = compute_task_hash(task_dir)

        if not args.force and checksums.get(task_id) == current_hash:
            tasks_already_verified.append(task_id)
        else:
            tasks_to_verify.append((task_dir, task_id, current_hash))

    print(f"Tasks found: {len(all_task_dirs)}")
    print(f"Already verified (unchanged): {len(tasks_already_verified)}")
    print(f"Need verification: {len(tasks_to_verify)}")

    if args.dry_run:
        if tasks_to_verify:
            print("\nWould verify:")
            for _, task_id, _ in tasks_to_verify:
                print(f"  {task_id}")
        sys.exit(0)

    if not tasks_to_verify:
        print("\nAll tasks are already verified!")
        sys.exit(0)

    # Run verifications
    print(f"\nVerifying {len(tasks_to_verify)} tasks (jobs={args.jobs})...\n")

    results = {"passed": [], "failed": []}

    with ThreadPoolExecutor(max_workers=args.jobs) as executor:
        futures = {}
        for task_dir, task_id, current_hash in tasks_to_verify:
            future = executor.submit(verify_task, task_dir, args.timeout, args.platform)
            futures[future] = (task_id, current_hash)

        for future in as_completed(futures):
            task_id, current_hash = futures[future]
            tid, success, message = future.result()

            if success:
                results["passed"].append(tid)
                checksums[tid] = current_hash
                # Save after each success so we don't lose progress
                save_checksums(repo_root, checksums)
                print(f"  PASS: {tid}")
            else:
                results["failed"].append({"task_id": tid, "message": message})
                print(f"  FAIL: {tid}")
                print(f"        {message[:200]}")

    # Summary
    print(f"\n{'='*60}")
    print(f"Passed: {len(results['passed'])}")
    print(f"Failed: {len(results['failed'])}")
    print(f"Previously verified: {len(tasks_already_verified)}")

    if args.json:
        print(json.dumps(results, indent=2))

    if results["failed"]:
        print("\nFailed tasks:")
        for item in results["failed"]:
            print(f"  {item['task_id']}: {item['message'][:100]}")
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()