From b571a3adc82f824e7222aa1060a172fd0d36ded1 Mon Sep 17 00:00:00 2001
From: Justin McLean <justin@classsoftware.com>
Date: Wed, 10 Jun 2026 13:30:15 +1000
Subject: [PATCH] feat(validator): add SOFT eval-coverage check (check #8)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every skill under skills/ must ship a matching behavioural eval suite
under tools/skill-evals/evals/<slug>/.  The new validate_eval_coverage
function surfaces missing suites as SOFT advisory violations so that
in-flight eval PRs do not fail the gate while their branches are pending
review.

Against the live repo the check correctly flags the two skills that
currently have in-flight eval branches (pr-management-quick-merge and
setup-status) and is silent on all others.  8 new test cases cover the
happy path, the missing-eval path, missing-both-dirs paths, the
soft-category membership, and the non-directory skip.

Addresses the Known Gap in specs/meta-and-quality-tooling.md:
"Eval coverage is incomplete — skills added before the per-skill-eval
convention have no suite."  The check prevents future regressions.

Generated-by: Claude (Opus 4.7)
---
 .../src/skill_and_tool_validator/__init__.py  | 48 ++++++++++-
 .../tests/test_validator.py                   | 83 +++++++++++++++++++
 2 files changed, 130 insertions(+), 1 deletion(-)
diff --git a/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py b/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py
index 1d12cbf0..4b617d71 100644
--- a/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py
+++ b/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py
@@ -17,7 +17,7 @@
 
 """Validate framework skill definitions.
 
-This module validates eight aspects of every skill under
+This module validates nine aspects of every skill under
 skills/:
 
 1. YAML frontmatter — every SKILL.md must have a valid frontmatter
@@ -49,6 +49,11 @@
    Apache Software Foundation license preamble.  Skill ``.md`` files
    declare their license via the required ``license:`` frontmatter key
    (checked by aspect 1), so they need no separate header.
+9. Eval-coverage (SOFT) — every skill directory under ``skills/``
+   must have a matching behavioural eval suite under
+   ``tools/skill-evals/evals/<slug>/``.  Missing suites are
+   advisories so in-flight eval PRs do not block the gate while
+   their branches are pending review.
 
 SOFT categories surface as advisory warnings (stderr) without
 failing the run unless ``--strict`` is passed.
@@ -74,6 +79,7 @@
 SKILLS_DIR = Path("skills")
 TOOLS_DIR = Path("tools")
 DOCS_DIR = Path("docs")
+SKILL_EVALS_DIR = Path("tools/skill-evals/evals")
 PROJECTS_TEMPLATE_DIR = Path("projects/_template")
 
 # Categories for the tool-validator block. Both HARD by default — every
@@ -89,6 +95,8 @@
 # with live skill frontmatter + tool README declarations.
 DOCS_LABELS_AND_CAPABILITIES = Path("docs/labels-and-capabilities.md")
 CAPABILITY_SYNC_CATEGORY = "capability-sync"
+# Eval-coverage check: every skill must have a matching eval suite.
+EVAL_COVERAGE_CATEGORY = "eval-coverage"
 _SKILL_TABLE_HEADER = "## Capability to skill map"
 _TOOL_TABLE_HEADER = "## Capability to tool map"
 # Tokens like `capability:setup`. Optional backticks around the token.
@@ -262,6 +270,7 @@ def _read_mode_table() -> dict[str, str]:
         GH_LIST_CATEGORY,
         PRIVACY_CATEGORY,
         LOWERCASE_F_FIELD_CATEGORY,
+        EVAL_COVERAGE_CATEGORY,
     }
 )
 HARD_CATEGORIES: frozenset[str] = frozenset(
@@ -1728,6 +1737,40 @@ def collect_doc_files(root: Path | None = None) -> set[Path]:
     return files
 
 
+# ---------------------------------------------------------------------------
+# Eval-coverage check (check #9, SOFT)
+# ---------------------------------------------------------------------------
+
+
+def validate_eval_coverage(root: Path | None = None) -> Iterable[Violation]:
+    """Warn when a skill directory has no matching eval suite.
+
+    Every skill under skills/ must have a behavioural eval suite under
+    tools/skill-evals/evals/<slug>/.  Missing suites surface as SOFT
+    advisories so in-flight eval PRs do not fail the gate while their
+    branches are pending review.
+    """
+    repo_root = root or find_repo_root()
+    skills_base = repo_root / SKILLS_DIR
+    evals_base = repo_root / SKILL_EVALS_DIR
+    if not skills_base.exists():
+        return
+    eval_slugs: set[str] = set()
+    if evals_base.exists():
+        eval_slugs = {p.name for p in evals_base.iterdir() if p.is_dir()}
+    for skill_dir in sorted(skills_base.iterdir()):
+        if not skill_dir.is_dir():
+            continue
+        slug = skill_dir.name
+        if slug not in eval_slugs:
+            yield Violation(
+                skill_dir / "SKILL.md",
+                None,
+                f"eval-coverage: no eval suite at tools/skill-evals/evals/{slug}/ — add one before shipping",
+                category=EVAL_COVERAGE_CATEGORY,
+            )
+
+
 def run_validation(root: Path | None = None) -> list[Violation]:
     """Run the full validation suite and return all violations."""
     repo_root = root or find_repo_root()
@@ -1774,6 +1817,9 @@ def run_validation(root: Path | None = None) -> list[Violation]:
     # Capability-sync check: the doc tables and the source must agree.
     violations.extend(validate_capability_sync(repo_root))
 
+    # Eval-coverage check: every skill must have a matching eval suite.
+    violations.extend(validate_eval_coverage(repo_root))
+
     return violations
 
 
diff --git a/tools/skill-and-tool-validator/tests/test_validator.py b/tools/skill-and-tool-validator/tests/test_validator.py
index 2e5430b3..3fda038f 100644
--- a/tools/skill-and-tool-validator/tests/test_validator.py
+++ b/tools/skill-and-tool-validator/tests/test_validator.py
@@ -30,6 +30,7 @@
     _PRIVACY_EXTERNAL_CONTENT_MODES,
     ALL_CATEGORIES,
     ALLOWED_MODES,
+    EVAL_COVERAGE_CATEGORY,
     FORBIDDEN_PATTERNS,
     GH_LIST_CATEGORY,
     HARD_CATEGORIES,
@@ -61,6 +62,7 @@
     run_validation,
     slugify,
     validate_capability_sync,
+    validate_eval_coverage,
     validate_frontmatter,
     validate_gh_list_limit,
     validate_injection_guard,
@@ -2399,3 +2401,84 @@ def test_italic_parens_annotation_is_stripped(self, tmp_path: Path) -> None:
         # The parenthetical capability:reconciliation must NOT be flagged as a doc-side declared capability;
         # the row's authoritative capability is just intake, which matches the live skill.
         assert violations == [], [v.message for v in violations]
+
+
+# ---------------------------------------------------------------------------
+# Eval-coverage check
+# ---------------------------------------------------------------------------
+
+
+class TestValidateEvalCoverage:
+    """Tests for validate_eval_coverage (check #9 — SOFT)."""
+
+    def _make_skill(self, root: Path, slug: str) -> None:
+        skill_dir = root / "skills" / slug
+        skill_dir.mkdir(parents=True, exist_ok=True)
+        (skill_dir / "SKILL.md").write_text(
+            f"---\nname: magpie-{slug}\ndescription: test\ncapability: capability:triage\nlicense: Apache-2.0\n---\n"
+        )
+
+    def _make_eval(self, root: Path, slug: str) -> None:
+        eval_dir = root / "tools" / "skill-evals" / "evals" / slug
+        eval_dir.mkdir(parents=True, exist_ok=True)
+        (eval_dir / "README.md").write_text(f"# {slug} evals\n")
+
+    def test_skill_with_matching_eval_passes(self, tmp_path: Path) -> None:
+        self._make_skill(tmp_path, "issue-triage")
+        self._make_eval(tmp_path, "issue-triage")
+        violations = list(validate_eval_coverage(tmp_path))
+        assert violations == []
+
+    def test_skill_without_eval_yields_soft_violation(self, tmp_path: Path) -> None:
+        self._make_skill(tmp_path, "new-skill")
+        # No matching eval directory.
+        violations = list(validate_eval_coverage(tmp_path))
+        assert len(violations) == 1
+        v = violations[0]
+        assert v.category == EVAL_COVERAGE_CATEGORY
+        assert "new-skill" in v.message
+        assert "tools/skill-evals/evals/new-skill/" in v.message
+
+    def test_multiple_skills_some_missing_evals(self, tmp_path: Path) -> None:
+        self._make_skill(tmp_path, "alpha")
+        self._make_skill(tmp_path, "beta")
+        self._make_skill(tmp_path, "gamma")
+        self._make_eval(tmp_path, "alpha")
+        # beta and gamma have no evals.
+        violations = list(validate_eval_coverage(tmp_path))
+        assert len(violations) == 2
+        slugs = {v.path.parent.name for v in violations}
+        assert slugs == {"beta", "gamma"}
+        assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations)
+
+    def test_no_skills_dir_returns_no_violations(self, tmp_path: Path) -> None:
+        # skills/ does not exist at all.
+        violations = list(validate_eval_coverage(tmp_path))
+        assert violations == []
+
+    def test_no_evals_dir_all_skills_flagged(self, tmp_path: Path) -> None:
+        self._make_skill(tmp_path, "alpha")
+        self._make_skill(tmp_path, "beta")
+        # tools/skill-evals/evals/ does not exist.
+        violations = list(validate_eval_coverage(tmp_path))
+        assert len(violations) == 2
+        assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations)
+
+    def test_eval_coverage_is_soft_category(self) -> None:
+        assert EVAL_COVERAGE_CATEGORY in SOFT_CATEGORIES
+        assert EVAL_COVERAGE_CATEGORY not in ALL_CATEGORIES - SOFT_CATEGORIES
+
+    def test_violation_path_points_to_skill_md(self, tmp_path: Path) -> None:
+        self._make_skill(tmp_path, "orphan")
+        violations = list(validate_eval_coverage(tmp_path))
+        assert len(violations) == 1
+        assert violations[0].path.name == "SKILL.md"
+        assert violations[0].path.parent.name == "orphan"
+
+    def test_non_directory_entries_in_skills_are_skipped(self, tmp_path: Path) -> None:
+        skills_dir = tmp_path / "skills"
+        skills_dir.mkdir(parents=True)
+        # A plain file (not a directory) must not be treated as a skill.
+        (skills_dir / "README.md").write_text("# skills\n")
+        violations = list(validate_eval_coverage(tmp_path))
+        assert violations == []