Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

"""Validate framework skill definitions.

This module validates eight aspects of every skill under
This module validates nine aspects of every skill under
skills/:

1. YAML frontmatter — every SKILL.md must have a valid frontmatter
Expand Down Expand Up @@ -49,6 +49,11 @@
Apache Software Foundation license preamble. Skill ``.md`` files
declare their license via the required ``license:`` frontmatter key
(checked by aspect 1), so they need no separate header.
9. Eval-coverage (SOFT) — every skill directory under ``skills/``
must have a matching behavioural eval suite under
``tools/skill-evals/evals/<slug>/``. Missing suites are
advisories so in-flight eval PRs do not block the gate while
their branches are pending review.

SOFT categories surface as advisory warnings (stderr) without
failing the run unless ``--strict`` is passed.
Expand All @@ -74,6 +79,7 @@
SKILLS_DIR = Path("skills")
TOOLS_DIR = Path("tools")
DOCS_DIR = Path("docs")
SKILL_EVALS_DIR = Path("tools/skill-evals/evals")
PROJECTS_TEMPLATE_DIR = Path("projects/_template")

# Categories for the tool-validator block. Both HARD by default — every
Expand All @@ -89,6 +95,8 @@
# with live skill frontmatter + tool README declarations.
DOCS_LABELS_AND_CAPABILITIES = Path("docs/labels-and-capabilities.md")
CAPABILITY_SYNC_CATEGORY = "capability-sync"
# Eval-coverage check: every skill must have a matching eval suite.
EVAL_COVERAGE_CATEGORY = "eval-coverage"
_SKILL_TABLE_HEADER = "## Capability to skill map"
_TOOL_TABLE_HEADER = "## Capability to tool map"
# Tokens like `capability:setup`. Optional backticks around the token.
Expand Down Expand Up @@ -262,6 +270,7 @@ def _read_mode_table() -> dict[str, str]:
GH_LIST_CATEGORY,
PRIVACY_CATEGORY,
LOWERCASE_F_FIELD_CATEGORY,
EVAL_COVERAGE_CATEGORY,
}
)
HARD_CATEGORIES: frozenset[str] = frozenset(
Expand Down Expand Up @@ -1728,6 +1737,40 @@ def collect_doc_files(root: Path | None = None) -> set[Path]:
return files


# ---------------------------------------------------------------------------
# Eval-coverage check (check #9, SOFT)
# ---------------------------------------------------------------------------


def validate_eval_coverage(root: Path | None = None) -> Iterable[Violation]:
"""Warn when a skill directory has no matching eval suite.

Every skill under skills/ must have a behavioural eval suite under
tools/skill-evals/evals/<slug>/. Missing suites surface as SOFT
advisories so in-flight eval PRs do not fail the gate while their
branches are pending review.
"""
repo_root = root or find_repo_root()
skills_base = repo_root / SKILLS_DIR
evals_base = repo_root / SKILL_EVALS_DIR
if not skills_base.exists():
return
eval_slugs: set[str] = set()
if evals_base.exists():
eval_slugs = {p.name for p in evals_base.iterdir() if p.is_dir()}
for skill_dir in sorted(skills_base.iterdir()):
if not skill_dir.is_dir():
continue
slug = skill_dir.name
if slug not in eval_slugs:
yield Violation(
skill_dir / "SKILL.md",
None,
f"eval-coverage: no eval suite at tools/skill-evals/evals/{slug}/ — add one before shipping",
category=EVAL_COVERAGE_CATEGORY,
)


def run_validation(root: Path | None = None) -> list[Violation]:
"""Run the full validation suite and return all violations."""
repo_root = root or find_repo_root()
Expand Down Expand Up @@ -1774,6 +1817,9 @@ def run_validation(root: Path | None = None) -> list[Violation]:
# Capability-sync check: the doc tables and the source must agree.
violations.extend(validate_capability_sync(repo_root))

# Eval-coverage check: every skill must have a matching eval suite.
violations.extend(validate_eval_coverage(repo_root))

return violations


Expand Down
83 changes: 83 additions & 0 deletions tools/skill-and-tool-validator/tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
_PRIVACY_EXTERNAL_CONTENT_MODES,
ALL_CATEGORIES,
ALLOWED_MODES,
EVAL_COVERAGE_CATEGORY,
FORBIDDEN_PATTERNS,
GH_LIST_CATEGORY,
HARD_CATEGORIES,
Expand Down Expand Up @@ -61,6 +62,7 @@
run_validation,
slugify,
validate_capability_sync,
validate_eval_coverage,
validate_frontmatter,
validate_gh_list_limit,
validate_injection_guard,
Expand Down Expand Up @@ -2399,3 +2401,84 @@ def test_italic_parens_annotation_is_stripped(self, tmp_path: Path) -> None:
# The parenthetical capability:reconciliation must NOT be flagged as a doc-side declared capability;
# the row's authoritative capability is just intake, which matches the live skill.
assert violations == [], [v.message for v in violations]


# ---------------------------------------------------------------------------
# Eval-coverage check
# ---------------------------------------------------------------------------


class TestValidateEvalCoverage:
"""Tests for validate_eval_coverage (check #9 — SOFT)."""

def _make_skill(self, root: Path, slug: str) -> None:
skill_dir = root / "skills" / slug
skill_dir.mkdir(parents=True, exist_ok=True)
(skill_dir / "SKILL.md").write_text(
f"---\nname: magpie-{slug}\ndescription: test\ncapability: capability:triage\nlicense: Apache-2.0\n---\n"
)

def _make_eval(self, root: Path, slug: str) -> None:
eval_dir = root / "tools" / "skill-evals" / "evals" / slug
eval_dir.mkdir(parents=True, exist_ok=True)
(eval_dir / "README.md").write_text(f"# {slug} evals\n")

def test_skill_with_matching_eval_passes(self, tmp_path: Path) -> None:
self._make_skill(tmp_path, "issue-triage")
self._make_eval(tmp_path, "issue-triage")
violations = list(validate_eval_coverage(tmp_path))
assert violations == []

def test_skill_without_eval_yields_soft_violation(self, tmp_path: Path) -> None:
self._make_skill(tmp_path, "new-skill")
# No matching eval directory.
violations = list(validate_eval_coverage(tmp_path))
assert len(violations) == 1
v = violations[0]
assert v.category == EVAL_COVERAGE_CATEGORY
assert "new-skill" in v.message
assert "tools/skill-evals/evals/new-skill/" in v.message

def test_multiple_skills_some_missing_evals(self, tmp_path: Path) -> None:
self._make_skill(tmp_path, "alpha")
self._make_skill(tmp_path, "beta")
self._make_skill(tmp_path, "gamma")
self._make_eval(tmp_path, "alpha")
# beta and gamma have no evals.
violations = list(validate_eval_coverage(tmp_path))
assert len(violations) == 2
slugs = {v.path.parent.name for v in violations}
assert slugs == {"beta", "gamma"}
assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations)

def test_no_skills_dir_returns_no_violations(self, tmp_path: Path) -> None:
# skills/ does not exist at all.
violations = list(validate_eval_coverage(tmp_path))
assert violations == []

def test_no_evals_dir_all_skills_flagged(self, tmp_path: Path) -> None:
self._make_skill(tmp_path, "alpha")
self._make_skill(tmp_path, "beta")
# tools/skill-evals/evals/ does not exist.
violations = list(validate_eval_coverage(tmp_path))
assert len(violations) == 2
assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations)

def test_eval_coverage_is_soft_category(self) -> None:
assert EVAL_COVERAGE_CATEGORY in SOFT_CATEGORIES
assert EVAL_COVERAGE_CATEGORY not in ALL_CATEGORIES - SOFT_CATEGORIES

def test_violation_path_points_to_skill_md(self, tmp_path: Path) -> None:
self._make_skill(tmp_path, "orphan")
violations = list(validate_eval_coverage(tmp_path))
assert len(violations) == 1
assert violations[0].path.name == "SKILL.md"
assert violations[0].path.parent.name == "orphan"

def test_non_directory_entries_in_skills_are_skipped(self, tmp_path: Path) -> None:
skills_dir = tmp_path / "skills"
skills_dir.mkdir(parents=True)
# A plain file (not a directory) must not be treated as a skill.
(skills_dir / "README.md").write_text("# skills\n")
violations = list(validate_eval_coverage(tmp_path))
assert violations == []
Loading