From fa0943d3d7f8df360f2493072f16036d0fa351a6 Mon Sep 17 00:00:00 2001 From: duguwanglong Date: Fri, 24 Apr 2026 16:14:36 +0800 Subject: [PATCH] fix(tool/skill): cap description preview at 500 chars and load full SKILL.md on demand MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `skill` tool exposes two surfaces, both of which were silently dropping critical content: 1. The tool's meta-description (shipped in every system prompt) listed each skill's full frontmatter description (up to 1024 chars). Long descriptions bloated the prompt and the model often acted on a partial preview without loading the full file. 2. When the model called `skill(name=...)`, ToolRegistry's auto-truncate path silently cropped the returned SKILL.md at 10 KB / 200 lines (head-only), dropping the workflow steps, references, and constraints that authors typically place at the end of the file. Users reported "skill.md 后面的 感觉就完全丢失了". This change implements progressive disclosure (mirroring hermes-agent's `skills_list` / `skill_view` split): - `build_description()` now caps each skill's preview at MAX_SKILL_DESCRIPTION_PREVIEW_CHARS (500) using head + tail truncation, preserving both the opening (scope/triggers) and the closing (hard constraints like "必须先加载本 skill"). Adds an explicit instruction telling the model the previews are summaries and it MUST call `skill(name=...)` to load the full SKILL.md before acting. - `skill_tool_impl()` sets `truncated=True` on its `ToolResult` to opt out of the registry's auto-truncate pass, so the full SKILL.md (verified end- to-end with 18.8 KB / 500 lines `agent-browser`) reaches the model intact. Also emits a `skill.load.full_content` log event for auditability. Empirical impact on the 12 existing skills: - System prompt index: 4.6 KB total (only `tool-builder` 614→500 is trimmed) - Load on demand: previously 8/12 skills lost their tail; now all 12 are delivered in full Tests: 15 new in `tests/tool/test_skill_tool_description.py` covering truncation algorithm, build_description output, and end-to-end no-truncation through `ToolRegistry.execute()`. Made-with: Cursor --- flocks/tool/system/skill.py | 97 +++++++- tests/tool/test_skill_tool_description.py | 271 ++++++++++++++++++++++ 2 files changed, 359 insertions(+), 9 deletions(-) create mode 100644 tests/tool/test_skill_tool_description.py diff --git a/flocks/tool/system/skill.py b/flocks/tool/system/skill.py index 5ab4c55c1..f61f362c8 100644 --- a/flocks/tool/system/skill.py +++ b/flocks/tool/system/skill.py @@ -19,29 +19,90 @@ log = Log.create(service="tool.skill") +# Maximum characters of a skill's description shown in the `skill` tool's +# meta-description (the tool index that ships with the system prompt). +# +# Why a limit at all? +# The `skill` tool's description is injected into every LLM call as part of +# the tool schema. Listing the full SKILL.md frontmatter description (allowed +# up to 1024 chars by `Skill._is_valid_description`) for every skill makes +# the prompt grow linearly with the number of skills — and most of that text +# is "how to use" detail that the model only needs *after* it decides to +# load the skill. +# +# Why 500? +# Empirically, the descriptions in `flocks/.flocks/plugins/skills/*/SKILL.md` +# cluster between 60 and 614 characters; 500 chars preserves ~96% of the +# total content (only one outlier needs trimming) while keeping the worst- +# case cost of the index bounded. Critically, threat-intel/EDR skills tend +# to put their hard constraints ("must load this skill before any X tool") +# at the *end* of the description, so we keep both head and tail. +MAX_SKILL_DESCRIPTION_PREVIEW_CHARS = 500 + + +def _truncate_skill_description(description: str, name: str) -> str: + """ + Cap a single skill's description at MAX_SKILL_DESCRIPTION_PREVIEW_CHARS. + + Uses head + tail truncation so both the opening (scope/triggers) and the + closing (hard constraints, "must load first") survive. Inserts a marker + that tells the model how to fetch the full content via the `skill` tool. + """ + if len(description) <= MAX_SKILL_DESCRIPTION_PREVIEW_CHARS: + return description + + marker = f' … [truncated; load full SKILL.md via skill(name="{name}") before acting] … ' + available = MAX_SKILL_DESCRIPTION_PREVIEW_CHARS - len(marker) + if available < 80: + # Marker alone is unusually long (very long skill name); fall back to + # plain head truncation so we still emit something useful. + return description[: MAX_SKILL_DESCRIPTION_PREVIEW_CHARS - 1] + "…" + + head_size = (available * 3) // 5 # ~60% head + tail_size = available - head_size + return description[:head_size] + marker + description[-tail_size:] + + def build_description(skills: List[SkillInfo]) -> str: - """Build tool description with available skills""" + """Build tool description with available skills. + + Each skill's description is capped at MAX_SKILL_DESCRIPTION_PREVIEW_CHARS + (head + tail). The model is instructed to call `skill(name=...)` to + obtain the full SKILL.md when it decides to act on a skill. + """ if not skills: return "Load a skill to get detailed instructions for a specific task. No skills are currently available." - + # Match Flocks's format: space-separated, no newlines parts = [ "Load a skill to get detailed instructions for a specific task.", "Skills provide specialized knowledge and step-by-step guidance.", "Use this when a task matches an available skill's description.", + # Strong, explicit guidance: the descriptions below are PREVIEWS only. + # The model must call this tool to get the full SKILL.md before + # actually executing the skill's workflow. + ( + "IMPORTANT: each below is a preview that may be " + f"truncated to {MAX_SKILL_DESCRIPTION_PREVIEW_CHARS} chars. " + "It is enough to decide WHETHER a skill applies, but NOT enough " + "to execute it. Once you pick a skill, you MUST call " + "skill(name=\"\") to load the full SKILL.md before " + "running its steps or calling any tool the skill governs." + ), "", ] - + for skill in skills: + preview = _truncate_skill_description(skill.description, skill.name) parts.extend([ " ", f" {skill.name}", - f" {skill.description}", + f" {preview}", " ", ]) - + parts.append("") - + # Join with space like Flocks does: .join(" ") return " ".join(parts) @@ -99,21 +160,39 @@ async def skill_tool_impl( # Get base directory skill_dir = os.path.dirname(location) - + # Format output output = f"""## Skill: {skill.name} **Base directory**: {skill_dir} {content.strip()}""" - + + # ``truncated=True`` here is intentional: it tells ToolRegistry's + # auto-truncate path (registry.py: "Auto-truncate output unless the tool + # already handled it") to leave our payload alone. The `skill` tool is the + # *load-on-demand* counterpart of the tiny preview that ships in the system + # prompt -- if the model just decided to load this skill, it needs the + # FULL SKILL.md to act on. Cropping it at 10 KB / 200 lines (the + # registry's defaults) silently drops the workflow steps, references, and + # constraints that authors typically place at the *end* of the file, which + # is the exact bug users were hitting (skill.md tail "感觉就完全丢失了"). + # Mirrors hermes-agent's `skill_view`, which also returns content in full. + log.info("skill.load.full_content", { + "name": skill.name, + "bytes": len(output.encode("utf-8")), + "lines": output.count("\n") + 1, + }) + return ToolResult( success=True, output=output, title=f"Loaded skill: {skill.name}", + truncated=True, metadata={ "name": skill.name, - "dir": skill_dir + "dir": skill_dir, + "auto_truncate_bypassed": True, } ) diff --git a/tests/tool/test_skill_tool_description.py b/tests/tool/test_skill_tool_description.py new file mode 100644 index 000000000..a1d874a9a --- /dev/null +++ b/tests/tool/test_skill_tool_description.py @@ -0,0 +1,271 @@ +"""Tests for flocks.tool.system.skill. + +Two complementary aspects of the skill tool's progressive-disclosure design +are exercised here: + +1. ``build_description`` — the meta-description that ships in every system + prompt. Each skill's description is capped at + ``MAX_SKILL_DESCRIPTION_PREVIEW_CHARS`` using head + tail truncation, so + both the opening (scope/triggers) and the closing (hard constraints, "must + load this skill before X") survive. The model is explicitly told to call + ``skill(name=...)`` to load the full SKILL.md before acting. + +2. ``skill_tool`` (load on demand) — when the model actually calls the tool, + the FULL SKILL.md must come back unredacted. This is the load-on-demand + counterpart of the truncated preview, mirroring hermes-agent's + ``skill_view``. Without the explicit opt-out the registry would silently + crop SKILL.md at 10 KB / 200 lines (head-only), dropping the workflow + steps and references that authors put at the file's tail. +""" + +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from flocks.skill.skill import Skill, SkillInfo +from flocks.tool.registry import ToolContext, ToolRegistry +from flocks.tool.system.skill import ( + MAX_SKILL_DESCRIPTION_PREVIEW_CHARS, + _truncate_skill_description, + build_description, + skill_tool_impl, +) + + +def _skill(name: str, description: str) -> SkillInfo: + return SkillInfo(name=name, description=description, location=f"/tmp/{name}/SKILL.md") + + +def _make_ctx() -> ToolContext: + ctx = MagicMock(spec=ToolContext) + ctx.ask = AsyncMock(return_value=None) + ctx.metadata = MagicMock() + ctx.aborted = False + ctx.extra = {} + ctx.agent = "rex" + ctx.session_id = "ses_test_skill" + return ctx + + +# --------------------------------------------------------------------------- +# _truncate_skill_description +# --------------------------------------------------------------------------- + + +class TestTruncateSkillDescription: + def test_short_description_is_unchanged(self): + desc = "Short skill description in two sentences. Use it for X." + assert _truncate_skill_description(desc, "x-skill") == desc + + def test_description_at_cap_is_unchanged(self): + desc = "a" * MAX_SKILL_DESCRIPTION_PREVIEW_CHARS + assert _truncate_skill_description(desc, "x-skill") == desc + + def test_long_description_is_truncated_under_cap(self): + desc = "a" * 1000 + out = _truncate_skill_description(desc, "x-skill") + assert len(out) <= MAX_SKILL_DESCRIPTION_PREVIEW_CHARS + + def test_truncation_keeps_head_and_tail(self): + head = "HEAD_MARKER " + ("a" * 600) + tail = ("z" * 200) + " TAIL_MARKER" + desc = head + tail + out = _truncate_skill_description(desc, "demo") + assert out.startswith("HEAD_MARKER") + assert out.endswith("TAIL_MARKER") + + def test_truncation_inserts_skill_load_hint(self): + desc = "a" * 2000 + out = _truncate_skill_description(desc, "onesec-use") + # The truncation marker must point the model at the right tool call, + # otherwise progressive disclosure breaks. + assert 'skill(name="onesec-use")' in out + assert "truncated" in out + + def test_chinese_threat_intel_skill_keeps_trailing_constraint(self): + # Real-world shape: scope ... ability list ... HARD CONSTRAINT at end. + # The trailing "必须先加载本 skill" is the most decision-critical part + # for these skills. + head = "用于处理 OneSEC 终端安全平台相关任务," + (",能力列表" * 200) + tail = "本 skill 是 OneSEC 平台操作的唯一决策入口:必须先加载本 skill。" + desc = head + tail + out = _truncate_skill_description(desc, "onesec-use") + assert len(out) <= MAX_SKILL_DESCRIPTION_PREVIEW_CHARS + assert out.startswith("用于处理 OneSEC 终端安全平台相关任务") + assert out.endswith("必须先加载本 skill。") + + +# --------------------------------------------------------------------------- +# build_description +# --------------------------------------------------------------------------- + + +class TestBuildDescription: + def test_empty_skills_returns_no_skills_message(self): + out = build_description([]) + assert "No skills are currently available" in out + assert "" not in out + + def test_each_skill_emits_xml_block(self): + out = build_description( + [ + _skill("alpha", "First skill description."), + _skill("beta", "Second skill description."), + ] + ) + assert "" in out + assert "" in out + assert "alpha" in out + assert "beta" in out + assert "First skill description." in out + assert "Second skill description." in out + + def test_includes_progressive_disclosure_instruction(self): + out = build_description([_skill("alpha", "demo")]) + # Must direct the model at the load-on-demand tool call pattern, + # otherwise the model will try to act on the (possibly truncated) + # preview without first reading the full SKILL.md. + assert "skill(name=" in out + assert "preview" in out.lower() or "truncated" in out.lower() + assert "MUST" in out or "must" in out.lower() + + def test_long_description_is_truncated_in_output(self): + long_desc = "a" * 2000 + out = build_description([_skill("big", long_desc)]) + # The full 2000-char string should never appear verbatim because it + # exceeds the per-skill preview cap. + assert long_desc not in out + assert 'skill(name="big")' in out + + def test_short_descriptions_are_emitted_verbatim(self): + desc = "Short, single-sentence description that fits." + out = build_description([_skill("tiny", desc)]) + # No truncation marker should appear for descriptions under the cap. + assert desc in out + assert "truncated" not in out.split("")[1] + + def test_api_surface_returns_string(self): + out = build_description([_skill("alpha", "desc")]) + assert isinstance(out, str) + assert len(out) > 0 + + +# --------------------------------------------------------------------------- +# skill_tool_impl: load-on-demand must NOT truncate the SKILL.md content +# --------------------------------------------------------------------------- + + +@pytest.fixture +def fake_skill_dir(): + """Create a temp directory with a SKILL.md that exceeds default truncation + limits (>10 KB AND >200 lines) so we can prove the registry's auto-truncate + pass leaves it alone. + """ + with tempfile.TemporaryDirectory() as tmpdir: + skill_dir = Path(tmpdir) / "huge-skill" + skill_dir.mkdir() + + body_lines = [ + f"## Step {i}: do something important and clearly identifiable" for i in range(1, 401) + ] + # Tail sentinel proves we kept the END of the file (where authors + # typically put the operational steps and references). + body_lines.append("# TAIL_SENTINEL: this line MUST survive end-to-end") + body = "\n".join(body_lines) + # Pad with extra bytes so total comfortably exceeds the 10 KB cap. + padding = "x" * (15 * 1024) + + (skill_dir / "SKILL.md").write_text( + f"""--- +name: huge-skill +description: Test skill that exceeds default truncation limits. +--- + +# Huge Skill + +{padding} + +{body} +""", + encoding="utf-8", + ) + yield skill_dir + + +class TestSkillLoadNoTruncation: + @pytest.mark.asyncio + async def test_skill_tool_returns_full_content(self, fake_skill_dir): + """skill_tool_impl must return the entire SKILL.md content, including + the tail, regardless of file size.""" + skill_md = fake_skill_dir / "SKILL.md" + original = skill_md.read_text(encoding="utf-8") + # Sanity-check the fixture: it MUST exceed the registry defaults, + # otherwise the test isn't actually exercising the bypass. + assert len(original.encode("utf-8")) > 10 * 1024 + assert original.count("\n") > 200 + + skill_info = SkillInfo( + name="huge-skill", + description="Test skill", + location=str(skill_md), + ) + + with patch.object(Skill, "get", AsyncMock(return_value=skill_info)): + result = await skill_tool_impl(_make_ctx(), name="huge-skill") + + assert result.success is True + assert "TAIL_SENTINEL" in result.output, ( + "tail of SKILL.md was dropped — load-on-demand must deliver " + "the full file, not the head-truncated version" + ) + # The content should appear verbatim within the formatted output. + assert original.strip() in result.output + + @pytest.mark.asyncio + async def test_skill_tool_sets_truncated_flag_to_bypass_registry(self, fake_skill_dir): + """skill_tool_impl must set ``truncated=True`` so ToolRegistry's + auto-truncate pass (registry.py: 'unless the tool already handled it') + leaves our payload alone.""" + skill_info = SkillInfo( + name="huge-skill", + description="Test skill", + location=str(fake_skill_dir / "SKILL.md"), + ) + + with patch.object(Skill, "get", AsyncMock(return_value=skill_info)): + result = await skill_tool_impl(_make_ctx(), name="huge-skill") + + assert result.truncated is True + assert result.metadata.get("auto_truncate_bypassed") is True + + @pytest.mark.asyncio + async def test_registry_execute_does_not_truncate_skill_output(self, fake_skill_dir): + """End-to-end: when the `skill` tool is executed via ToolRegistry + (which is what the real session loop does), the auto-truncate path + must NOT fire and the tail must survive.""" + skill_info = SkillInfo( + name="huge-skill", + description="Test skill", + location=str(fake_skill_dir / "SKILL.md"), + ) + + skill_tool = ToolRegistry.get("skill") + assert skill_tool is not None, "skill tool must be registered" + + with patch.object(Skill, "all", AsyncMock(return_value=[skill_info])), \ + patch.object(Skill, "get", AsyncMock(return_value=skill_info)): + result = await skill_tool.execute(_make_ctx(), name="huge-skill") + + assert result.success is True + assert result.truncated is True # explicit bypass, not registry-imposed + assert "TAIL_SENTINEL" in result.output + # The registry's truncation hint must NOT appear — that would mean + # the bypass failed. + assert "Use Grep to search the full content" not in result.output + assert "lines truncated" not in result.output + assert "bytes truncated" not in result.output