Skip to content

Commit 4ad7969

Browse files
salmad3 and claude committed
feat: spec-informed prose knowledge extraction
Uses the OpenAPI spec as an entity vocabulary to resolve prose references. When prose says "do not call POST /payments when using subscriptions," the extractor resolves both entities against the spec's endpoint and tag lists, producing a typed conflicts_with edge between payments and subscriptions skills.

Also captures sections and headings as blocks in document metadata extraction so plain .md content is visible to downstream extractors.

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 62e4e19 commit 4ad7969

File tree

6 files changed

+372
-17
lines changed

6 files changed

+372
-17
lines changed

packages/agent-metadata/src/extractors/document.test.ts

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -118,9 +118,13 @@ describe('extractDocumentMetadata', () => {
118118

119119
const doc = makeDoc({ children: [section] });
120120
const result = extractDocumentMetadata(doc);
121-
expect(result.blocks).toHaveLength(1);
122-
expect(result.blocks[0]!.codeBlocks).toHaveLength(1);
123-
expect(result.blocks[0]!.codeBlocks[0]!.lang).toBe('bash');
121+
// The section produces a block (annotated) and its heading produces
122+
// another (all headings qualify). The annotated section block carries
123+
// the code blocks.
124+
const annotatedBlock = result.blocks.find((b) => b.codeBlocks.length > 0);
125+
expect(annotatedBlock).toBeDefined();
126+
expect(annotatedBlock!.codeBlocks).toHaveLength(1);
127+
expect(annotatedBlock!.codeBlocks[0]!.lang).toBe('bash');
124128
});
125129

126130
it('counts words across all blocks', () => {

packages/agent-metadata/src/extractors/document.ts

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -95,30 +95,37 @@ export function extractDocumentMetadata(doc: DocumentNode): DocumentMetadata {
9595
collectInlineRefs(node.children, doc.filePath, blockId);
9696
}
9797

98-
// Extract annotated blocks as discrete metadata units
98+
// Extract blocks as discrete metadata units.
99+
// Annotated blocks always qualify. Headings and sections qualify
100+
// unconditionally so that plain .md content is visible to downstream
101+
// extractors (prose knowledge, relationship detection).
99102
const annotations = 'annotations' in node ? node.annotations : undefined;
100-
if (annotations && Object.keys(annotations).length > 0) {
103+
const hasAnnotations = annotations && Object.keys(annotations).length > 0;
104+
const isSection = node.type === 'section';
105+
const isHeading = node.type === 'heading';
106+
107+
if (hasAnnotations || isSection || isHeading) {
101108
const textContent = extractBlockText(node);
102109
const headingText =
103-
node.type === 'heading'
110+
isHeading
104111
? extractInlineText(node.children)
105-
: node.type === 'section'
112+
: isSection
106113
? extractInlineText(node.heading.children)
107114
: undefined;
108115
const headingDepth =
109-
node.type === 'heading'
116+
isHeading
110117
? node.depth
111-
: node.type === 'section'
118+
: isSection
112119
? node.heading.depth
113120
: undefined;
114121

115122
blocks.push({
116-
id: annotations.id ?? blockId,
117-
type: annotations.type ?? node.type,
123+
id: (annotations?.id) ?? blockId,
124+
type: (annotations?.type) ?? node.type,
118125
headingText,
119126
headingDepth,
120127
textContent,
121-
annotations,
128+
annotations: annotations ?? {},
122129
codeBlocks: collectCodeBlocks(node),
123130
sourcePath: doc.filePath,
124131
sourceLocation: node.location

0 commit comments

Comments
 (0)