From d4843959585a1a43248c20dab0364ae65b6d805c Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Mon, 27 Apr 2026 22:36:27 +0200
Subject: [PATCH] feat: oracle delta + grouping + honest "Automated review"
 header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Why
spar#175 (the docs PR) got an oracle review with **91 findings, every one
pre-existing**, with the *same message* repeated 50+ times. The user's
reaction was "not sure if i like it, unclear what to do with it" — the
right reaction. A PR review should answer "did this PR make things worse",
not "list every diagnostic in the project".

Three independent changes, shipped together because they're all the same
"make the review legible" theme:

## 1. Delta filter (`subtractFindings`)
Run `rivet validate` at HEAD AND at BASE. Surface only findings present in
HEAD but not in BASE. Pre-existing diagnostics get filtered. On a docs PR
this drops 91 findings to 0 — which is the correct answer.

Cost: one extra tarball fetch + extract + validate (~5–15 s). Acceptable.

## 2. Grouping (`groupOracleFindings`)
Findings sharing the same `(source, severity, message)` collapse into one
finding listing all affected `artifact_ids`:

  Before: 50 lines of "REQ-XXX: every requirement should be satisfied by …"
  After:  1 line: "Every requirement should be satisfied by … —
                  affecting: REQ-INST-003, REQ-TRANSFORM-002, … (+45 more)"

`message` is now stored separately on findings (was inlined into `claim`).
`maxIdsShown` defaults to 10, then "… (+N more)".

## 3. Honest header rename
"AI Code Review" was misleading on PRs where the AI contributed nothing
beyond the summary line (most PRs to date). Now:

- Header: "## Automated review for PR #N"
- HTML marker `<!-- temper-automated-review -->` for supersede detection
  (independent of visible header text)
- Footer breakdown: "_Findings: X mechanical (rivet) · Y from local AI model._"
- supersedePreviousReviews matches BOTH the new marker AND the legacy
  "AI Code Review" string so old comments still get marked outdated.

## Test plan
- [x] All 788 tests pass (was 773 — added 15 covering subtract / group /
      header rename / breakdown footer / grouped finding render)
- [x] eslint clean
- [ ] After deploy: `/review-pr` on spar#175 (or any non-trivial PR) should
      now show: (a) a much shorter findings list (delta only), (b) grouped
      messages, (c) the new "Automated review" header with breakdown.
- [ ] On a docs-only PR the oracle should report 0 new findings → comment
      either skipped (verdict: approve, findings: []) or shows just the
      model summary.

## Risk & rollout
- Risk: medium. Three behaviour changes in one release. Each is opt-in
  via existing config (rivet_oracle.enabled), and the verdict-from-findings
  pipeline gracefully handles 0 findings (silence > slop).
- Rollout: self-update on merge.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 __tests__/unit/ai-review-prompt.test.js |  52 +++++++++++-
 __tests__/unit/rivet-oracle.test.js     | 107 +++++++++++++++++++++++-
 src/ai-review-prompt.js                 |  23 ++++-
 src/ai-review.js                        |  60 +++++++++++--
 src/rivet-oracle.js                     |  97 ++++++++++++++++++++-
 5 files changed, 327 insertions(+), 12 deletions(-)

diff --git a/__tests__/unit/ai-review-prompt.test.js b/__tests__/unit/ai-review-prompt.test.js
index 727c338..0d5ee44 100644
--- a/__tests__/unit/ai-review-prompt.test.js
+++ b/__tests__/unit/ai-review-prompt.test.js
@@ -278,7 +278,7 @@ describe('renderReviewMarkdown', () => {
       },
       42, 'abc1234', meta
     );
-    expect(out).toContain('AI Code Review for PR #42');
+    expect(out).toContain('Automated review for PR #42');
     expect(out).toContain('💬 Comment');
     expect(out).toContain('Adds X.');
     expect(out).toContain('`a.js:5`');
@@ -300,6 +300,56 @@ describe('renderReviewMarkdown', () => {
     expect(out).not.toContain('✅ Approve');
   });
 
+  it('renders the new "Automated review" header with the HTML supersede marker', () => {
+    const out = renderReviewMarkdown(
+      {
+        verdict: 'comment',
+        summary: 'x',
+        findings: [{ file: 'a.js', line: 1, quoted_line: '+x', claim: 'no test for x at a.js:1' }]
+      },
+      99, 'def5678', meta
+    );
+    expect(out).toContain('<!-- temper-automated-review -->');
+    expect(out).toContain('## Automated review for PR #99');
+    expect(out).not.toContain('## AI Code Review');
+  });
+
+  it('shows finding-source breakdown footer (mechanical vs AI)', () => {
+    const out = renderReviewMarkdown(
+      {
+        verdict: 'comment',
+        summary: 'mixed.',
+        findings: [
+          { source: 'oracle:rivet-validate', severity: 'warning', artifact_id: 'REQ-001', claim: 'REQ-001: foo' },
+          { file: 'a.js', line: 1, quoted_line: '+x', claim: 'no test for x at a.js:1' }
+        ]
+      },
+      99, 'def5678', meta
+    );
+    expect(out).toMatch(/_Findings: 1 mechanical \(rivet\) · 1 from local AI model\._/);
+  });
+
+  it('renders grouped oracle finding using artifact_ids array', () => {
+    const out = renderReviewMarkdown(
+      {
+        verdict: 'comment',
+        summary: 'group.',
+        findings: [
+          {
+            source: 'oracle:rivet-validate',
+            severity: 'warning',
+            artifact_id: '3 artifacts',
+            artifact_ids: ['REQ-1', 'REQ-2', 'REQ-3'],
+            claim: 'no downstream — affecting: REQ-1, REQ-2, REQ-3'
+          }
+        ]
+      },
+      99, 'def5678', meta
+    );
+    expect(out).toContain('🟡 `3 artifacts`');
+    expect(out).toContain('affecting: REQ-1, REQ-2, REQ-3');
+  });
+
   it('renders oracle findings with severity badge and artifact_id (not file:line)', () => {
     const out = renderReviewMarkdown(
       {
diff --git a/__tests__/unit/rivet-oracle.test.js b/__tests__/unit/rivet-oracle.test.js
index 70efcd7..89b2120 100644
--- a/__tests__/unit/rivet-oracle.test.js
+++ b/__tests__/unit/rivet-oracle.test.js
@@ -10,7 +10,10 @@ import {
   isRivetProject,
   runRivetValidate,
   runRivetImpact,
-  runRivetOracle
+  runRivetOracle,
+  subtractFindings,
+  groupOracleFindings,
+  findingKey
 } from '../../src/rivet-oracle.js';
 
 function makeRunnerOk(stdout) {
@@ -201,6 +204,108 @@ describe('runRivetImpact', () => {
   });
 });
 
+describe('findingKey', () => {
+  it('combines source, artifact_id, and message', () => {
+    const f = {
+      source: 'oracle:rivet-validate',
+      artifact_id: 'REQ-001',
+      message: 'missing field'
+    };
+    expect(findingKey(f)).toBe('oracle:rivet-validate|REQ-001|missing field');
+  });
+
+  it('falls back to claim when message is absent', () => {
+    const f = { source: 'oracle:rivet-validate', artifact_id: 'X', claim: 'X: foo' };
+    expect(findingKey(f)).toContain('X: foo');
+  });
+
+  it('returns empty string for nullish input', () => {
+    expect(findingKey(null)).toBe('');
+    expect(findingKey(undefined)).toBe('');
+  });
+});
+
+describe('subtractFindings', () => {
+  const base = [
+    { source: 'oracle:rivet-validate', artifact_id: 'REQ-001', message: 'no downstream', severity: 'warning' },
+    { source: 'oracle:rivet-validate', artifact_id: 'REQ-002', message: 'no downstream', severity: 'warning' }
+  ];
+
+  it('drops findings already present at base', () => {
+    const head = [...base, { source: 'oracle:rivet-validate', artifact_id: 'REQ-003', message: 'no downstream', severity: 'warning' }];
+    const delta = subtractFindings(head, base);
+    expect(delta).toHaveLength(1);
+    expect(delta[0].artifact_id).toBe('REQ-003');
+  });
+
+  it('keeps everything when base is empty', () => {
+    expect(subtractFindings(base, [])).toEqual(base);
+    expect(subtractFindings(base, undefined)).toEqual(base);
+  });
+
+  it('returns [] when head ⊆ base (no new findings = correct silence)', () => {
+    expect(subtractFindings(base, base)).toEqual([]);
+  });
+
+  it('treats same artifact + same message but different severity as the same finding', () => {
+    const head = [{ source: 'oracle:rivet-validate', artifact_id: 'REQ-001', message: 'no downstream', severity: 'error' }];
+    expect(subtractFindings(head, base)).toEqual([]);
+  });
+});
+
+describe('groupOracleFindings', () => {
+  it('collapses identical messages into one finding with artifact_ids list', () => {
+    const findings = [
+      { source: 'oracle:rivet-validate', severity: 'warning', artifact_id: 'REQ-001', message: 'no downstream' },
+      { source: 'oracle:rivet-validate', severity: 'warning', artifact_id: 'REQ-002', message: 'no downstream' },
+      { source: 'oracle:rivet-validate', severity: 'warning', artifact_id: 'REQ-003', message: 'no downstream' }
+    ];
+    const grouped = groupOracleFindings(findings);
+    expect(grouped).toHaveLength(1);
+    expect(grouped[0].artifact_ids).toEqual(['REQ-001', 'REQ-002', 'REQ-003']);
+    expect(grouped[0].claim).toContain('REQ-001');
+    expect(grouped[0].claim).toContain('REQ-002');
+    expect(grouped[0].artifact_id).toBe('3 artifacts');
+  });
+
+  it('keeps singletons in their original shape', () => {
+    const findings = [
+      { source: 'oracle:rivet-validate', severity: 'error', artifact_id: 'REQ-001', message: 'broken ref' }
+    ];
+    const grouped = groupOracleFindings(findings);
+    expect(grouped).toHaveLength(1);
+    expect(grouped[0].artifact_id).toBe('REQ-001');
+    expect(grouped[0].artifact_ids).toBeUndefined();
+  });
+
+  it('keeps different severities as separate groups even with same message', () => {
+    const findings = [
+      { source: 'oracle:rivet-validate', severity: 'warning', artifact_id: 'A', message: 'foo' },
+      { source: 'oracle:rivet-validate', severity: 'error', artifact_id: 'B', message: 'foo' }
+    ];
+    expect(groupOracleFindings(findings)).toHaveLength(2);
+  });
+
+  it('passes through model findings without grouping', () => {
+    const findings = [
+      { file: 'a.js', line: 1, quoted_line: '+x', claim: 'no test' },
+      { file: 'b.js', line: 1, quoted_line: '+y', claim: 'no test' }
+    ];
+    expect(groupOracleFindings(findings)).toHaveLength(2);
+  });
+
+  it('truncates the displayed id list at maxIdsShown', () => {
+    const ids = Array.from({ length: 25 }, (_, i) => `REQ-${String(i + 1).padStart(3, '0')}`);
+    const findings = ids.map((id) => ({
+      source: 'oracle:rivet-validate', severity: 'warning', artifact_id: id, message: 'shared'
+    }));
+    const grouped = groupOracleFindings(findings, { maxIdsShown: 5 });
+    expect(grouped).toHaveLength(1);
+    expect(grouped[0].artifact_ids).toHaveLength(25);
+    expect(grouped[0].claim).toMatch(/\(\+20 more\)/);
+  });
+});
+
 describe('runRivetOracle', () => {
   let tmpDir;
   beforeEach(() => {
diff --git a/src/ai-review-prompt.js b/src/ai-review-prompt.js
index b5d5807..50f3dcc 100644
--- a/src/ai-review-prompt.js
+++ b/src/ai-review-prompt.js
@@ -250,8 +250,16 @@ export function renderReviewMarkdown(parsed, prNumber, headSha, meta, opts = {})
     request_changes: '🔴 Request changes'
   }[verdict];
 
+  const oracleCount = findings.filter(
+    (f) => typeof f?.source === 'string' && f.source.startsWith('oracle:')
+  ).length;
+  const modelCount = findings.length - oracleCount;
+
   const lines = [];
-  lines.push(`## AI Code Review for PR #${prNumber}`);
+  // HTML marker first so supersedePreviousReviews can identify our comments
+  // even if the visible header changes.
+  lines.push('<!-- temper-automated-review -->');
+  lines.push(`## Automated review for PR #${prNumber}`);
   if (meta?.headRepo && meta?.baseBranch && meta?.headBranch && meta?.baseRepo) {
     lines.push(`${meta.headRepo}:\`${meta.headBranch}\` → ${meta.baseRepo}:\`${meta.baseBranch}\``);
   }
@@ -260,6 +268,10 @@ export function renderReviewMarkdown(parsed, prNumber, headSha, meta, opts = {})
   lines.push('');
   lines.push(`**Summary:** ${parsed.summary || '(none)'}`);
   lines.push('');
+  lines.push(
+    `_Findings: ${oracleCount} mechanical (rivet) · ${modelCount} from local AI model._`
+  );
+  lines.push('');
 
   if (findings.length === 0) {
     lines.push('**Findings:** None.');
@@ -268,13 +280,18 @@ export function renderReviewMarkdown(parsed, prNumber, headSha, meta, opts = {})
     lines.push('');
     findings.forEach((f, i) => {
       if (typeof f.source === 'string' && f.source.startsWith('oracle:')) {
-        // Oracle finding: artifact-anchored, mechanically validated.
+        // Oracle finding: artifact-anchored, mechanically validated. When
+        // findings are grouped, `artifact_ids` is a multi-element array;
+        // when single, it falls back to `artifact_id`.
         const sevBadge =
           f.severity === 'error' ? '🔴'
           : f.severity === 'warning' ? '🟡'
           : 'ℹ️';
         const oracleName = f.source.replace('oracle:', '');
-        lines.push(`${i + 1}. ${sevBadge} \`${f.artifact_id || '<unknown>'}\` _(${oracleName})_`);
+        const label = Array.isArray(f.artifact_ids) && f.artifact_ids.length > 1
+          ? `${f.artifact_ids.length} artifacts`
+          : f.artifact_id || '<unknown>';
+        lines.push(`${i + 1}. ${sevBadge} \`${label}\` _(${oracleName})_`);
         lines.push(`   ${f.claim}`);
       } else {
         // Model finding: file:line-anchored, post-validated.
diff --git a/src/ai-review.js b/src/ai-review.js
index 9f2d193..3307261 100644
--- a/src/ai-review.js
+++ b/src/ai-review.js
@@ -11,7 +11,12 @@ import {
   renderReviewMarkdown,
   buildResponseFormat
 } from './ai-review-prompt.js';
-import { runRivetOracle } from './rivet-oracle.js';
+import {
+  runRivetOracle,
+  runRivetValidate,
+  subtractFindings,
+  groupOracleFindings
+} from './rivet-oracle.js';
 import { hasRivetYaml, withTempRepoCheckout } from './rivet-fetch.js';
 
 const _reviewTimestamps = new Map();
@@ -270,6 +275,11 @@ async function callLocalAI(endpoint, model, systemPrompt, userPrompt, options =
   }
 }
 
+// Stable HTML marker for supersedePreviousReviews. Markdown renders it as
+// nothing visible but a substring search finds it reliably regardless of
+// header text changes. The legacy 'AI Code Review' string also matches old
+// comments still in the wild from before the rename.
+const REVIEW_MARKER = '<!-- temper-automated-review -->';
 const AI_REVIEW_SIGNATURE = 'AI Code Review';
 
 function formatReviewComment(aiResponse, prNumber, headSha, meta) {
@@ -323,7 +333,7 @@ async function supersedePreviousReviews(octokit, owner, repo, prNumber) {
     for (const comment of comments) {
       if (
         comment.body &&
-        comment.body.includes(AI_REVIEW_SIGNATURE) &&
+        (comment.body.includes(REVIEW_MARKER) || comment.body.includes(AI_REVIEW_SIGNATURE)) &&
         !comment.body.startsWith('>')
       ) {
         await octokit.request(
@@ -415,12 +425,50 @@ async function reviewPullRequest(octokit, owner, repo, prNumber) {
               timeout: rivetCfg.timeout_ms || 60000
             })
           );
-          if (oracleSummary?.findings) {
-            oracleFindings = oracleSummary.findings;
+          let headOracleFindings = oracleSummary?.findings || [];
+
+          // Delta filter: subtract findings already present at the PR base.
+          // Without this, the review surfaces the entire repo backlog every
+          // time (see spar#175 — 91 pre-existing warnings drowning the diff).
+          let baseValidate = null;
+          if (baseSha && baseSha !== headSha) {
+            try {
+              baseValidate = await withTempRepoCheckout(
+                octokit,
+                owner,
+                repo,
+                baseSha,
+                (repoPath) => runRivetValidate(resolvedBinary, repoPath, {
+                  timeout: rivetCfg.timeout_ms || 60000
+                })
+              );
+            } catch (err) {
+              getLogger().warn(
+                { pr: prNumber, err: err.message },
+                'base-ref validate failed — falling back to head-only oracle findings'
+              );
+            }
           }
+          const droppedCount = headOracleFindings.length;
+          headOracleFindings = subtractFindings(
+            headOracleFindings,
+            baseValidate?.findings || []
+          );
+          const baseDelta = droppedCount - headOracleFindings.length;
+
+          // Group identical messages so 50 "Every requirement should be
+          // satisfied by ..." diagnostics collapse into one finding listing
+          // the affected artifact ids.
+          oracleFindings = groupOracleFindings(headOracleFindings);
+
           getLogger().info(
-            { pr: prNumber, oracleFindings: oracleFindings.length },
-            'Rivet oracle complete'
+            {
+              pr: prNumber,
+              oracleFindings: oracleFindings.length,
+              droppedAsPreExisting: baseDelta,
+              afterGrouping: oracleFindings.length
+            },
+            'Rivet oracle complete (delta + grouped)'
           );
         }
       } catch (err) {
diff --git a/src/rivet-oracle.js b/src/rivet-oracle.js
index bf960dc..05e9281 100644
--- a/src/rivet-oracle.js
+++ b/src/rivet-oracle.js
@@ -67,6 +67,10 @@ function diagnosticToFinding(diag) {
     source: 'oracle:rivet-validate',
     severity: diag.severity,
     artifact_id: diag.artifact_id,
+    // `message` is stored separately so we can group findings by message text.
+    // `claim` is the rendered form used by the legacy code path that doesn't
+    // group (kept for back-compat — tests assert on it).
+    message: diag.message,
     claim: `${diag.artifact_id}: ${diag.message}`
   };
 }
@@ -79,11 +83,13 @@ function gapToFinding(gap) {
   if (!gap || typeof gap !== 'object') return null;
   const id = gap.artifact_id || gap.id;
   if (!id) return null;
+  const reason = gap.reason || gap.message || 'lifecycle coverage gap: no downstream artifacts';
   return {
     source: 'oracle:rivet-validate',
     severity: 'warning',
     artifact_id: id,
-    claim: `${id} (${gap.type || 'artifact'}) — ${gap.reason || gap.message || 'lifecycle coverage gap: no downstream artifacts'}`
+    message: reason,
+    claim: `${id} (${gap.type || 'artifact'}) — ${reason}`
   };
 }
 
@@ -93,10 +99,12 @@ function gapToFinding(gap) {
 function brokenCrossRefToFinding(ref) {
   const from = ref?.from || ref?.source || 'unknown';
   const to = ref?.to || ref?.target || 'unknown';
+  const message = `broken cross-reference → ${to}${ref?.reason ? ` (${ref.reason})` : ''}`;
   return {
     source: 'oracle:rivet-validate',
     severity: 'error',
     artifact_id: from,
+    message,
     claim: `Broken cross-reference: ${from} → ${to}${ref?.reason ? ` (${ref.reason})` : ''}`
   };
 }
@@ -234,6 +242,93 @@ export async function runRivetImpact(binary, repoPath, baseRef, opts = {}) {
   };
 }
 
+/**
+ * Stable identity key for a finding. Used by `subtractFindings` so the
+ * "is this finding new?" question is decidable without false-positive churn.
+ *
+ * The key intentionally excludes claim text (which is rendered) and severity
+ * upgrades (a warning at base that becomes an error at head is treated as
+ * "the same finding", since it's the same artifact + same message).
+ */
+export function findingKey(f) {
+  if (!f) return '';
+  return `${f.source || ''}|${f.artifact_id || ''}|${f.message || f.claim || ''}`;
+}
+
+/**
+ * Set difference: keep only findings present at HEAD but not at BASE.
+ * The whole point of PR review is "what did this PR change", not "what's
+ * the static state of the project". Pre-existing diagnostics get filtered.
+ */
+export function subtractFindings(headFindings, baseFindings) {
+  if (!Array.isArray(baseFindings) || baseFindings.length === 0) return headFindings;
+  const baseKeys = new Set(baseFindings.map(findingKey));
+  return headFindings.filter((f) => !baseKeys.has(findingKey(f)));
+}
+
+/**
+ * Collapse oracle findings that share the same source + severity + message
+ * into a single finding listing all affected artifact_ids. Today's spar#175
+ * review had 91 findings with 6 unique messages — this turns that into 6
+ * grouped findings with id lists.
+ *
+ * Non-oracle findings (model output) are passed through untouched.
+ */
+export function groupOracleFindings(findings, opts = {}) {
+  const { maxIdsShown = 10 } = opts;
+  if (!Array.isArray(findings) || findings.length === 0) return [];
+
+  const groups = new Map();
+  const out = [];
+
+  for (const f of findings) {
+    if (!f.source || !f.source.startsWith('oracle:')) {
+      out.push(f);
+      continue;
+    }
+    if (!f.message) {
+      out.push(f);
+      continue;
+    }
+    const key = `${f.source}|${f.severity}|${f.message}`;
+    if (!groups.has(key)) {
+      groups.set(key, {
+        source: f.source,
+        severity: f.severity,
+        message: f.message,
+        artifact_ids: []
+      });
+    }
+    groups.get(key).artifact_ids.push(f.artifact_id);
+  }
+
+  for (const g of groups.values()) {
+    if (g.artifact_ids.length === 1) {
+      out.push({
+        source: g.source,
+        severity: g.severity,
+        artifact_id: g.artifact_ids[0],
+        message: g.message,
+        claim: `${g.artifact_ids[0]}: ${g.message}`
+      });
+    } else {
+      const shown = g.artifact_ids.slice(0, maxIdsShown).join(', ');
+      const more = g.artifact_ids.length > maxIdsShown
+        ? ` (+${g.artifact_ids.length - maxIdsShown} more)`
+        : '';
+      out.push({
+        source: g.source,
+        severity: g.severity,
+        artifact_id: `${g.artifact_ids.length} artifacts`,
+        artifact_ids: g.artifact_ids,
+        message: g.message,
+        claim: `${g.message} — affecting: ${shown}${more}`
+      });
+    }
+  }
+  return out;
+}
+
 /**
  * One-shot helper: run both validate and impact (when baseRef given), merge
  * findings, return a single result. Errors from either are non-fatal —