diff --git a/.github/workflows/copilot-agent-analysis.md b/.github/workflows/copilot-agent-analysis.md index 6e55e8d6037..ef8c734f553 100644 --- a/.github/workflows/copilot-agent-analysis.md +++ b/.github/workflows/copilot-agent-analysis.md @@ -18,6 +18,26 @@ permissions: engine: claude strict: true +experiments: + output_format: + variants: [structured, prose] + description: "Test whether a prose-style discussion summary reduces AI credit consumption vs. the current table-centric structured format without sacrificing completeness." + hypothesis: "H0: no change in ai_credits_used. H1: prose format reduces ai_credits_used by >=15% while keeping empty_discussion_rate <=5%" + metric: ai_credits_used + secondary_metrics: [run_duration_seconds, output_length_chars] + guardrail_metrics: + - name: empty_discussion_rate + direction: min + threshold: 0.05 + min_samples: 30 + weight: [50, 50] + start_date: "2026-06-08" + issue: "#aw_cop_fmt" + analysis_type: t_test + tags: [cost-efficiency, output-quality, daily-report] + notify: + discussion: true + network: allowed: - defaults @@ -316,6 +336,7 @@ Create a **concise** discussion with your findings using the safe-outputs create **Discussion Title**: `Daily Copilot Agent Analysis - [DATE]` +{{#if experiments.output_format == 'structured' }} **Concise Discussion Template**: ```markdown ### 🤖 Copilot Agent PR Analysis - [DATE] @@ -401,6 +422,24 @@ The "Agent Task Texts" section should include a table showing all PRs created in - **Remove "Instruction File Changes"** section entirely - **Eliminate "Recommendations"** section - fold into "Key Insights" (1-2 bullets max) - **Remove verbose methodology** and historical context sections +{{/if}} +{{#if experiments.output_format == 'prose' }} +**Prose Discussion Template**: +```markdown +### 🤖 Copilot Agent PR Analysis - [DATE] + +**Analysis Period**: Last 24 hours + +In the last 24 hours, Copilot agent created [count] PRs (`agent_prs_total`), of which [count] were merged 
([percentage]% success rate, `agent_prs_merged`) with an average duration of [time] and an average of [count] human comments per PR. [One sentence on the 3-day trend, included only if the success rate changed by >10%, e.g. "Success rate improved from X% to Y% over the last 3 days." — otherwise omit.] [One sentence listing notable PRs by number, included only if failures, closures, or PRs open >24h exist — otherwise omit.] + +- [Key insight 1: single most actionable observation — omit bullet entirely if nothing notable] +- [Key insight 2: secondary pattern or trend worth flagging — omit bullet entirely if nothing notable] + +--- + +_Generated by Copilot Agent Analysis (Run: [run_id])_ +``` +{{/if}} ## Important Guidelines