diff --git a/.github/workflows/daily-agentrx-trace-optimizer.lock.yml b/.github/workflows/daily-agentrx-trace-optimizer.lock.yml index bf18ee733bc..7d54b4454c8 100644 --- a/.github/workflows/daily-agentrx-trace-optimizer.lock.yml +++ b/.github/workflows/daily-agentrx-trace-optimizer.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"594df0cc4de40960208f153363c4e670b08c7218bfcc3ef07d9889253903f543","body_hash":"e24b96a563202135dbb6b48eefab235ae613fef6349ba4c52f2bea7ba0df8986","strict":true,"agent_id":"claude","engine_versions":{"claude":"2.1.168"}} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"0d4e9fa2e92274489540883eb33efafc2b5391984468f1d2849355c5a7e8e710","body_hash":"e24b96a563202135dbb6b48eefab235ae613fef6349ba4c52f2bea7ba0df8986","strict":true,"agent_id":"claude","engine_versions":{"claude":"2.1.168"}} # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-go","sha":"4a3601121dd01d1626a1e23e37211e3254c1c06c","version":"v6.4.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"docker/build-push-action","sha":"f9f3042f7e2789586610d6e8b85c8f03e5195baf","version":"v7.2.0"},{"repo":"docker/setup-buildx-action","sha":"d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5","version":"v4.1.0"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1","digest":"sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1@sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1","digest":"sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1@sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1","digest":"sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1@sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.25","digest":"sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.25@sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa"},{"image":"ghcr.io/github/github-mcp-server:v1.1.2","digest":"sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c","pinned_image":"ghcr.io/github/github-mcp-server:v1.1.2@sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c"}]} # ___ _ _ # / _ \ | | (_) @@ -235,7 +235,7 @@ jobs: id: pick-experiment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: - GH_AW_EXPERIMENT_SPEC: '{"sub_agent_strategy":{"variants":["sub_agents","single_agent"],"description":"Test whether delegating trajectory-builder, artifacts-summarizer, and failure-pattern-classifier to small-model sub-agents improves recommendation quality vs. inline analysis by the main agent","hypothesis":"H0: no change in issue quality or run success rate. H1: sub_agents variant yields higher evidence completeness score with equal or lower token cost","metric":"issue_evidence_completeness","secondary_metrics":["run_success_rate","effective_tokens_total","run_duration_ms"],"guardrail_metrics":[{"name":"empty_output_rate","threshold":"\u003c=0.10"},{"name":"noop_rate","threshold":"\u003c=0.30"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-02"}}' + GH_AW_EXPERIMENT_SPEC: '{"sub_agent_strategy":{"variants":["sub_agents","single_agent"],"description":"Test whether delegating trajectory-builder, artifacts-summarizer, and failure-pattern-classifier to small-model sub-agents improves recommendation quality vs. inline analysis by the main agent","hypothesis":"H0: no change in issue quality or run success rate. H1: sub_agents variant yields higher evidence completeness score with equal or lower token cost","metric":"issue_evidence_completeness","secondary_metrics":["run_success_rate","ai_credits_total","run_duration_ms"],"guardrail_metrics":[{"name":"empty_output_rate","threshold":"\u003c=0.10"},{"name":"noop_rate","threshold":"\u003c=0.30"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-02"}}' GH_AW_EXPERIMENT_STATE_FILE: /tmp/gh-aw/experiments/state.json GH_AW_EXPERIMENT_STATE_DIR: /tmp/gh-aw/experiments with: diff --git a/.github/workflows/daily-agentrx-trace-optimizer.md b/.github/workflows/daily-agentrx-trace-optimizer.md index 34509f2f904..e2c97f57f1c 100644 --- a/.github/workflows/daily-agentrx-trace-optimizer.md +++ b/.github/workflows/daily-agentrx-trace-optimizer.md @@ -16,7 +16,7 @@ experiments: description: "Test whether delegating trajectory-builder, artifacts-summarizer, and failure-pattern-classifier to small-model sub-agents improves recommendation quality vs. inline analysis by the main agent" hypothesis: "H0: no change in issue quality or run success rate. H1: sub_agents variant yields higher evidence completeness score with equal or lower token cost" metric: issue_evidence_completeness - secondary_metrics: [run_success_rate, effective_tokens_total, run_duration_ms] + secondary_metrics: [run_success_rate, ai_credits_total, run_duration_ms] guardrail_metrics: - name: empty_output_rate threshold: "<=0.10" diff --git a/.github/workflows/daily-cache-strategy-analyzer.lock.yml b/.github/workflows/daily-cache-strategy-analyzer.lock.yml index 227638c07b4..90e429324d3 100644 --- a/.github/workflows/daily-cache-strategy-analyzer.lock.yml +++ b/.github/workflows/daily-cache-strategy-analyzer.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"3b17db520c7b63ab76c482b03e678959d69970f2eda5c4c6474fe34fbe50eb96","body_hash":"c959161f2c09734078145fcc5d71f5d920b19a76a3b4f1b94f98a91d6d59af2b","strict":true,"agent_id":"codex","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"codex":"0.137.0"}} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"7767cb1165988158453359c185a753e63f4b03e19cb7c449c3eff864114a0764","body_hash":"c959161f2c09734078145fcc5d71f5d920b19a76a3b4f1b94f98a91d6d59af2b","strict":true,"agent_id":"codex","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"codex":"0.137.0"}} # gh-aw-manifest: {"version":1,"secrets":["CODEX_API_KEY","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN","OPENAI_API_KEY"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-go","sha":"4a3601121dd01d1626a1e23e37211e3254c1c06c","version":"v6.4.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"},{"repo":"docker/build-push-action","sha":"f9f3042f7e2789586610d6e8b85c8f03e5195baf","version":"v7.2.0"},{"repo":"docker/setup-buildx-action","sha":"d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5","version":"v4.1.0"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1","digest":"sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1@sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1","digest":"sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1@sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1","digest":"sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27","pinned_image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1@sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1","digest":"sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1@sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.25","digest":"sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.25@sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa"},{"image":"ghcr.io/github/github-mcp-server:v1.1.2","digest":"sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c","pinned_image":"ghcr.io/github/github-mcp-server:v1.1.2@sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c"}]} # ___ _ _ # / _ \ | | (_) @@ -240,7 +240,7 @@ jobs: id: pick-experiment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: - GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["gpt-5.4","gpt-5-codex"],"description":"Compares codex-compatible models for cache issue detection quality and efficiency.","hypothesis":"H0: no change in issue creation rate or run success rate. H1: gpt-5-codex reduces effective tokens while keeping run success rate \u003e=0.90.","metric":"effective_tokens_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' + GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["gpt-5.4","gpt-5-codex"],"description":"Compares codex-compatible models for cache issue detection quality and efficiency.","hypothesis":"H0: no change in issue creation rate or run success rate. H1: gpt-5-codex reduces AI Credits while keeping run success rate \u003e=0.90.","metric":"ai_credits_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' GH_AW_EXPERIMENT_STATE_FILE: /tmp/gh-aw/experiments/state.json GH_AW_EXPERIMENT_STATE_DIR: /tmp/gh-aw/experiments with: diff --git a/.github/workflows/daily-cache-strategy-analyzer.md b/.github/workflows/daily-cache-strategy-analyzer.md index cc2b325c4c9..35b596837c4 100644 --- a/.github/workflows/daily-cache-strategy-analyzer.md +++ b/.github/workflows/daily-cache-strategy-analyzer.md @@ -21,8 +21,8 @@ experiments: model_size: variants: [gpt-5.4, gpt-5-codex] description: "Compares codex-compatible models for cache issue detection quality and efficiency." - hypothesis: "H0: no change in issue creation rate or run success rate. H1: gpt-5-codex reduces effective tokens while keeping run success rate >=0.90." - metric: effective_tokens_total + hypothesis: "H0: no change in issue creation rate or run success rate. H1: gpt-5-codex reduces AI Credits while keeping run success rate >=0.90." + metric: ai_credits_total secondary_metrics: [run_success_rate, run_duration_ms] guardrail_metrics: - name: run_success_rate diff --git a/.github/workflows/daily-caveman-optimizer.lock.yml b/.github/workflows/daily-caveman-optimizer.lock.yml index 3b8b0871e3a..22a8e206944 100644 --- a/.github/workflows/daily-caveman-optimizer.lock.yml +++ b/.github/workflows/daily-caveman-optimizer.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"2e71170ad98e4dc9b10f5937a51c0deb1fb5fbc829392b9a98e9fb0c5bf579ea","body_hash":"84e96030ea792b1fb60e2b4c144575d9a213ee83b2e404f4263693134afff813","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"c3227d66db6283ac29d8cb669571120ee75009e5b2b8fec5d3cbcfe4104a356e","body_hash":"84e96030ea792b1fb60e2b4c144575d9a213ee83b2e404f4263693134afff813","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_CI_TRIGGER_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1","digest":"sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1@sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1","digest":"sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1@sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1","digest":"sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27","pinned_image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1@sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1","digest":"sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1@sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.25","digest":"sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.25@sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa"},{"image":"ghcr.io/github/github-mcp-server:v1.1.2","digest":"sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c","pinned_image":"ghcr.io/github/github-mcp-server:v1.1.2@sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c"}]} # ___ _ _ # / _ \ | | (_) @@ -233,7 +233,7 @@ jobs: id: pick-experiment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: - GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4.6","claude-haiku-4.5"],"description":"Tests whether Claude Haiku produces equivalent instruction conciseness improvements at lower token cost versus Claude Sonnet.","hypothesis":"H0: no change in PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"effective_tokens_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' + GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4.6","claude-haiku-4.5"],"description":"Tests whether Claude Haiku produces equivalent instruction conciseness improvements at lower token cost versus Claude Sonnet.","hypothesis":"H0: no change in PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"ai_credits_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' GH_AW_EXPERIMENT_STATE_FILE: /tmp/gh-aw/experiments/state.json GH_AW_EXPERIMENT_STATE_DIR: /tmp/gh-aw/experiments with: diff --git a/.github/workflows/daily-caveman-optimizer.md b/.github/workflows/daily-caveman-optimizer.md index 53682f8bab9..e2c158fe671 100644 --- a/.github/workflows/daily-caveman-optimizer.md +++ b/.github/workflows/daily-caveman-optimizer.md @@ -23,7 +23,7 @@ experiments: variants: [claude-sonnet-4.6, claude-haiku-4.5] description: "Tests whether Claude Haiku produces equivalent instruction conciseness improvements at lower token cost versus Claude Sonnet." hypothesis: "H0: no change in PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage >=30% with equivalent run success rate (>=0.90)." - metric: effective_tokens_total + metric: ai_credits_total secondary_metrics: [run_success_rate, run_duration_ms] guardrail_metrics: - name: run_success_rate diff --git a/.github/workflows/daily-doc-healer.lock.yml b/.github/workflows/daily-doc-healer.lock.yml index 9f833e51cfd..3add558c52a 100644 --- a/.github/workflows/daily-doc-healer.lock.yml +++ b/.github/workflows/daily-doc-healer.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"450cb29f1da22d7bb7b9c440149fc6b4496158023d52395d87f432cd2c0b79bc","body_hash":"17d342d9eb1eb81fe8ee6dc242c2e7dd96fbdcba013840231269807407639fa6","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"1971f0e16f842b3005abd044903f3e5c9c830ce03d2ca12e106dc76bbc668bcc","body_hash":"17d342d9eb1eb81fe8ee6dc242c2e7dd96fbdcba013840231269807407639fa6","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_AGENT_TOKEN","GH_AW_CI_TRIGGER_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1","digest":"sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1@sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1","digest":"sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1@sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1","digest":"sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27","pinned_image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1@sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1","digest":"sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1@sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.25","digest":"sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.25@sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa"},{"image":"ghcr.io/github/github-mcp-server:v1.1.2","digest":"sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c","pinned_image":"ghcr.io/github/github-mcp-server:v1.1.2@sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c"}]} # ___ _ _ # / _ \ | | (_) @@ -237,7 +237,7 @@ jobs: id: pick-experiment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: - GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4.6","claude-haiku-4.5"],"description":"Tests whether Claude Haiku detects and corrects documentation gaps with equivalent quality at lower token cost versus Claude Sonnet.","hypothesis":"H0: no change in issue/PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"effective_tokens_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' + GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4.6","claude-haiku-4.5"],"description":"Tests whether Claude Haiku detects and corrects documentation gaps with equivalent quality at lower token cost versus Claude Sonnet.","hypothesis":"H0: no change in issue/PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"ai_credits_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' GH_AW_EXPERIMENT_STATE_FILE: /tmp/gh-aw/experiments/state.json GH_AW_EXPERIMENT_STATE_DIR: /tmp/gh-aw/experiments with: diff --git a/.github/workflows/daily-doc-healer.md b/.github/workflows/daily-doc-healer.md index 8a6b3eb7eac..3226a42a77b 100644 --- a/.github/workflows/daily-doc-healer.md +++ b/.github/workflows/daily-doc-healer.md @@ -46,7 +46,7 @@ experiments: variants: [claude-sonnet-4.6, claude-haiku-4.5] description: "Tests whether Claude Haiku detects and corrects documentation gaps with equivalent quality at lower token cost versus Claude Sonnet." hypothesis: "H0: no change in issue/PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage >=30% with equivalent run success rate (>=0.90)." - metric: effective_tokens_total + metric: ai_credits_total secondary_metrics: [run_success_rate, run_duration_ms] guardrail_metrics: - name: run_success_rate diff --git a/.github/workflows/daily-doc-updater.lock.yml b/.github/workflows/daily-doc-updater.lock.yml index 15480dbb03e..149ee736a39 100644 --- a/.github/workflows/daily-doc-updater.lock.yml +++ b/.github/workflows/daily-doc-updater.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"b679d2ff23622fbf6617b22eb8f289350d2f9259b5dbdca3dcbe33a8c63bb1a3","body_hash":"8a26c3526466ea37fbaad9a591a0180504a00b8e24c42e1475a409da4b19c948","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"76adbab8bf296a212438de5825a283921c2a0f0435b00f0dcf67fd25b22b3fbc","body_hash":"8a26c3526466ea37fbaad9a591a0180504a00b8e24c42e1475a409da4b19c948","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_CI_TRIGGER_TOKEN","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1","digest":"sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1@sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1","digest":"sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1@sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1","digest":"sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27","pinned_image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1@sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1","digest":"sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1@sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.25","digest":"sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.25@sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa"},{"image":"ghcr.io/github/github-mcp-server:v1.1.2","digest":"sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c","pinned_image":"ghcr.io/github/github-mcp-server:v1.1.2@sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c"}]} # ___ _ _ # / _ \ | | (_) @@ -233,7 +233,7 @@ jobs: id: pick-experiment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: - GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4.6","claude-haiku-4.5"],"description":"Tests whether Claude Haiku achieves similar documentation update quality at lower token cost compared to Claude Sonnet.","hypothesis":"H0: no change in PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"effective_tokens_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' + GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4.6","claude-haiku-4.5"],"description":"Tests whether Claude Haiku achieves similar documentation update quality at lower token cost compared to Claude Sonnet.","hypothesis":"H0: no change in PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"ai_credits_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' GH_AW_EXPERIMENT_STATE_FILE: /tmp/gh-aw/experiments/state.json GH_AW_EXPERIMENT_STATE_DIR: /tmp/gh-aw/experiments with: diff --git a/.github/workflows/daily-doc-updater.md b/.github/workflows/daily-doc-updater.md index 97ef1eb8253..3676b3a2639 100644 --- a/.github/workflows/daily-doc-updater.md +++ b/.github/workflows/daily-doc-updater.md @@ -40,7 +40,7 @@ experiments: variants: [claude-sonnet-4.6, claude-haiku-4.5] description: "Tests whether Claude Haiku achieves similar documentation update quality at lower token cost compared to Claude Sonnet." hypothesis: "H0: no change in PR creation rate or run success rate. H1: Claude Haiku reduces AI credit usage >=30% with equivalent run success rate (>=0.90)." - metric: effective_tokens_total + metric: ai_credits_total secondary_metrics: [run_success_rate, run_duration_ms] guardrail_metrics: - name: run_success_rate diff --git a/.github/workflows/daily-function-namer.lock.yml b/.github/workflows/daily-function-namer.lock.yml index f49d7aebe03..c9b05ef3d16 100644 --- a/.github/workflows/daily-function-namer.lock.yml +++ b/.github/workflows/daily-function-namer.lock.yml @@ -1,4 +1,4 @@ -# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"b98cdb1e3fc38fa32f163b5f499cb03a6cb4694b8b5f67f37b4933de12001c4e","body_hash":"38251618d47145d5907a35b17a1c66b8c99448470e1e015011e7dc79148ab75f","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} +# gh-aw-metadata: {"schema_version":"v4","frontmatter_hash":"044a56d35a6718d87bd16eb0eb2d17a0285a2b5ee01999d88e6898163052967d","body_hash":"38251618d47145d5907a35b17a1c66b8c99448470e1e015011e7dc79148ab75f","strict":true,"agent_id":"claude","agent_model":"${{ needs.activation.outputs.model_size }}","engine_versions":{"claude":"2.1.168"}} # gh-aw-manifest: {"version":1,"secrets":["ANTHROPIC_API_KEY","GH_AW_GITHUB_MCP_SERVER_TOKEN","GH_AW_GITHUB_TOKEN","GH_AW_OTEL_GRAFANA_AUTHORIZATION","GH_AW_OTEL_GRAFANA_ENDPOINT","GH_AW_OTEL_SENTRY_AUTHORIZATION","GH_AW_OTEL_SENTRY_ENDPOINT","GITHUB_TOKEN"],"actions":[{"repo":"actions/cache/restore","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/cache/save","sha":"27d5ce7f107fe9357f9df03efb73ab90386fccae","version":"v5.0.5"},{"repo":"actions/checkout","sha":"df4cb1c069e1874edd31b4311f1884172cec0e10","version":"v6.0.3"},{"repo":"actions/download-artifact","sha":"3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c","version":"v8.0.1"},{"repo":"actions/github-script","sha":"3a2844b7e9c422d3c10d287c895573f7108da1b3","version":"v9.0.0"},{"repo":"actions/setup-node","sha":"48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e","version":"v6.4.0"},{"repo":"actions/upload-artifact","sha":"043fb46d1a93c77aae656e7c1c64a875d1fc6a0a","version":"v7.0.1"}],"containers":[{"image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1","digest":"sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df","pinned_image":"ghcr.io/github/gh-aw-firewall/agent:0.27.1@sha256:55149fa2daf8fa8afa2803f2ac1a3534591a7c96f173ee2aec9545fbe67305df"},{"image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1","digest":"sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752","pinned_image":"ghcr.io/github/gh-aw-firewall/api-proxy:0.27.1@sha256:2802437f05830336ea3ae8639f628776608d14d95b5b3cf30f161eb505e29752"},{"image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1","digest":"sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27","pinned_image":"ghcr.io/github/gh-aw-firewall/cli-proxy:0.27.1@sha256:2e6dc98321dbf82840f83ec0ef8b198506149255a15d3a7854d59c0d34063e27"},{"image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1","digest":"sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413","pinned_image":"ghcr.io/github/gh-aw-firewall/squid:0.27.1@sha256:1f3df3207dc9faa9080088115ca50a5ab0d7a692c61dffa8c8898d0b7b750413"},{"image":"ghcr.io/github/gh-aw-mcpg:v0.3.25","digest":"sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa","pinned_image":"ghcr.io/github/gh-aw-mcpg:v0.3.25@sha256:c10331ad17668ef89f38f5e356678788a40b0cd5fef96e8f92e1d9c1de47cbaa"},{"image":"ghcr.io/github/github-mcp-server:v1.1.2","digest":"sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c","pinned_image":"ghcr.io/github/github-mcp-server:v1.1.2@sha256:30197479d8036c7811892bc07e06f9a05c9ef3cdd79bc59f256d50647f95788c"},{"image":"ghcr.io/github/serena-mcp-server:latest","digest":"sha256:bf343399e3725c45528f531a230f3a04521d4cdef29f9a5af6282ff0d3c393c5","pinned_image":"ghcr.io/github/serena-mcp-server:latest@sha256:bf343399e3725c45528f531a230f3a04521d4cdef29f9a5af6282ff0d3c393c5"}]} # ___ _ _ # / _ \ | | (_) @@ -238,7 +238,7 @@ jobs: id: pick-experiment uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 env: - GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4-6","claude-haiku-4-5-20251001"],"description":"Tests whether Claude Haiku identifies function rename candidates with equivalent quality at lower token cost versus Claude Sonnet.","hypothesis":"H0: no change in issue creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"effective_tokens_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' + GH_AW_EXPERIMENT_SPEC: '{"model_size":{"variants":["claude-sonnet-4-6","claude-haiku-4-5-20251001"],"description":"Tests whether Claude Haiku identifies function rename candidates with equivalent quality at lower token cost versus Claude Sonnet.","hypothesis":"H0: no change in issue creation rate or run success rate. H1: Claude Haiku reduces AI credit usage \u003e=30% with equivalent run success rate (\u003e=0.90).","metric":"ai_credits_total","secondary_metrics":["run_success_rate","run_duration_ms"],"guardrail_metrics":[{"name":"run_success_rate","threshold":"\u003e=0.90"},{"name":"empty_output_rate","threshold":"\u003c=0.10"}],"min_samples":20,"weight":[50,50],"start_date":"2026-06-04"}}' GH_AW_EXPERIMENT_STATE_FILE: /tmp/gh-aw/experiments/state.json GH_AW_EXPERIMENT_STATE_DIR: /tmp/gh-aw/experiments with: diff --git a/.github/workflows/daily-function-namer.md b/.github/workflows/daily-function-namer.md index 70e4cdd87d5..cf16d6425f1 100644 --- a/.github/workflows/daily-function-namer.md +++ b/.github/workflows/daily-function-namer.md @@ -23,7 +23,7 @@ experiments: variants: [claude-sonnet-4-6, claude-haiku-4-5-20251001] description: "Tests whether Claude Haiku identifies function rename candidates with equivalent quality at lower token cost versus Claude Sonnet." hypothesis: "H0: no change in issue creation rate or run success rate. H1: Claude Haiku reduces AI credit usage >=30% with equivalent run success rate (>=0.90)." - metric: effective_tokens_total + metric: ai_credits_total secondary_metrics: [run_success_rate, run_duration_ms] guardrail_metrics: - name: run_success_rate diff --git a/pkg/cli/README.md b/pkg/cli/README.md index 337f19bdf3b..777d1f42fdd 100644 --- a/pkg/cli/README.md +++ b/pkg/cli/README.md @@ -236,7 +236,7 @@ The `cli` package exports many types used across its command implementations. Th | `ActionlintStats` | struct | Static-analysis statistics from an actionlint run | | `AddInteractiveConfig` | struct | Configuration for the interactive `add-wizard` command | | `AgenticAssessment` | struct | Agentic behavior assessment derived from audit logs | -| `AmbientContextMetrics` | struct | Token metrics for ambient context (input, cached, effective token counts) | +| `AmbientContextMetrics` | struct | Token metrics for ambient context (input, cached, and output token counts) | | `Argument` | struct | A command-line argument definition from the MCP registry API | | `ArtifactSet` | string alias | Named set of artifacts (e.g. `"agent"`, `"detection"`) | | `AuditComparisonClassification` | struct | A classification label and reason codes for an audit comparison | diff --git a/pkg/cli/token_usage.go b/pkg/cli/token_usage.go index 7c992143edc..1fbd5d62e35 100644 --- a/pkg/cli/token_usage.go +++ b/pkg/cli/token_usage.go @@ -16,7 +16,6 @@ import ( "github.com/github/gh-aw/pkg/console" "github.com/github/gh-aw/pkg/logger" "github.com/github/gh-aw/pkg/timeutil" - "github.com/github/gh-aw/pkg/types" ) var tokenUsageLog = logger.New("cli:token_usage") @@ -129,9 +128,7 @@ const awfTimeWarningPrefix = "[AWF TIME WARNING]" var subagentDispatchPattern = regexp.MustCompile(`([A-Za-z0-9][A-Za-z0-9._-]*)\(([A-Za-z0-9][A-Za-z0-9._:-]*)\)`) // parseTokenUsageFile parses a token-usage.jsonl file and returns the aggregated summary. -// Custom weights, when non-nil, override the built-in model multipliers and token class -// weights for effective token computation. -func parseTokenUsageFile(filePath string, _ *types.TokenWeights) (*TokenUsageSummary, error) { +func parseTokenUsageFile(filePath string) (*TokenUsageSummary, error) { tokenUsageLog.Printf("Parsing token usage file: %s", filePath) file, err := os.Open(filePath) @@ -377,7 +374,7 @@ func findAgentUsageFile(runDir string) string { return found } -func parseAgentUsageFile(filePath string, _ *types.TokenWeights) (*TokenUsageSummary, error) { +func parseAgentUsageFile(filePath string) (*TokenUsageSummary, error) { cleanPath := filepath.Clean(filePath) data, err := os.ReadFile(cleanPath) if err != nil { @@ -436,8 +433,6 @@ func parseAgentUsageFile(filePath string, _ *types.TokenWeights) (*TokenUsageSum } // analyzeTokenUsage finds and parses the token-usage.jsonl file from a run directory. -// It automatically reads custom token weights from aw_info.json when present and -// applies them to the effective token computation. func analyzeTokenUsage(runDir string, verbose bool) (*TokenUsageSummary, error) { tokenUsageLog.Printf("Analyzing token usage in: %s", runDir) @@ -450,9 +445,7 @@ func analyzeTokenUsage(runDir string, verbose bool) (*TokenUsageSummary, error) } } - // Try to load custom token weights from aw_info.json for this run - customWeights := extractCustomTokenWeightsFromDir(runDir) - summary, err := parseTokenUsageFile(filePath, customWeights) + summary, err := parseTokenUsageFile(filePath) if err != nil || summary == nil { return summary, err } @@ -472,8 +465,7 @@ func analyzeTokenUsage(runDir string, verbose bool) (*TokenUsageSummary, error) } } - customWeights := extractCustomTokenWeightsFromDir(runDir) - summary, err := parseAgentUsageFile(agentUsagePath, customWeights) + summary, err := parseAgentUsageFile(agentUsagePath) if err != nil || summary == nil { return summary, err } @@ -854,20 +846,6 @@ func findAgentStdioFile(runDir string) string { return found } -// extractCustomTokenWeightsFromDir reads aw_info.json from a run directory and returns -// any custom token weights embedded there at compile time. Returns nil when not found. -func extractCustomTokenWeightsFromDir(runDir string) *types.TokenWeights { - awInfoPath := findAwInfoPath(runDir) - if awInfoPath == "" { - return nil - } - awInfo, err := parseAwInfo(awInfoPath, false) - if err != nil || awInfo == nil { - return nil - } - return awInfo.TokenWeights -} - func correlateToolCallsWithTokenDelta(toolCalls []MCPToolCall, tokenUsageFile string) []MCPToolCall { _ = tokenUsageFile return toolCalls diff --git a/pkg/cli/token_usage_test.go b/pkg/cli/token_usage_test.go index 79f7b76cbec..564bce7251e 100644 --- a/pkg/cli/token_usage_test.go +++ b/pkg/cli/token_usage_test.go @@ -23,7 +23,7 @@ func TestParseTokenUsageFile(t *testing.T) { content := `{"timestamp":"2026-04-01T17:56:38.042Z","request_id":"abc-123","provider":"anthropic","model":"claude-sonnet-4-6","path":"/v1/messages","status":200,"streaming":true,"input_tokens":100,"output_tokens":200,"cache_read_tokens":5000,"cache_write_tokens":3000,"duration_ms":2500,"response_bytes":1500}` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644), "should write test file") - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should parse without error") require.NotNil(t, summary, "should return non-nil summary") @@ -52,7 +52,7 @@ func TestParseTokenUsageFile(t *testing.T) { {"timestamp":"2026-04-01T17:58:00.000Z","request_id":"3","provider":"anthropic","model":"claude-haiku-4-5","path":"/v1/messages","status":200,"streaming":false,"input_tokens":769,"output_tokens":86,"cache_read_tokens":0,"cache_write_tokens":0,"duration_ms":700,"response_bytes":500}` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644), "should write test file") - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should parse without error") require.NotNil(t, summary, "should return non-nil summary") @@ -79,7 +79,7 @@ func TestParseTokenUsageFile(t *testing.T) { {"timestamp":"2026-04-01T17:56:00.000Z","request_id":"1","provider":"anthropic","model":"claude-sonnet-4-6","path":"/v1/messages","status":200,"streaming":true,"input_tokens":7,"output_tokens":5,"cache_read_tokens":3,"cache_write_tokens":0,"duration_ms":1000,"response_bytes":500}` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644), "should write test file") - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should parse without error") require.NotNil(t, summary, "should return non-nil summary") require.NotNil(t, summary.AmbientContext, "ambient context should be present") @@ -95,7 +95,7 @@ func TestParseTokenUsageFile(t *testing.T) { content := `{"timestamp":"2026-04-01T17:56:00.000Z","request_id":"1","provider":"anthropic","model":"claude-sonnet-4-6","path":"/v1/messages","status":200,"streaming":true,"input_tokens":11,"output_tokens":5,"duration_ms":1000,"response_bytes":500}` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644), "should write test file") - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should parse without error") require.NotNil(t, summary, "should return non-nil summary") require.NotNil(t, summary.AmbientContext, "ambient context should be present") @@ -109,7 +109,7 @@ func TestParseTokenUsageFile(t *testing.T) { filePath := filepath.Join(tmpDir, "token-usage.jsonl") require.NoError(t, os.WriteFile(filePath, []byte(""), 0o644)) - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should not error on empty file") assert.Nil(t, summary, "should return nil for empty file") }) @@ -119,7 +119,7 @@ func TestParseTokenUsageFile(t *testing.T) { filePath := filepath.Join(tmpDir, "token-usage.jsonl") require.NoError(t, os.WriteFile(filePath, []byte("\n\n\n"), 0o644)) - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should not error on blank-only file") assert.Nil(t, summary, "should return nil for blank-only file") }) @@ -133,7 +133,7 @@ func TestParseTokenUsageFile(t *testing.T) { also not json` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644)) - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should not error on mixed content") require.NotNil(t, summary, "should return summary from valid lines") assert.Equal(t, 1, summary.TotalRequests, "should count only valid entries") @@ -141,7 +141,7 @@ also not json` }) t.Run("file not found returns error", func(t *testing.T) { - _, err := parseTokenUsageFile("/nonexistent/path/token-usage.jsonl", nil) + _, err := parseTokenUsageFile("/nonexistent/path/token-usage.jsonl") assert.Error(t, err, "should error on missing file") }) @@ -152,7 +152,7 @@ also not json` content := `{"timestamp":"2026-04-01T17:56:38.042Z","request_id":"1","provider":"anthropic","model":"","path":"/v1/messages","status":200,"streaming":true,"input_tokens":50,"output_tokens":25,"cache_read_tokens":0,"cache_write_tokens":0,"duration_ms":500,"response_bytes":200}` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644)) - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err, "should parse without error") require.NotNil(t, summary, "should return non-nil summary") require.Contains(t, summary.ByModel, "unknown", "should use 'unknown' for empty model") @@ -579,7 +579,7 @@ func TestCacheEfficiency(t *testing.T) { content := `{"provider":"anthropic","model":"sonnet","input_tokens":100,"output_tokens":50,"cache_read_tokens":9900,"cache_write_tokens":0,"duration_ms":100}` require.NoError(t, os.WriteFile(filePath, []byte(content+"\n"), 0o644)) - summary, err := parseTokenUsageFile(filePath, nil) + summary, err := parseTokenUsageFile(filePath) require.NoError(t, err) require.NotNil(t, summary) assert.InDelta(t, 0.0, summary.CacheEfficiency, 0.001, "cache efficiency should remain unset") diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json index 9bc6823ad64..ab63d0d5c20 100644 --- a/pkg/parser/schemas/main_workflow_schema.json +++ b/pkg/parser/schemas/main_workflow_schema.json @@ -2736,7 +2736,7 @@ ] }, "models": { - "description": "Custom model pricing data in the same structure as models.json. Merged with the built-in models.json at runtime; frontmatter entries override matching models and fill gaps for unknown models. Useful for custom or private models, or to adjust pricing for effective-token cost accounting.", + "description": "Custom model pricing data in the same structure as models.json. Merged with the built-in models.json at runtime; frontmatter entries override matching models and fill gaps for unknown models. Useful for custom or private models, or to adjust pricing for AI Credits cost accounting.", "type": "object", "required": ["providers"], "properties": { @@ -2837,7 +2837,7 @@ }, "metric": { "type": "string", - "description": "Primary metric to observe (e.g. 'effective_tokens')." + "description": "Primary metric to observe (e.g. 'ai_credits_total')." }, "weight": { "type": "array", @@ -2953,9 +2953,9 @@ { "prompt_style": { "variants": ["concise", "verbose"], - "description": "Test whether concise vs verbose prompts reduce token consumption", - "hypothesis": "H0: no change in tokens. H1: concise reduces by >=15%", - "metric": "effective_tokens", + "description": "Test whether concise vs verbose prompts reduce AI Credits consumption", + "hypothesis": "H0: no change in AI Credits. H1: concise reduces by >=15%", + "metric": "ai_credits_total", "secondary_metrics": ["duration_ms", "discussion_word_count"], "guardrail_metrics": [ { @@ -11622,7 +11622,7 @@ }, "token-weights": { "type": "object", - "description": "Custom model token weights for effective token computation. Overrides or extends the built-in model multipliers from model_multipliers.json. Useful for custom models or adjusted cost ratios.", + "description": "Custom model token weights for AI Credits cost ratio adjustment. Overrides or extends the built-in model multipliers from model_multipliers.json. Useful for custom models or adjusted cost ratios.", "properties": { "multipliers": { "type": "object", diff --git a/pkg/types/README.md b/pkg/types/README.md index ae9d05340bc..d4ec74e13d8 100644 --- a/pkg/types/README.md +++ b/pkg/types/README.md @@ -76,7 +76,7 @@ auth := &types.MCPAuthConfig{ ### `TokenWeights` -Defines custom model cost information for effective token computation. Specified under `engine.token-weights` in workflow frontmatter and stored in `aw_info.json` at runtime. +Defines custom model cost information for AI Credits cost ratios. Specified under `engine.token-weights` in workflow frontmatter and stored in `aw_info.json` at runtime. ```go weights := types.TokenWeights{ @@ -92,7 +92,7 @@ weights := types.TokenWeights{ ### `TokenClassWeights` -Per-token-class weights for effective token computation. Each field corresponds to one token class; a zero value means "use the default weight". +Per-token-class weights for cost computation. Each field corresponds to one token class; a zero value means "use the default weight". | Field | Token class | |-------|-------------| diff --git a/pkg/types/token_weights.go b/pkg/types/token_weights.go index 01e1874a00b..f6df52135e5 100644 --- a/pkg/types/token_weights.go +++ b/pkg/types/token_weights.go @@ -1,6 +1,6 @@ package types -// TokenClassWeights holds per-token-class weights for effective token computation. +// TokenClassWeights holds per-token-class weights for cost computation. // Each field corresponds to one token class; a zero value means "use default". // The JSON keys use underscores to match pkg/cli/data/model_multipliers.json format. type TokenClassWeights struct { @@ -11,7 +11,7 @@ type TokenClassWeights struct { CacheWrite float64 `json:"cache_write,omitempty"` } -// TokenWeights defines custom model cost information for effective token computation. +// TokenWeights defines custom model cost information for AI Credits cost ratios. // It mirrors the structure of model_multipliers.json and allows per-workflow overrides. // Specified under engine.token-weights in the workflow frontmatter and stored in // aw_info.json at runtime. diff --git a/pkg/workflow/README.md b/pkg/workflow/README.md index ff60b7444be..b82af7c0e23 100644 --- a/pkg/workflow/README.md +++ b/pkg/workflow/README.md @@ -139,7 +139,7 @@ The package is intentionally large (~320 source files) because it encodes all Gi | `Agent` | `string` | `engine.agent` | Agent identifier for `copilot --agent` flag (copilot engine only) | | `APITarget` | `string` | `engine.api-target` | Custom API endpoint hostname | | `Bare` | `bool` | `engine.bare` | Disables automatic loading of context/instructions | -| `TokenWeights` | `*types.TokenWeights` | `engine.token-weights` | Custom model cost data for effective token computation | +| `TokenWeights` | `*types.TokenWeights` | `engine.token-weights` | Custom model cost data for AI Credits cost ratios | | `IsInlineDefinition` | `bool` | _(internal)_ | `true` when engine is defined inline via `engine.runtime` | | `MCPSessionTimeout` | `string` | `engine.mcp.session-timeout` | Go duration for MCP gateway sessions (e.g. `"4h"`) | | `MCPToolTimeout` | `string` | `engine.mcp.tool-timeout` | Go duration for individual MCP tool calls (e.g. `"2m"`) | diff --git a/pkg/workflow/engine.go b/pkg/workflow/engine.go index 6faad68cdab..23ad3f4adc4 100644 --- a/pkg/workflow/engine.go +++ b/pkg/workflow/engine.go @@ -48,7 +48,7 @@ type EngineConfig struct { Agent string // Agent identifier for copilot --agent flag (copilot engine only) APITarget string // Custom API endpoint hostname (e.g., "api.acme.ghe.com" or "api.enterprise.githubcopilot.com") Bare bool // When true, disables automatic loading of context/instructions (copilot: --no-custom-instructions, claude: --bare, codex: --no-system-prompt, gemini: GEMINI_SYSTEM_MD=/dev/null) - // TokenWeights provides custom model cost data for effective token computation. + // TokenWeights provides custom model cost data for AI Credits cost ratios. // When set, overrides or extends the built-in model_multipliers.json values. TokenWeights *types.TokenWeights diff --git a/pkg/workflow/schemas/awf-config.schema.json b/pkg/workflow/schemas/awf-config.schema.json index f75dfd8d563..439664e4e6b 100644 --- a/pkg/workflow/schemas/awf-config.schema.json +++ b/pkg/workflow/schemas/awf-config.schema.json @@ -53,7 +53,7 @@ }, "enableTokenSteering": { "type": "boolean", - "description": "Enable effective token budget steering. When true, the proxy injects budget-warning system messages at 80%, 90%, 95%, and 99% usage to nudge the agent to wrap up. Requires maxEffectiveTokens. Default: false." + "description": "Enable AI Credits budget steering. When true, the proxy injects budget-warning system messages at 80%, 90%, 95%, and 99% usage to nudge the agent to wrap up. Requires maxAiCredits. Default: false." }, "anthropicAutoCache": { "type": "boolean", @@ -104,7 +104,7 @@ }, "modelMultipliers": { "type": "object", - "description": "Per-model multipliers for effective token accounting. Each model's weighted tokens are multiplied by this value before accumulation. Unlisted models use defaultModelMultiplier when set, otherwise the highest configured multiplier. See spec §10.2.", + "description": "Per-model cost multipliers. Each model's weighted tokens are multiplied by this value before accumulation. Unlisted models use defaultModelMultiplier when set, otherwise the highest configured multiplier. See spec §10.2.", "additionalProperties": { "type": "number", "exclusiveMinimum": 0