diff --git a/CHANGELOG.md b/CHANGELOG.md index bec0124..519c4bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ Initial release. - **Subagent capture** — separate ATIF trajectories for each subagent invocation, linked to parent via `SubagentTrajectoryRef` - **API request capture** — local reverse proxy captures raw request/response bodies, system prompts, tool definitions, token usage, and compaction events - **Turn-level resampling** — replay a specific API request N times to study response variance (stateless, no tool execution) -- **Intervention testing** — edit captured API requests (assistant text, tool results, system prompt) and resample with modified inputs; available from both CLI (`harness resample-edit`) and web UI +- **Intervention testing** — edit captured API requests (thinking, text, tool results, system prompt) and resample with modified inputs; available from both CLI (`harness resample-edit`) and web UI - **Session-level resampling** — re-run a forked session N times with full tool execution (`harness resample-session`) - **Turn-level replay** — branch execution from any API turn with exact-match context, filesystem reset via git worktrees, and full tool execution; replicates run in parallel (`harness replay`) - **Transcript capture** — Claude Code transcript JSONL copied into session output for replay support diff --git a/README.md b/README.md index c6ad584..7b4af0e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # AgentLens -Developed at [MATS Exploration Phase](https://www.matsprogram.org/) under [Neel Nanda](https://github.com/neelnanda-io), for a research project with [Greg Kocher](https://github.com/gregkocher). +> **This repository has moved to [dreadnode/agent-lens](https://github.com/dreadnode/agent-lens).** This copy is no longer maintained — please use the new location for the latest code, issues, and contributions. A harness for running multi-session agent trajectories using the Claude Agent SDK, capturing them in [ATIF](https://harborframework.com/docs/agents/trajectory-format) (Agent Trajectory Interchange Format), and tracking file state changes across sessions. @@ -13,7 +13,7 @@ The harness takes a YAML config describing a sequence of sessions (prompts to an - **ATIF trajectories** — standardized JSON capturing every agent step, tool call, observation, and thinking block - **Shadow git change tracking** — automatic tracking of all file changes via an invisible git repo, with per-step write attribution and full unified diffs - **Session chaining** — three modes for controlling how sessions relate to each other (isolated, chained, forked) -- **Resampling & replay** — study behavioral variance at multiple levels: stateless API resampling, intervention testing (edit assistant text, tool results, or system prompts and resample), session-level resampling, and turn-level replay with full tool execution from any branch point +- **Resampling & replay** — study behavioral variance at multiple levels: stateless API resampling, intervention testing (edit inputs and resample), session-level resampling, and turn-level replay with full tool execution from any branch point - **Subagent capture** — separate ATIF trajectories for each subagent invocation, linked to the parent via `SubagentTrajectoryRef` ## Install @@ -325,7 +325,7 @@ Edit a captured API request and resample with the modified version — the CLI e # Step 1: Dump the request for editing harness resample-edit runs/my-run --session 1 --request 5 --dump > edit.json -# Step 2: Edit the JSON (assistant text, tool results, system prompt...) +# Step 2: Edit the JSON (thinking, text, tool results, system prompt...) # Step 3: Resample with the modified request harness resample-edit runs/my-run --session 1 --request 5 \ --input edit.json --label "removed hedging" --count 5 @@ -335,13 +335,11 @@ Pipe through `jq` for programmatic edits: ```bash harness resample-edit runs/my-run --session 1 --request 5 --dump \ - | jq '.system = "You are a cautious engineer. Double-check everything."' \ + | jq '.messages[-1].content[0].thinking = "Be more direct."' \ | harness resample-edit runs/my-run --session 1 --request 5 \ - --input - --label "cautious prompt" --count 10 + --input - --label "direct thinking" --count 10 ``` -> **Note:** Thinking blocks cannot be edited — they carry cryptographic signatures validated by the API. See [Thinking blocks](docs/guide/resampling.md#thinking-blocks-not-editable) for details. - Variants are saved alongside vanilla resamples and appear in the web UI. ### `harness resample-session` @@ -362,18 +360,13 @@ Replay a session from any API turn with full tool execution. Each replicate runs # List available turns harness replay runs/my-run --session 1 --list-turns -# Replay from turn 5, three times (only session 1 runs) +# Replay from turn 5, three times (runs in parallel) harness replay runs/my-run --session 1 --turn 5 --count 3 -# Replay session 1 turn 5, then continue with sessions 2, 3, etc. -harness replay runs/my-run --session 1 --turn 5 --continue-sessions - # Replay with an additional prompt after tool results harness replay runs/my-run --session 1 --turn 5 --prompt "Try a different approach" ``` -By default, replay only runs the targeted session. Use `--continue-sessions` to also run subsequent sessions from the original config. - Replay creates new run directories (e.g. `replay_my-run_s1_t5_r01_/`) with full artifacts. Each includes a `replay_meta.json` with provenance linking back to the source run, session, and turn. The source working directory is never modified. ## Web UI @@ -395,7 +388,7 @@ Open `http://localhost:5173`. The UI reads from the `runs/` directory and provid - **API captures** — request/response viewer with token usage, system prompts, tool definitions, compaction events - **Subagent viewer** — separate trajectory view for each subagent, with task prompt and return value - **Resamples** — compare N resample outputs for a given API turn -- **Edit & Resample** — interactive message editor for intervention testing: edit assistant text, tool results, or system prompts in the conversation, then resample with the modified input to study how changes affect behavior (thinking blocks are shown read-only — see [why](docs/guide/resampling.md#thinking-blocks-not-editable)) +- **Edit & Resample** — interactive message editor for intervention testing: edit thinking, text, tool results, or system prompts in the conversation, then resample with the modified input to study how changes affect behavior - **Changelog** — per-step file write log across all sessions with expandable diffs - **Config viewer** — frozen YAML config from the run - **Analysis** — rendered markdown from `analysis.md` diff --git a/docs/cli.md b/docs/cli.md index a7c30db..812ff6a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -126,7 +126,7 @@ Results are saved to `session_NN/resamples/request_NNN/` (and `request_NNN_vNN/` Edit a captured API request and resample with the modified version. -For intervention strategy and output details, see [Resampling & Replay](guide/resampling.md#intervention-testing). +For intervention strategy and output details, see [Resampling & Replay](guide/resampling.md#intervention-testing-edit-resample). ```bash harness resample-edit [OPTIONS] @@ -151,9 +151,7 @@ harness resample-edit [OPTIONS] harness resample-edit runs/my-run --session 1 --request 5 --dump > edit.json ``` -**Step 2** — Edit the JSON file (change assistant text, tool results, system prompt, etc.), then resample. - -> **Do not edit thinking blocks.** They carry cryptographic signatures validated by the API — any modification will cause a 400 error. See [Thinking blocks](guide/resampling.md#thinking-blocks-not-editable) for details. +**Step 2** — Edit the JSON file (change thinking, text, tool results, system prompt, etc.), then resample: ```bash harness resample-edit runs/my-run --session 1 --request 5 \ @@ -164,9 +162,9 @@ harness resample-edit runs/my-run --session 1 --request 5 \ ```bash harness resample-edit runs/my-run --session 1 --request 5 --dump \ - | jq '.system = "You are a cautious engineer. Always check for edge cases."' \ + | jq '.messages[-1].content[0].thinking = "I should be more direct."' \ | harness resample-edit runs/my-run --session 1 --request 5 \ - --input - --label "cautious prompt" --count 10 + --input - --label "direct thinking" --count 10 ``` ### Batch interventions @@ -174,9 +172,9 @@ harness resample-edit runs/my-run --session 1 --request 5 --dump \ ```bash for req in 3 5 7 9; do harness resample-edit runs/my-run --session 1 --request $req --dump \ - | jq '(.messages[] | select(.role == "user") | .content[] | select(.type == "tool_result")).content = "Error: file not found"' \ + | jq '.messages[-1].content[0].thinking = "Skip exploration, go straight to implementation."' \ | harness resample-edit runs/my-run --session 1 --request $req \ - --input - --label "tool-error" --count 5 + --input - --label "skip-exploration" --count 5 done ``` @@ -239,20 +237,18 @@ Turns in session 1 (12 total): ### Replaying -By default, only the targeted session is replayed. Use `--continue-sessions` to also run sessions after it. - ```bash -# Replay from turn 5, three times (only session 1 runs) +# Replay from turn 5, three times (runs in parallel) harness replay runs/my-run --session 1 --turn 5 --count 3 -# Replay session 1 turn 5, then continue with sessions 2, 3, etc. -harness replay runs/my-run --session 1 --turn 5 --continue-sessions - # Replay with an additional prompt harness replay runs/my-run --session 1 --turn 5 --prompt "Try a different approach" # Replay from turn 1 (re-run from scratch) harness replay runs/my-run --session 1 --turn 1 --count 2 + +# Replay session 1 turn 5, then continue sessions 2..end +harness replay runs/my-run --session 1 --turn 5 --continue-sessions ``` Each replay creates a new run directory (e.g. `replay_my-run_s1_t5_r01_2026-03-16T00-00-00/`) with full artifacts including `replay_meta.json` for provenance tracking. The source working directory is never modified — each replicate operates in its own git worktree. diff --git a/docs/glossary.md b/docs/glossary.md index 716194f..6f268d0 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -73,7 +73,7 @@ A full-fidelity re-execution from a specific turn. Each replicate runs in an iso ### Intervention (variant) -A modified resample — the API request is edited before being sent (e.g. changing assistant text, tool results, or system prompt) to test counterfactuals. Thinking blocks cannot be edited due to cryptographic signature requirements. Variants are saved alongside vanilla resamples with a `_vNN` suffix and include the edited request for reproducibility. +A modified resample — the API request is edited before being sent (e.g. changing a thinking block or system prompt) to test counterfactuals. Variants are saved alongside vanilla resamples with a `_vNN` suffix and include the edited request for reproducibility. ### Shadow git diff --git a/docs/guide/output.md b/docs/guide/output.md index c82580d..dc36712 100644 --- a/docs/guide/output.md +++ b/docs/guide/output.md @@ -13,8 +13,6 @@ runs// │ ├── session_01/ │ ├── trajectory.json # ATIF v1.6 trajectory (parent) -│ ├── transcript.jsonl # Claude Code transcript (for replay) -│ ├── uuid_map.json # turn correlation map (transcript ↔ ATIF ↔ raw dumps) │ ├── session_diff.patch # unified diff of this session's changes │ ├── subagent__.json # subagent ATIF trajectory (if any) │ ├── api_captures.jsonl # API request/response metadata diff --git a/docs/guide/resampling.md b/docs/guide/resampling.md index bfde192..a9912c9 100644 --- a/docs/guide/resampling.md +++ b/docs/guide/resampling.md @@ -25,7 +25,7 @@ Cheapest / fastest Most thorough | I want to... | Method | Command | |--------------|--------|---------| | Check if the model would say the same thing again | [Turn resample](#turn-level-resampling) | `harness resample` | -| See what happens if the model had seen different text or tool results | [Intervention](#intervention-testing) | `harness resample-edit` | +| See what happens if the model had different thinking | [Intervention](#intervention-testing) | `harness resample-edit` | | See what happens if a tool returned something different | [Intervention](#intervention-testing) | `harness resample-edit` | | Compare N complete trajectories for the same task | [Session resample](#session-level-resampling) | `harness resample-session` | | Branch from a specific point and let the agent continue | [Turn replay](#turn-level-replay) | `harness replay` | @@ -87,18 +87,17 @@ session_01/resamples/request_005/ ## Intervention testing -Edit the conversation inputs — text, tool results, or system prompt — then resample. This lets you test counterfactuals: "What would the model do differently if it had seen X instead of Y?" +Edit the conversation inputs — thinking blocks, text, tool results, or system prompt — then resample. This lets you test counterfactuals: "What would the model do differently if it had seen X instead of Y?" Like turn-level resampling, this is **stateless** — no tools execute. But the input is modified before sending, so you can study causal effects. **What you can edit:** -- **Assistant text** — alter what the model said in prior turns (e.g., remove hedging, change a decision) -- **Tool results** — change what a tool returned (e.g., different file contents, simulated errors) +- **Thinking blocks** — change the model's internal reasoning +- **Text responses** — alter what the model said in prior turns +- **Tool results** — change what a tool returned (e.g., different file contents) - **System prompt** — modify instructions -> **Note:** Thinking blocks are visible in the dump and UI but are **not editable** — the API requires cryptographic signatures on thinking blocks that can't survive modification. They are preserved as-is so the model retains its original reasoning context. See [Thinking blocks](#thinking-blocks) for details. - ### From the CLI Two-step workflow: dump the request, edit it, resample. @@ -107,7 +106,7 @@ Two-step workflow: dump the request, edit it, resample. # 1. Dump the request to a file harness resample-edit runs/my-run --session 1 --request 5 --dump > edit.json -# 2. Edit edit.json (change assistant text, tool results, system prompt...) +# 2. Edit edit.json (change thinking, text, tool results, system prompt...) # 3. Resample with the modified request harness resample-edit runs/my-run --session 1 --request 5 \ @@ -117,22 +116,20 @@ harness resample-edit runs/my-run --session 1 --request 5 \ For scriptable interventions, pipe through `jq`: ```bash -# Change the system prompt harness resample-edit runs/my-run --session 1 --request 5 --dump \ - | jq '.system = "You are a cautious engineer. Always check for edge cases."' \ + | jq '.messages[-1].content[0].thinking = "Be more direct."' \ | harness resample-edit runs/my-run --session 1 --request 5 \ - --input - --label "cautious prompt" --count 10 + --input - --label "direct thinking" --count 10 ``` Batch across multiple requests: ```bash -# Change a tool result across several turns for req in 3 5 7 9; do harness resample-edit runs/my-run --session 1 --request $req --dump \ - | jq '(.messages[] | select(.role == "user") | .content[] | select(.type == "tool_result")).content = "Error: file not found"' \ + | jq '.messages[-1].content[0].thinking = "Skip exploration."' \ | harness resample-edit runs/my-run --session 1 --request $req \ - --input - --label "tool-error" --count 5 + --input - --label "skip-exploration" --count 5 done ``` @@ -140,7 +137,7 @@ done 1. Open a session's API captures 2. Click "Edit & Resample" on any request -3. Modify text, tool results, or system prompts (thinking blocks are shown read-only) +3. Modify thinking blocks, text, tool results, or system prompts 4. Resample with the modified input ### Output @@ -214,24 +211,22 @@ Bracketed tags (e.g. `[_step_1_3]`) indicate shadow git snapshots — turns wher ### Running -By default, replay **only runs the targeted session** — it branches from the specified turn and lets the agent continue until that session ends. Subsequent sessions from the original config are not run. - -To replay the full remaining experiment (the targeted session *and* all sessions after it), use `--continue-sessions`. - ```bash -# Replay from turn 5, three times (only session 1 runs) +# Replay from turn 5, three times (runs in parallel) harness replay runs/my-run --session 1 --turn 5 --count 3 -# Replay session 1 turn 5, then continue with sessions 2, 3, etc. -harness replay runs/my-run --session 1 --turn 5 --continue-sessions - # Replay with an additional prompt after tool results harness replay runs/my-run --session 1 --turn 5 --prompt "Try a different approach" # Replay from turn 1 (re-run from scratch with same config) harness replay runs/my-run --session 1 --turn 1 --count 2 + +# Replay session 1 turn 5, then continue with sessions 2..end +harness replay runs/my-run --session 1 --turn 5 --continue-sessions ``` +When `--continue-sessions` is enabled, each replicate runs the replayed session first, then continues with sessions `N+1..end` from the original config. + ### Output Each replay creates a new independent run directory: @@ -258,28 +253,6 @@ runs/replay_my-run_s1_t5_r01_2026-03-16T00-00-00/ Each session generates a `uuid_map.json` that correlates entries across the three data formats (transcript, ATIF trajectory, raw API dumps). The primary join key is `tool_call_id`. The replay system uses this to find shadow git tags for filesystem reset. -### Thinking blocks (not editable) - -> **Warning:** Thinking blocks cannot be edited in interventions. Any attempt to modify thinking content in a dumped request JSON will cause the API to reject the request with a 400 error. The UI editor shows thinking blocks as read-only. - -#### Why: cryptographic signatures - -When the Anthropic API returns a response with extended thinking enabled, each `thinking` block includes a cryptographic `signature` field. On subsequent requests, the API validates this signature to confirm the thinking content has not been tampered with. This is a server-side integrity check — there is no way to regenerate or forge a valid signature outside of Anthropic's infrastructure. - -This means: -- **Unmodified thinking blocks** have valid signatures and are accepted by the API -- **Edited thinking blocks** have invalidated signatures and are rejected (HTTP 400) -- **Stripped signatures** (keeping the text but removing the `signature` field) are also rejected - -`redacted_thinking` blocks are similarly protected — they contain opaque encrypted content that cannot be inspected or modified. - -#### What this means for interventions - -All resampling methods preserve thinking blocks with their original signatures intact, so the model always sees its full original reasoning context. This is faithful — the model receives the same thinking it originally produced. - -To test counterfactuals about model behavior, edit the fields that *are* modifiable: -- **Assistant text** — change what the model said (its visible output) -- **Tool results** — change what a tool returned (e.g., different file contents, simulated errors) -- **System prompt** — change the instructions +### Thinking signatures -These fields have no signature requirements and can be freely modified. +When resampling, the harness automatically strips thinking block signatures from the request. Signatures are response-specific and would cause errors if replayed verbatim. diff --git a/docs/guide/web-ui.md b/docs/guide/web-ui.md index ac08677..aa9b841 100644 --- a/docs/guide/web-ui.md +++ b/docs/guide/web-ui.md @@ -22,7 +22,7 @@ Configure the UI via `ui/.env` or shell environment: | `ANTHROPIC_API_KEY` | — | Required for resampling via Anthropic API | | `ANTHROPIC_BASE_URL` | `https://api.anthropic.com` | Override the API base URL for resampling | -The resampling API keys are needed for any resampling in the UI (both vanilla resamples and "Edit & Resample"). The UI auto-detects whether to use OpenRouter or Anthropic based on the original run's API target. +The resampling API keys are only needed if you use the "Edit & Resample" feature in the UI. The UI auto-detects whether to use OpenRouter or Anthropic based on the original run's API target. ## Features @@ -62,7 +62,7 @@ Compare N resample outputs for a given API turn side-by-side. ### Edit & Resample Interactive message editor for intervention testing: -1. Edit assistant text, tool results, or system prompts (thinking blocks are shown read-only) +1. Edit thinking blocks, text, tool results, or system prompts 2. Resample with the modified input 3. Compare original vs. variant responses diff --git a/docs/index.md b/docs/index.md index 63577d4..af67c99 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,7 +11,7 @@ The harness takes a YAML config describing a sequence of sessions (prompts to an - **ATIF trajectories** — standardized JSON capturing every agent step, tool call, observation, and thinking block - **Shadow git change tracking** — automatic tracking of all file changes via an invisible git repo, with per-step write attribution and full unified diffs - **Session chaining** — three modes for controlling how sessions relate to each other (isolated, chained, forked) -- **Resampling & replay** — four methods for studying behavioral variance, from quick API resampling to full trajectory replay with tool execution. Edit assistant text, tool results, or system prompts to test counterfactuals +- **Resampling & replay** — four methods for studying behavioral variance, from quick API resampling to full trajectory replay with tool execution. Edit thinking, text, tool results, or prompts to test counterfactuals - **Subagent capture** — separate ATIF trajectories for each subagent invocation, linked to the parent via `SubagentTrajectoryRef` ## Next steps diff --git a/examples/chained.yaml b/examples/chained.yaml index d6e8abf..fa5e212 100644 --- a/examples/chained.yaml +++ b/examples/chained.yaml @@ -1,4 +1,4 @@ -model: "claude-sonnet-4-20250514" +model: "claude-sonnet-4-5" provider: anthropic work_dir: "./repos/test_repo" session_mode: chained diff --git a/examples/isolated.yaml b/examples/isolated.yaml index d25e202..8c51ebd 100644 --- a/examples/isolated.yaml +++ b/examples/isolated.yaml @@ -1,4 +1,4 @@ -model: "claude-sonnet-4-20250514" +model: "claude-sonnet-4-5" provider: anthropic work_dir: "./repos/test_repo" session_mode: isolated diff --git a/src/harness/cli.py b/src/harness/cli.py index a3d6d59..da54b8e 100644 --- a/src/harness/cli.py +++ b/src/harness/cli.py @@ -72,10 +72,7 @@ def list_runs( typer.echo("No runs directory found.") raise typer.Exit() - run_dirs = sorted( - d for d in runs_dir.iterdir() - if d.is_dir() and not d.name.startswith(("_", ".")) - ) + run_dirs = sorted(runs_dir.iterdir()) if not run_dirs: if output_json: typer.echo("[]") diff --git a/src/harness/config.py b/src/harness/config.py index 84f3cc3..aaf09a1 100644 --- a/src/harness/config.py +++ b/src/harness/config.py @@ -150,5 +150,17 @@ def build_provider_env(config: RunConfig) -> dict[str, str]: env["CLAUDE_CODE_USE_BEDROCK"] = "1" elif config.provider == "vertex": env["CLAUDE_CODE_USE_VERTEX"] = "1" + elif config.provider == "ollama": + ollama_url = config.base_url or os.environ.get( + "OLLAMA_HOST", "http://localhost:11434" + ) + env["ANTHROPIC_BASE_URL"] = ollama_url + env["ANTHROPIC_API_KEY"] = "" + env["ANTHROPIC_AUTH_TOKEN"] = "ollama" + env["CLAUDE_CODE_ATTRIBUTION_HEADER"] = "0" + env["ANTHROPIC_DEFAULT_OPUS_MODEL"] = config.model + env["ANTHROPIC_DEFAULT_SONNET_MODEL"] = config.model + env["ANTHROPIC_DEFAULT_HAIKU_MODEL"] = config.model + env["CLAUDE_CODE_SUBAGENT_MODEL"] = config.model return env diff --git a/src/harness/proxy.py b/src/harness/proxy.py index 3ae6615..ce95cb2 100644 --- a/src/harness/proxy.py +++ b/src/harness/proxy.py @@ -14,6 +14,7 @@ import hashlib import json import logging +import os from datetime import datetime, timezone from pathlib import Path @@ -80,6 +81,7 @@ class CaptureProxy: def __init__(self, raw_dump_count: int = 0) -> None: self._target_url: str = "" + self._provider: str | None = None self._log_path: Path | None = None self._site: web.TCPSite | None = None self._runner: web.AppRunner | None = None @@ -91,9 +93,12 @@ def __init__(self, raw_dump_count: int = 0) -> None: self._seen_system_hashes: set[str | None] = set() self._seen_tools_hashes: set[str | None] = set() - async def start(self, target_url: str, log_path: Path) -> int: + async def start( + self, target_url: str, log_path: Path, provider: str | None = None + ) -> int: """Start the proxy server. Returns the assigned port.""" self._target_url = target_url.rstrip("/") + self._provider = provider self._log_path = log_path self._request_index = 0 self._main_system_hash = None @@ -202,8 +207,13 @@ async def _handle(self, request: web.Request) -> web.StreamResponse: hdr_path = raw_dir / f"request_{idx:03d}_headers.json" with open(hdr_path, "w") as f: json.dump( - {"method": request.method, "path": request.path, - "target": target, "headers": safe_headers}, + { + "method": request.method, + "path": request.path, + "provider": self._provider, + "target": target, + "headers": safe_headers, + }, f, indent=2, ) # Response @@ -331,6 +341,8 @@ def get_target_url(provider: str, base_url: str | None) -> str: return base_url if provider == "openrouter": return "https://openrouter.ai/api" + if provider == "ollama": + return os.environ.get("OLLAMA_HOST", "http://localhost:11434") # Default to Anthropic API for all other providers. # For bedrock/vertex, Claude Code may or may not route through # ANTHROPIC_BASE_URL — if it doesn't, the proxy will simply diff --git a/src/harness/replay.py b/src/harness/replay.py index eaf36e3..80ef67a 100644 --- a/src/harness/replay.py +++ b/src/harness/replay.py @@ -131,6 +131,9 @@ async def run_replay( typer.echo(f"Error: No session config for index {session_index}", err=True) raise typer.Exit(1) + # Resolve paths + project_dir = get_project_dir(str(Path(config.work_dir).resolve())) + # Source shadow git for worktree creation source_shadow_git_dir = source_run_dir / ".shadow_git" if not source_shadow_git_dir.exists(): @@ -157,7 +160,7 @@ async def run_replay( ) # Create worktrees for all replicates - worktree_base = output_base / "_worktrees" / f"replay_{source_name}_s{session_index}_t{turn_index}" + worktree_base = output_base / ".worktrees" / f"replay_{source_name}_s{session_index}_t{turn_index}" worktree_base.mkdir(parents=True, exist_ok=True) worktree_paths: list[Path] = [] @@ -183,6 +186,7 @@ async def run_replay( tool_result_entries=tool_result_entries, source_session_id=source_session_id, prompt_override=prompt_override, + project_dir=project_dir, output_base=output_base, reset_tag=reset_tag, continue_sessions=continue_sessions, @@ -200,11 +204,6 @@ async def run_replay( except Exception: logger.warning("Failed to remove worktree: %s", wt) shutil.rmtree(worktree_base, ignore_errors=True) - # Clean up _worktrees parent if empty - try: - worktree_base.parent.rmdir() - except OSError: - pass # not empty or already gone # Process results new_dirs: list[Path] = [] @@ -251,6 +250,7 @@ async def _run_single_replicate( tool_result_entries: list[dict], source_session_id: str, prompt_override: str | None, + project_dir: Path, output_base: Path, reset_tag: str, continue_sessions: bool, @@ -268,11 +268,9 @@ async def _run_single_replicate( replay_run_dir = output_base / run_name replay_run_dir.mkdir(parents=True) - # Write truncated transcript to Claude's project dir for the worktree path - # (SDK computes project hash from cwd, which is the worktree, not original work_dir) - worktree_project_dir = get_project_dir(str(worktree_dir.resolve())) + # Write truncated transcript to Claude's project dir truncated_path = write_truncated_transcript( - truncated_entries, new_session_id, worktree_project_dir + truncated_entries, new_session_id, project_dir ) # Save a copy in the replay run for reference @@ -293,11 +291,7 @@ async def _run_single_replicate( # Build AsyncIterable prompt prompt: str | AsyncIterable[dict[str, Any]] if turn_index == 1: - # Replay from scratch — use original prompt, optionally with override appended - if prompt_override: - prompt = f"{session_config.prompt}\n\n{prompt_override}" - else: - prompt = session_config.prompt + prompt = session_config.prompt resume_id = None else: prompt = _build_replay_prompt(tool_result_entries, prompt_override) @@ -333,6 +327,7 @@ async def _run_single_replicate( # Optionally continue with remaining sessions from config results: list[SessionResult] = [result] session_ids: dict[int, str | None] = {session_index: result.session_id} + fork_counts: dict[int | None, int] = {} if continue_sessions: for sc in sorted(config.sessions, key=lambda s: s.session_index): @@ -372,16 +367,16 @@ async def _run_single_replicate( resume_id = session_ids[1] fork = True - # Determine if working dir reset is needed for this fork. - # In replay context, we always reset for forked sessions because - # the replayed session (or prior continuation sessions) will have - # mutated the worktree since the fork point's clean state. + # Determine if working dir reset is needed for this fork effective_fork_from = fork_from if effective_fork_from is None and config.session_mode.value == "forked": effective_fork_from = 1 needs_reset = False if fork or config.session_mode.value == "forked": - needs_reset = True + fork_key = effective_fork_from + fork_counts[fork_key] = fork_counts.get(fork_key, 0) + 1 + if fork_counts[fork_key] > 1 or rep_idx > 1: + needs_reset = True replay_shadow_git.begin_session( sc.session_index, diff --git a/src/harness/resample.py b/src/harness/resample.py index cae63ac..5bd2bfb 100644 --- a/src/harness/resample.py +++ b/src/harness/resample.py @@ -17,6 +17,7 @@ import logging from datetime import datetime, timezone from pathlib import Path +from urllib.parse import urlparse import httpx import typer @@ -56,14 +57,54 @@ def _load_headers(raw_dumps_dir: Path, request_index: int) -> dict: return {} +def _clean_thinking_signatures(messages: list) -> list: + """Strip thinking block signatures — they're response-specific.""" + cleaned = [] + for msg in messages: + if isinstance(msg.get("content"), list): + cleaned_content = [] + for block in msg["content"]: + if block.get("type") == "thinking": + cleaned_content.append({ + "type": "thinking", + "thinking": block.get("thinking", ""), + }) + else: + cleaned_content.append(block) + cleaned.append({**msg, "content": cleaned_content}) + else: + cleaned.append(msg) + return cleaned + + +def _is_ollama_target(target_url: str, provider: str | None = None) -> bool: + """Check if a target URL points to an Ollama server.""" + if provider == "ollama": + return True + + parsed = urlparse(target_url) + host = parsed.hostname or "" + port = parsed.port + + if host in {"localhost", "127.0.0.1"}: + return True + + return port == 11434 + def _build_headers( - captured_headers: dict[str, str], api_key: str, target_url: str + captured_headers: dict[str, str], + api_key: str, + target_url: str, + provider: str | None = None, ) -> dict[str, str]: """Build replay headers from captured headers, replacing auth.""" headers = {**captured_headers} # Use the right auth header for the target - if "openrouter.ai" in target_url: + if _is_ollama_target(target_url, provider) and not api_key: + headers["x-api-key"] = "ollama" + headers.pop("Authorization", None) + elif "openrouter.ai" in target_url: headers["Authorization"] = f"Bearer {api_key}" headers.pop("x-api-key", None) else: @@ -78,16 +119,19 @@ def _build_headers( return headers -def _resolve_api_config(raw_dumps_dir: Path, request_index: int) -> tuple[str, dict[str, str]]: +def _resolve_api_config( + raw_dumps_dir: Path, request_index: int +) -> tuple[str, dict[str, str], str | None]: """Resolve target URL and captured headers for a request. - Returns (target_url, captured_headers). + Returns (target_url, captured_headers, provider). """ import os hdr_data = _load_headers(raw_dumps_dir, request_index) target_url = hdr_data.get("target") captured_headers = hdr_data.get("headers", {}) + provider = hdr_data.get("provider") if not target_url: base_url = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") @@ -97,14 +141,16 @@ def _resolve_api_config(raw_dumps_dir: Path, request_index: int) -> tuple[str, d "content-type": "application/json", } - return target_url, captured_headers + return target_url, captured_headers, provider -def _resolve_api_key(target_url: str) -> str: +def _resolve_api_key(target_url: str, provider: str | None = None) -> str: """Resolve the API key based on the target URL.""" import os - if "openrouter.ai" in target_url: + if _is_ollama_target(target_url, provider): + return "" + elif "openrouter.ai" in target_url: api_key = os.environ.get("OPENROUTER_API_KEY", "") if not api_key: typer.echo("Error: OPENROUTER_API_KEY not set", err=True) @@ -126,14 +172,12 @@ async def _call_api(url: str, headers: dict[str, str], request_data: dict) -> di def _prepare_request( - request_data: dict, - model_override: str | None = None, + request_data: dict, model_override: str | None = None ) -> dict: """Apply standard modifications to a request before resampling.""" request_data["stream"] = False - # Remove SDK-specific fields that aren't part of the public API - for key in ("context_management", "metadata"): - request_data.pop(key, None) + if isinstance(request_data.get("messages"), list): + request_data["messages"] = _clean_thinking_signatures(request_data["messages"]) if model_override: request_data["model"] = model_override return request_data @@ -230,13 +274,7 @@ def dump_request( request_index: int, replicate: int | None = None, ) -> dict: - """Load and prepare a request for editing. Returns the request data. - - Thinking blocks are preserved in the dump so researchers can read them - for context. They are stripped automatically at send time (variant - resampling) since the API requires cryptographic signatures that can't - survive edits. - """ + """Load and prepare a request for editing. Returns the request data.""" session_dir = resolve_session_dir(run_dir, session_index, replicate) raw_dumps_dir = session_dir / "raw_dumps" request_data = _load_request(raw_dumps_dir, request_index) @@ -263,12 +301,14 @@ async def run_resample( raw_dumps_dir = session_dir / "raw_dumps" request_data = _load_request(raw_dumps_dir, request_index) - target_url, captured_headers = _resolve_api_config(raw_dumps_dir, request_index) + target_url, captured_headers, provider = _resolve_api_config( + raw_dumps_dir, request_index + ) _prepare_request(request_data, model_override) - api_key = _resolve_api_key(target_url) - headers = _build_headers(captured_headers, api_key, target_url) + api_key = _resolve_api_key(target_url, provider) + headers = _build_headers(captured_headers, api_key, target_url, provider) # Output directory resample_dir = session_dir / "resamples" / f"request_{request_index:03d}" @@ -359,17 +399,17 @@ async def run_variant_resample( raw_dumps_dir = session_dir / "raw_dumps" # Resolve API config from the original request headers - target_url, captured_headers = _resolve_api_config(raw_dumps_dir, request_index) + target_url, captured_headers, provider = _resolve_api_config( + raw_dumps_dir, request_index + ) - # Clean up edited request for API compatibility - for key in ("context_management", "metadata"): - edited_request.pop(key, None) + # Apply model override to the edited request if model_override: edited_request["model"] = model_override edited_request["stream"] = False - api_key = _resolve_api_key(target_url) - headers = _build_headers(captured_headers, api_key, target_url) + api_key = _resolve_api_key(target_url, provider) + headers = _build_headers(captured_headers, api_key, target_url, provider) # Create variant directory variant_id = _next_variant_id(session_dir, request_index) diff --git a/src/harness/resample_session.py b/src/harness/resample_session.py index b18014b..8c38084 100644 --- a/src/harness/resample_session.py +++ b/src/harness/resample_session.py @@ -144,14 +144,10 @@ async def run_resample_session( with open(meta_path) as f: meta = json.load(f) - # Determine fork_from session_id and worktree reset tag + # Determine fork_from session_id fork_from = session_config.fork_from resume_id: str | None = None - # Handle session_mode: forked (implicit fork from session 1) - if fork_from is None and config.session_mode.value == "forked" and session_index > 1: - fork_from = 1 - if fork_from is not None: for s in meta["sessions"]: if s["session_index"] == fork_from and s.get("session_id"): @@ -170,9 +166,6 @@ async def run_resample_session( err=True, ) - # Worktree tag: use the fork point's end state, not baseline - worktree_ref = f"session_{fork_from:02d}" if fork_from is not None else "baseline" - # Find next replicate number existing = _find_existing_replicates(run_dir, session_index) plain_dir = run_dir / f"session_{session_index:02d}" @@ -199,7 +192,7 @@ async def run_resample_session( ) # Create worktrees for all replicates - worktree_base = run_dir / "_worktrees" / f"resample_s{session_index}" + worktree_base = run_dir / ".worktrees" / f"resample_s{session_index}" worktree_base.mkdir(parents=True, exist_ok=True) worktree_paths: list[Path] = [] @@ -207,7 +200,7 @@ async def run_resample_session( for i in range(count): rep = next_num + i wt = worktree_base / f"rep_{rep:02d}" - source_shadow_git.add_worktree(wt, worktree_ref) + source_shadow_git.add_worktree(wt, "baseline") worktree_paths.append(wt) # Launch all replicates in parallel @@ -236,11 +229,6 @@ async def run_resample_session( except Exception: logger.warning("Failed to remove worktree: %s", wt) shutil.rmtree(worktree_base, ignore_errors=True) - # Clean up _worktrees parent if empty - try: - worktree_base.parent.rmdir() - except OSError: - pass # not empty or already gone # Process results new_dirs: list[Path] = [] diff --git a/src/harness/runner.py b/src/harness/runner.py index f528b44..f8ed72f 100644 --- a/src/harness/runner.py +++ b/src/harness/runner.py @@ -90,7 +90,7 @@ async def run_session( max_turns = session_config.max_turns or run_config.max_turns # Inject working directory and memory file hint - cwd = str(Path(cwd_override).resolve()) if cwd_override else str(Path(run_config.work_dir).resolve()) + cwd = cwd_override or str(Path(run_config.work_dir).resolve()) memory_path = Path(cwd) / run_config.memory_file file_hint = ( f"\n\nYour working directory is {cwd}\n" @@ -159,7 +159,9 @@ async def run_session( target_url = get_target_url(run_config.provider, run_config.base_url) proxy = CaptureProxy(raw_dump_count=9999) port = await proxy.start( - target_url, session_dir / "api_captures.jsonl" + target_url, + session_dir / "api_captures.jsonl", + provider=run_config.provider, ) provider_env["ANTHROPIC_BASE_URL"] = f"http://127.0.0.1:{port}" options.env = provider_env diff --git a/src/harness/transcript.py b/src/harness/transcript.py index 1796f1b..99cd63c 100644 --- a/src/harness/transcript.py +++ b/src/harness/transcript.py @@ -102,21 +102,14 @@ def _flush_turn() -> None: current_turn.assistant_lines.append(entry) # Detect content block types - content = msg.get("content", []) - if isinstance(content, str): - # String-form content (no tool_use blocks to detect) - pass - else: - for block in content: - if not isinstance(block, dict): - continue - block_type = block.get("type") - if block_type == "tool_use": - current_turn.has_tool_use = True - current_tool_use_ids.add(block.get("id", "")) - tool_name = block.get("name", "") - if tool_name: - current_turn.tool_names.append(tool_name) + for block in msg.get("content", []): + block_type = block.get("type") + if block_type == "tool_use": + current_turn.has_tool_use = True + current_tool_use_ids.add(block.get("id", "")) + tool_name = block.get("name", "") + if tool_name: + current_turn.tool_names.append(tool_name) elif entry_type == "user": if not seen_first_assistant: @@ -288,13 +281,7 @@ def list_turns( has_thinking = False for entry in turn.assistant_lines: msg = entry.get("message", {}) - content = msg.get("content", []) - if isinstance(content, str): - has_text = True - continue - for block in content: - if not isinstance(block, dict): - continue + for block in msg.get("content", []): if block.get("type") == "text": has_text = True elif block.get("type") == "thinking": diff --git a/tests/edit_test.yaml b/tests/edit_test.yaml deleted file mode 100644 index bc002ed..0000000 --- a/tests/edit_test.yaml +++ /dev/null @@ -1,22 +0,0 @@ -model: "claude-sonnet-4-20250514" -provider: openrouter -work_dir: "./repos/test_repo" -session_mode: isolated -tags: ["edit-test"] -capture_api_requests: true -revert_work_dir: true - -max_turns: 15 -permission_mode: bypassPermissions - -memory_seed: "# Project Notes\n" - -sessions: - - session_index: 1 - prompt: | - Make these changes to main.py: - 1. Add a `multiply(a, b)` function - 2. Add a `greet(name)` function that returns "Hello, {name}!" - 3. Update the __main__ block to call both new functions with example args and print the results - 4. Create a new file called utils.py with a `is_even(n)` function - Then write a summary of what you changed to MEMORY.md. diff --git a/tests/final_chained.yaml b/tests/final_chained.yaml deleted file mode 100644 index 6d6f697..0000000 --- a/tests/final_chained.yaml +++ /dev/null @@ -1,24 +0,0 @@ -model: "claude-sonnet-4-20250514" -provider: anthropic -hypothesis: "Final validation: chained mode across 3 sessions with full context" -work_dir: "./repos/test_repo" -session_mode: chained -max_turns: 10 -permission_mode: bypassPermissions -capture_api_requests: true -revert_work_dir: true -max_budget_usd: 1.00 -tags: ["final-test", "chained"] - -system_prompt: | - You are reviewing a small Python project. Use MEMORY.md to keep notes across sessions. - -memory_seed: "# Project Notes\n" - -sessions: - - session_index: 1 - prompt: "Read main.py and note the functions defined. Write a summary to MEMORY.md." - - session_index: 2 - prompt: "Based on what you found in session 1, read any other Python files in the project. Update MEMORY.md with the full picture." - - session_index: 3 - prompt: "Review MEMORY.md and write a final one-paragraph project summary at the top of the file." diff --git a/tests/final_isolated_subagent.yaml b/tests/final_isolated_subagent.yaml deleted file mode 100644 index 9859e86..0000000 --- a/tests/final_isolated_subagent.yaml +++ /dev/null @@ -1,33 +0,0 @@ -model: "claude-sonnet-4-20250514" -provider: anthropic -hypothesis: "Final validation: isolated mode with subagent delegation across 3 sessions" -work_dir: "./repos/test_repo" -session_mode: isolated -max_turns: 10 -permission_mode: bypassPermissions -capture_api_requests: true -capture_subagent_trajectories: true -revert_work_dir: true -max_budget_usd: 1.00 -tags: ["final-test", "isolated", "subagent"] - -system_prompt: | - You are reviewing a small Python project. Use MEMORY.md to keep notes. - Delegate file reading to your code-reader agent when appropriate. - -memory_seed: "# Project Notes\n" - -agents: - - name: "code-reader" - description: "Reads source files and reports their contents and structure." - prompt: "You are a code reading specialist. Read the requested files and report what you find. Be concise." - tools: ["Read", "Glob", "Grep"] - model: "sonnet" - -sessions: - - session_index: 1 - prompt: "Use the code-reader to read main.py and summarize what functions are defined. Write your findings to MEMORY.md." - - session_index: 2 - prompt: "Read MEMORY.md for prior notes. Then use the code-reader to check if there's a utils.py file. Update MEMORY.md with what you find." - - session_index: 3 - prompt: "Read MEMORY.md for context from prior sessions. Write a brief project summary based on your accumulated notes." diff --git a/tests/subagent_3session.yaml b/tests/subagent_3session.yaml deleted file mode 100644 index 54fe66f..0000000 --- a/tests/subagent_3session.yaml +++ /dev/null @@ -1,31 +0,0 @@ -model: "claude-sonnet-4-20250514" -provider: openrouter -work_dir: "./repos/test_repo" -session_mode: chained -tags: ["subagent-test", "3-session"] -capture_api_requests: true -capture_subagent_trajectories: true - -system_prompt: | - You are exploring a small Python project. Delegate file reading to your code-explorer subagent. - -max_turns: 15 -max_budget_usd: 1.00 -permission_mode: bypassPermissions - -agents: - - name: "code-explorer" - description: "Explores code structure, reads files, and reports findings." - prompt: "You are a code exploration specialist. Read the files you are asked about and report what you find. Be concise." - tools: ["Read", "Glob", "Grep"] - model: "sonnet" - -memory_seed: "# Project Notes\n" - -sessions: - - session_index: 1 - prompt: "Use the code-explorer agent to read main.py. Then write a brief summary of what you learned to MEMORY.md." - - session_index: 2 - prompt: "Use the code-explorer agent to find all Python files in the project and list them. Update MEMORY.md with what you find." - - session_index: 3 - prompt: "Use the code-explorer agent to check if there are any test files. Summarize the full project structure in MEMORY.md." diff --git a/tests/test_config.py b/tests/test_config.py index 2e59e20..5347d67 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -261,9 +261,42 @@ def test_vertex_provider(self): env = build_provider_env(rc) assert env["CLAUDE_CODE_USE_VERTEX"] == "1" + def test_ollama_provider(self): + rc = RunConfig.model_validate( + _minimal(provider="ollama", model="kimi-k2.5:cloud") + ) + env = build_provider_env(rc) + assert env["ANTHROPIC_BASE_URL"] == "http://localhost:11434" + assert env["ANTHROPIC_API_KEY"] == "" + assert env["ANTHROPIC_AUTH_TOKEN"] == "ollama" + assert env["CLAUDE_CODE_ATTRIBUTION_HEADER"] == "0" + assert env["ANTHROPIC_DEFAULT_OPUS_MODEL"] == "kimi-k2.5:cloud" + assert env["ANTHROPIC_DEFAULT_SONNET_MODEL"] == "kimi-k2.5:cloud" + assert env["ANTHROPIC_DEFAULT_HAIKU_MODEL"] == "kimi-k2.5:cloud" + assert env["CLAUDE_CODE_SUBAGENT_MODEL"] == "kimi-k2.5:cloud" + + def test_ollama_custom_host(self, monkeypatch): + monkeypatch.setenv("OLLAMA_HOST", "http://gpu-box:11434") + rc = RunConfig.model_validate( + _minimal(provider="ollama", model="qwen3.5:cloud") + ) + env = build_provider_env(rc) + assert env["ANTHROPIC_BASE_URL"] == "http://gpu-box:11434" + + def test_ollama_base_url_override(self): + rc = RunConfig.model_validate( + _minimal( + provider="ollama", + model="kimi-k2.5:cloud", + base_url="http://custom:9999", + ) + ) + env = build_provider_env(rc) + assert env["ANTHROPIC_BASE_URL"] == "http://custom:9999" + def test_claudecode_unset(self): """All providers should unset CLAUDECODE to allow nested launches.""" - for provider in ["openrouter", "anthropic", "bedrock", "vertex"]: + for provider in ["openrouter", "anthropic", "bedrock", "vertex", "ollama"]: rc = RunConfig.model_validate(_minimal(provider=provider)) env = build_provider_env(rc) assert env["CLAUDECODE"] == "" diff --git a/tests/test_resample.py b/tests/test_resample.py index 8f97b4b..1743e70 100644 --- a/tests/test_resample.py +++ b/tests/test_resample.py @@ -9,8 +9,11 @@ from harness.resample import ( _build_headers, + _clean_thinking_signatures, + _is_ollama_target, _next_variant_id, _prepare_request, + _resolve_api_key, dump_request, list_requests, resolve_session_dir, @@ -30,6 +33,7 @@ def _make_raw_dumps(session_dir: Path, requests: list[dict]) -> Path: (raw / f"request_{i:03d}_headers.json").write_text(json.dumps({ "target": "https://api.anthropic.com/v1/messages", "headers": {"content-type": "application/json"}, + "provider": "anthropic", })) return raw @@ -90,8 +94,7 @@ def test_sets_stream_false(self): result = _prepare_request(req) assert result["stream"] is False - def test_preserves_thinking_signatures(self): - """Thinking blocks are preserved as-is (signatures required by API).""" + def test_cleans_thinking_signatures(self): req = { "model": "test", "messages": [ @@ -101,7 +104,7 @@ def test_preserves_thinking_signatures(self): ], } result = _prepare_request(req) - assert result["messages"][0]["content"][0]["signature"] == "s" + assert "signature" not in result["messages"][0]["content"][0] def test_model_override(self): req = {"model": "old", "stream": True} @@ -130,13 +133,12 @@ def test_dump_replicate(self, tmp_path: Path): req = dump_request(tmp_path, 2, 1, replicate=1) assert req["model"] == "claude-test" - def test_dump_preserves_signatures(self, tmp_path: Path): - """Thinking block signatures are preserved (required by API).""" + def test_dump_cleans_signatures(self, tmp_path: Path): _make_session(tmp_path, 1) req = dump_request(tmp_path, 1, 2) # request 2 has thinking blocks assistant_msg = req["messages"][1] thinking_block = assistant_msg["content"][0] - assert thinking_block["signature"] == "sig1" + assert "signature" not in thinking_block # --------------------------------------------------------------------------- @@ -202,7 +204,73 @@ def test_ignores_other_requests(self, tmp_path: Path): # --------------------------------------------------------------------------- -# _build_headers +# _clean_thinking_signatures (kept from original) +# --------------------------------------------------------------------------- + +class TestCleanThinkingSignatures: + def test_strips_signatures(self): + messages = [ + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "Let me think...", + "signature": "abc123", + }, + {"type": "text", "text": "Here's my answer"}, + ], + } + ] + cleaned = _clean_thinking_signatures(messages) + thinking_block = cleaned[0]["content"][0] + + assert thinking_block["type"] == "thinking" + assert thinking_block["thinking"] == "Let me think..." + assert "signature" not in thinking_block + + def test_preserves_non_thinking_blocks(self): + messages = [ + { + "role": "assistant", + "content": [ + {"type": "text", "text": "hello"}, + {"type": "tool_use", "id": "tc1", "name": "Read", "input": {}}, + ], + } + ] + cleaned = _clean_thinking_signatures(messages) + assert cleaned[0]["content"][0] == {"type": "text", "text": "hello"} + assert cleaned[0]["content"][1]["type"] == "tool_use" + + def test_handles_string_content(self): + messages = [{"role": "user", "content": "Hello!"}] + cleaned = _clean_thinking_signatures(messages) + assert cleaned == messages + + def test_handles_empty_messages(self): + assert _clean_thinking_signatures([]) == [] + + def test_multiple_thinking_blocks(self): + messages = [ + { + "role": "assistant", + "content": [ + {"type": "thinking", "thinking": "thought 1", "signature": "s1"}, + {"type": "text", "text": "answer"}, + {"type": "thinking", "thinking": "thought 2", "signature": "s2"}, + ], + } + ] + cleaned = _clean_thinking_signatures(messages) + assert "signature" not in cleaned[0]["content"][0] + assert "signature" not in cleaned[0]["content"][2] + assert cleaned[0]["content"][0]["thinking"] == "thought 1" + assert cleaned[0]["content"][2]["thinking"] == "thought 2" + + +# --------------------------------------------------------------------------- +# _build_headers (kept from original) # --------------------------------------------------------------------------- class TestBuildHeaders: @@ -261,3 +329,52 @@ def test_preserves_other_headers(self): ) assert headers["content-type"] == "application/json" assert headers["anthropic-version"] == "2023-06-01" + + def test_ollama_headers_no_api_key(self): + headers = _build_headers( + captured_headers={"content-type": "application/json"}, + api_key="", + target_url="http://localhost:11434/v1/messages", + ) + assert headers["x-api-key"] == "ollama" + assert "Authorization" not in headers + + def test_ollama_headers_provider_marker(self): + headers = _build_headers( + captured_headers={"content-type": "application/json"}, + api_key="", + target_url="http://custom:9999/v1/messages", + provider="ollama", + ) + assert headers["x-api-key"] == "ollama" + + +class TestIsOllamaTarget: + def test_localhost(self): + assert _is_ollama_target("http://localhost:11434/v1/messages") is True + + def test_loopback(self): + assert _is_ollama_target("http://127.0.0.1:11434/v1/messages") is True + + def test_default_port(self): + assert _is_ollama_target("http://gpu-box:11434/v1/messages") is True + + def test_provider_marker(self): + assert _is_ollama_target("http://custom:9999/v1/messages", "ollama") is True + + def test_anthropic(self): + assert _is_ollama_target("https://api.anthropic.com/v1/messages") is False + + def test_openrouter(self): + assert _is_ollama_target("https://openrouter.ai/api/v1/messages") is False + + +class TestResolveApiKeyOllama: + def test_ollama_returns_empty(self): + assert _resolve_api_key("http://localhost:11434/v1/messages") == "" + + def test_ollama_custom_port_returns_empty(self): + assert _resolve_api_key("http://gpu-box:11434/v1/messages") == "" + + def test_ollama_provider_marker_returns_empty(self): + assert _resolve_api_key("http://custom:9999/v1/messages", "ollama") == "" diff --git a/ui/src/app.css b/ui/src/app.css index 5e78fd5..0c2c09e 100644 --- a/ui/src/app.css +++ b/ui/src/app.css @@ -145,32 +145,19 @@ [id^="step-"] { scroll-margin-top: 4.5rem; } - /* Ensure pre/code inside cards inherit foreground color */ - pre, code { - color: inherit; + /* Override prose typography to use our theme foreground for better contrast */ + .prose { + --tw-prose-body: var(--foreground); + --tw-prose-headings: var(--foreground); + --tw-prose-bold: var(--foreground); + --tw-prose-code: var(--foreground); + --tw-prose-links: var(--foreground); + } + .dark .prose { + --tw-prose-body: var(--foreground); + --tw-prose-headings: var(--foreground); + --tw-prose-bold: var(--foreground); + --tw-prose-code: var(--foreground); + --tw-prose-links: var(--foreground); } -} - -/* Override prose typography — outside @layer for higher specificity than the plugin */ -.prose h1, .prose h2, .prose h3, .prose h4, .prose h5, .prose h6 { - color: var(--foreground); -} -.prose strong, .prose b { - color: var(--foreground); -} -.prose code { - color: var(--foreground); -} -.prose a { - color: var(--foreground); -} -.prose pre { - color: var(--foreground); - background-color: var(--muted); -} -.prose pre code { - color: var(--foreground); -} -.prose ol > li::marker, .prose ul > li::marker { - color: var(--muted-foreground); } diff --git a/ui/src/lib/components/chat/MessageEditor.svelte b/ui/src/lib/components/chat/MessageEditor.svelte index 9ec5669..1556a01 100644 --- a/ui/src/lib/components/chat/MessageEditor.svelte +++ b/ui/src/lib/components/chat/MessageEditor.svelte @@ -185,8 +185,13 @@ {#each msg.content as block, blockIdx} {#if block.type === "thinking"}
-
thinking (read-only — signature-protected)
-
{block.thinking || ""}
+
thinking
+
{:else if isSystemReminder(block)} {@const rKey = `${absIdx}-${blockIdx}`} diff --git a/ui/src/lib/server/runs.ts b/ui/src/lib/server/runs.ts index 7b9c959..b508f54 100644 --- a/ui/src/lib/server/runs.ts +++ b/ui/src/lib/server/runs.ts @@ -1,47 +1,7 @@ import { join } from "node:path"; -import { readdir } from "node:fs/promises"; import type { RunMeta } from "$lib/types/run"; import { runsDir, readJsonFile, listDirectories, fileExists } from "./fs"; -/** Count resample samples and variants across all sessions in a run. */ -async function countResamples(runDir: string): Promise<{ samples: number; variants: number }> { - let samples = 0; - let variants = 0; - - try { - const entries = await readdir(runDir, { withFileTypes: true }); - const sessionDirs = entries - .filter((e) => e.isDirectory() && e.name.startsWith("session_")) - .map((e) => e.name); - - for (const sd of sessionDirs) { - const resamplesDir = join(runDir, sd, "resamples"); - try { - const resampleEntries = await readdir(resamplesDir, { withFileTypes: true }); - for (const re of resampleEntries) { - if (!re.isDirectory()) continue; - if (re.name.match(/^request_\d+_v\d+$/)) { - // Variant directory - variants++; - const vFiles = await readdir(join(resamplesDir, re.name)); - samples += vFiles.filter((f) => f.startsWith("sample_") && f.endsWith(".json") && !f.includes("error")).length; - } else if (re.name.match(/^request_\d+$/)) { - // Vanilla resample directory - const sFiles = await readdir(join(resamplesDir, re.name)); - samples += sFiles.filter((f) => f.startsWith("sample_") && f.endsWith(".json") && !f.includes("error")).length; - } - } - } catch { - // No resamples dir - } - } - } catch { - // Can't read run dir - } - - return { samples, variants }; -} - export async function listRuns(): Promise { const base = runsDir(); const dirs = await listDirectories(base); @@ -52,9 +12,6 @@ export async function listRuns(): Promise { if (await fileExists(metaPath)) { try { const meta = await readJsonFile(metaPath); - const { samples, variants } = await countResamples(join(base, dir)); - meta.resample_count = samples; - meta.variant_count = variants; runs.push(meta); } catch { // Skip malformed run directories diff --git a/ui/src/lib/types/run.ts b/ui/src/lib/types/run.ts index 18b2b49..31dc979 100644 --- a/ui/src/lib/types/run.ts +++ b/ui/src/lib/types/run.ts @@ -46,18 +46,4 @@ export interface RunMeta { total_compaction_events: number; total_subagent_invocations: number; errors: string[]; - /** Present on replay runs — name of the immediate source run */ - replay_source?: string; - /** Present on replay runs — turn index that was replayed from */ - replay_turn?: number; - /** Number of resample samples across all sessions (computed at load time) */ - resample_count?: number; - /** Number of resample variants across all sessions (computed at load time) */ - variant_count?: number; -} - -/** A run with its replay children grouped together. */ -export interface RunGroup { - run: RunMeta; - replays: RunMeta[]; } diff --git a/ui/src/routes/+page.svelte b/ui/src/routes/+page.svelte index ae8ed5e..a10b779 100644 --- a/ui/src/routes/+page.svelte +++ b/ui/src/routes/+page.svelte @@ -1,196 +1,22 @@ -{#snippet runRow(run: RunMeta, isReplay: boolean)} - window.location.href = `/runs/${run.run_name}`} - > - -
- {#if isReplay} - - {/if} - - {#if isReplay} - {replayLabel(run)} - {:else} - {run.run_name} - {/if} - - {#if run.errors.length > 0} - - {/if} -
- {#if !isReplay} -
- {run.provider} - · - {run.session_mode} -
- {:else} -
- {run.run_name} -
- {/if} - - {run.model} - {run.session_count} - {run.total_steps} - - {#if !isReplay} - {@const rs = run.resample_count ?? 0} - {@const vs = run.variant_count ?? 0} - {@const rps = groups.find(g => g.run.run_name === run.run_name)?.replays.length ?? 0} - {#if rs || rps} -
- {#if rs}{rs} resample{rs !== 1 ? 's' : ''}{#if vs} ({vs} variant{vs !== 1 ? 's' : ''}){/if}{/if} - {#if rps}{rps} replay{rps !== 1 ? 's' : ''}{/if} -
- {:else} - - {/if} - {:else} - - {/if} - - - {formatCost(run.total_cost_usd)} - - -
- {#each run.tags.filter(t => t !== 'replay') as tag} - {tag} - {/each} -
- - - {formatDate(run.started_at)} - - -{/snippet} -

Runs

@@ -216,34 +42,47 @@ Model Sessions Steps - Resamples Cost Tags Date - {#each filtered as group} - {@render runRow(group.run, false)} - {#if group.replays.length > 0} - {@const isExpanded = expandedGroups.has(group.run.run_name)} - - - - - - {#if isExpanded} - {#each group.replays as replay} - {@render runRow(replay, true)} - {/each} - {/if} - {/if} + {#each filtered as run} + window.location.href = `/runs/${run.run_name}`} + > + + + {run.run_name} + + {#if run.errors.length > 0} + + {/if} +
+ {run.provider} + · + {run.session_mode} +
+ + {run.model} + {run.session_count} + {run.total_steps} + + {formatCost(run.total_cost_usd)} + + +
+ {#each run.tags as tag} + {tag} + {/each} +
+ + + {formatDate(run.started_at)} + + {/each} diff --git a/ui/src/routes/api/resample/+server.ts b/ui/src/routes/api/resample/+server.ts index 057dbe5..31eb232 100644 --- a/ui/src/routes/api/resample/+server.ts +++ b/ui/src/routes/api/resample/+server.ts @@ -247,16 +247,14 @@ async function loadRawRequest(dir: string, pad: string) { /** GET: load existing resample results + variants */ export const GET: RequestHandler = async ({ url }) => { const runName = url.searchParams.get("runName"); - const rawSessionIndex = url.searchParams.get("sessionIndex") || "0"; + const sessionIndex = parseInt(url.searchParams.get("sessionIndex") || "0"); const requestIndex = parseInt(url.searchParams.get("requestIndex") || "0"); if (!runName || !requestIndex) { return error(400, "Missing runName, sessionIndex, or requestIndex"); } - // Support both plain numbers ("2") and replicate keys ("2_r01") - const sessionKey = rawSessionIndex.includes("_") ? rawSessionIndex : parseInt(rawSessionIndex); - const dir = sessionDir(runName, sessionKey); + const dir = sessionDir(runName, sessionIndex); const pad = String(requestIndex).padStart(3, "0"); const resamplesDir = join(dir, "resamples"); @@ -354,11 +352,8 @@ export const POST: RequestHandler = async ({ request }) => { return error(500, `${keyName} not configured`); } - // Force non-streaming + // Force non-streaming (keep thinking signatures — the API requires them) requestData.stream = false; - // Remove SDK-specific fields that aren't part of the public API - delete requestData.context_management; - delete requestData.metadata; // Determine output directory (always keyed off the original requestIndex) let resampleDir: string; @@ -370,9 +365,6 @@ export const POST: RequestHandler = async ({ request }) => { for (const edit of variant.edits) { applyEdit(requestData, edit); } - // Thinking blocks are read-only in the editor, so signatures stay valid. - // If someone manually tampers with thinking via the CLI, the API will - // return a clear 400 error. // Assign variant ID const vid = await nextVariantId(resamplesDir, pad);