softpudding · softpudding · Apr 18, 2026 · Apr 18, 2026 · Apr 18, 2026
diff --git a/frontend/index.html b/frontend/index.html
@@ -7645,21 +7645,13 @@ <h1>Sisyphus</h1>
         }
 
         function buildRoutinePrompt(routine) {
-            // The agent receives the Routine markdown verbatim plus a one-line
-            // instruction. Keeping the Routine fully visible in the prompt is
-            // intentional: the OpenBrowser agent's tools (highlight/click/etc)
-            // are exactly the vocabulary the Routine was compiled in.
-            const goal = (routine.goal || routine.name || '').trim();
-            const goalLine = goal
-                ? `Goal: ${goal}`
-                : `Goal: run the saved routine "${routine.name}".`;
-            return [
-                `Run the saved routine "${routine.name}".`,
-                goalLine,
-                'Follow this Routine step by step:',
-                '',
-                routine.routine_markdown || '',
-            ].join('\n');
+            // In routine_replay mode the user message carries only the SOP
+            // markdown verbatim. The routine name, goal, and "follow step
+            // by step" framing live in the routine-replay system prompt
+            // block, not here — repeating them in the user message gave the
+            // model a redundant identifier that it lost track of as
+            // attention decayed over long contexts (session d1395b5d).
+            return routine.routine_markdown || '';
         }
 
         async function refreshRoutines() {

diff --git a/pyproject.toml b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "bd4cb296355c3d03dd411883e78527b1915fa8c4" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "bd4cb296355c3d03dd411883e78527b1915fa8c4" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "c92a185a" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "c92a185a" }
diff --git a/server/agent/browser_condenser.py b/server/agent/browser_condenser.py
@@ -11,9 +11,38 @@
 DEFAULT_BROWSER_CONDENSER_MAX_SIZE = 1000
 DEFAULT_BROWSER_CONDENSER_TOKEN_RATIO = 0.7
 
+# Per-model token caps for models with known long-context attention decay.
+# Matched by case-insensitive substring against llm.model so provider
+# prefixes (e.g. "dashscope/qwen3.5-flash") and variant suffixes still
+# trigger the cap. Session d1395b5d saw qwen3.5-flash lose the original
+# user message after ~100 browser events because the condenser's
+# context-window-ratio threshold (~700k for a 1M-token model) never fired.
+SMALL_MODEL_TOKEN_OVERRIDES: dict[str, int] = {
+    "qwen3.5-flash": 100_000,
+}
+
+
+def _small_model_token_override(model: str | None) -> int | None:
+    if not model:
+        return None
+    needle = model.lower()
+    for fragment, token_cap in SMALL_MODEL_TOKEN_OVERRIDES.items():
+        if fragment.lower() in needle:
+            return token_cap
+    return None
+
 
 def derive_browser_condenser_max_tokens(llm: LLM) -> int | None:
-    """Derive a token threshold for browser-heavy conversations."""
+    """Derive a token threshold for browser-heavy conversations.
+
+    For models whose name contains a key of ``SMALL_MODEL_TOKEN_OVERRIDES``
+    the cap is returned directly, regardless of the model's advertised
+    context window. Otherwise the threshold is a fraction of that window.
+    """
+
+    override = _small_model_token_override(llm.model)
+    if override is not None:
+        return override
 
     max_input_tokens = llm.max_input_tokens
     if not max_input_tokens or max_input_tokens <= 0:

diff --git a/server/tests/unit/test_browser_condenser.py b/server/tests/unit/test_browser_condenser.py
@@ -37,6 +37,64 @@ def test_configure_browser_condenser_prefers_token_limit() -> None:
     assert condenser.max_tokens is None
 
 
+def test_derive_browser_condenser_max_tokens_uses_small_model_override() -> None:
+    """Small models with known long-context attention decay get a stricter
+    token budget than the 0.7×context_window derivation, even when their
+    advertised context window is much larger.
+
+    Rationale: in session d1395b5d, qwen3.5-flash ran past ~100 events
+    with no condensation and lost track of the original user message.
+    The override forces the condenser to kick in earlier for these
+    models regardless of advertised context size.
+    """
+    llm = LLM.model_construct(
+        model="dashscope/qwen3.5-flash", max_input_tokens=1_000_000
+    )
+
+    assert derive_browser_condenser_max_tokens(llm) == 100_000
+
+
+def test_derive_browser_condenser_max_tokens_override_matches_model_substring() -> None:
+    """The override matches by substring, so provider prefixes (litellm
+    style, e.g. ``dashscope/`` or ``openai/``) and variant suffixes
+    (e.g. ``-preview``) still trigger the small-model cap.
+    """
+    for model_name in (
+        "qwen3.5-flash",
+        "dashscope/qwen3.5-flash",
+        "openai/qwen3.5-flash-preview",
+    ):
+        llm = LLM.model_construct(model=model_name, max_input_tokens=1_000_000)
+        assert derive_browser_condenser_max_tokens(llm) == 100_000, model_name
+
+
+def test_derive_browser_condenser_max_tokens_override_ignores_unrelated_models() -> (
+    None
+):
+    """Models not in the override map keep the 0.7×context_window
+    derivation."""
+    llm = LLM.model_construct(
+        model="dashscope/qwen3.5-plus", max_input_tokens=1_000_000
+    )
+
+    assert derive_browser_condenser_max_tokens(llm) == 700_000
+
+
+def test_configure_browser_condenser_applies_small_model_override() -> None:
+    """When the LLM matches a small-model override, ``configure`` must use
+    the override value rather than the context-window derivation, even if
+    the derivation would give a higher threshold."""
+    llm = LLM.model_construct(
+        model="dashscope/qwen3.5-flash", max_input_tokens=1_000_000
+    )
+    condenser = LLMSummarizingCondenser(llm=llm, max_size=80, keep_first=4)
+
+    configured = configure_browser_condenser(condenser, llm)
+
+    assert isinstance(configured, LLMSummarizingCondenser)
+    assert configured.max_tokens == 100_000
+
+
 def test_configure_browser_condenser_preserves_explicit_token_limit() -> None:
     llm = LLM.model_construct(model="test-model", max_input_tokens=100_000)
     condenser = LLMSummarizingCondenser(

diff --git a/uv.lock b/uv.lock