From 9c99622d1550cd2cedbd5d27058c0b13d5f85b69 Mon Sep 17 00:00:00 2001 From: Korivi Date: Mon, 13 Apr 2026 16:02:54 +0900 Subject: [PATCH 01/30] CLI Anything added --- skills/cli-anything/SKILL.md | 187 +++++++++++++++++++++++++++++++++ skills/cli-anything/_meta.json | 6 ++ 2 files changed, 193 insertions(+) create mode 100644 skills/cli-anything/SKILL.md create mode 100644 skills/cli-anything/_meta.json diff --git a/skills/cli-anything/SKILL.md b/skills/cli-anything/SKILL.md new file mode 100644 index 00000000..5194429b --- /dev/null +++ b/skills/cli-anything/SKILL.md @@ -0,0 +1,187 @@ +--- +name: cli-anything +description: "Generate agent-native CLI harnesses for any GUI application using the CLI-Anything methodology, or discover and install pre-built CLIs via CLI-Hub." +metadata: {"clawdbot":{"emoji":"⚡","os":["darwin","linux","windows"],"requires":{"bins":["python"]}}} +--- + +# CLI-Anything Skill + +CLI-Anything transforms any GUI application into an agent-native command-line interface. Use this skill when the user asks to: +- Generate a CLI harness for any software (GIMP, Blender, LibreOffice, etc.) +- Install or discover CLIs via CLI-Hub +- Refine or test an existing generated harness + +--- + +## Quick Install (CLI-Hub) + +For software that already has a pre-built harness: + +```bash +pip install cli-anything-hub +cli-hub install +``` + +Browse the full catalog: https://hkuds.github.io/CLI-Anything/ + +--- + +## Generate a New CLI Harness + +Follow the **7-Phase Methodology** below. Work sequentially — each phase depends on the prior. 
+ +### Phase 1 — Codebase Analysis + +Before writing code, study the target application: + +``` +- Identify the backend engine (separate from the GUI presentation layer) +- Map each GUI action to its underlying API or Python call +- Understand the data model and native file formats (e.g., .blend, ODF, SVG) +- Locate any existing CLI entry points or scripting interfaces +- Catalog the undo/redo and session management system +``` + +### Phase 2 — CLI Architecture Design + +Choose one of: +- **Stateful REPL** — for interactive, session-based workflows +- **Subcommand CLI** — for scriptable, one-shot invocations +- **Both** — recommended; REPL wraps the subcommand interface + +Design command groups that mirror the app's logical domains (e.g., `image`, `layer`, `export` for GIMP). Plan dual output: human-readable text and machine-readable `--json`. + +### Phase 3 — Implementation + +Directory layout: +``` +cli_anything/ # Namespace package — NO __init__.py here +└── <software>/ # Sub-package — HAS __init__.py + ├── __main__.py + ├── README.md + ├── <software>_cli.py + ├── core/ # Domain modules wrapping the real software + ├── utils/ # Shared utilities + repl_skin.py + └── tests/ + ├── TEST.md + ├── test_core.py + └── test_full_e2e.py +``` + +**Critical rule**: The CLI MUST call the actual software for rendering and export — never reimplement the software's functionality in Python. Generate valid native project files and hand them to the real application backend. + +Required patterns for every command: +- `--json` flag for machine-readable output +- Fail loudly with unambiguous error messages +- Introspection commands (`info`, `list`, `status`) for state inspection + +Use the unified REPL skin (`repl_skin.py` from `cli-anything-plugin/repl_skin.py`) so all generated CLIs share a consistent interface. 
+ +### Phase 4 — Test Planning (write TEST.md Part 1) + +Before any test code, document in `tests/TEST.md`: +- Test inventory and what each test covers +- Unit test plans (synthetic data, no external deps) +- E2E test plans (real software backend invoked) +- Realistic end-to-end workflow scenarios + +### Phase 5 — Test Implementation + +Four layers, all required: +1. **Unit tests** — synthetic data, deterministic, fast +2. **E2E native tests** — verify project file generation and structure +3. **E2E backend tests** — invoke the real software, check output exists with correct format (magic bytes, ZIP structure, pixel analysis, etc.) +4. **CLI subprocess tests** — install the CLI entry point, run full workflows end-to-end + +**Never assume an export is correct because it ran without errors.** Validate outputs programmatically and print artifact paths for manual inspection. + +### Phase 6 — Test Documentation (write TEST.md Part 2) + +Append full `pytest` output and summary statistics to `TEST.md`. + +### Phase 6.5 — SKILL.md Generation + +Create `cli_anything/<software>/skills/SKILL.md` with: +- YAML frontmatter for agent discovery (`name`, `description`, `tags`, `requires`) +- All command groups and subcommands +- Usage examples for common workflows +- Agent-specific guidance for `--json` output and error handling + +The REPL should print the absolute path to `SKILL.md` on startup so agents can find it. + +### Phase 7 — Package & Install + +```bash +# setup.py uses PEP 420 namespace packaging +cd cli_anything/ +pip install -e . 
+ +# Verify the CLI is on PATH +which cli-anything-<software> +cli-anything-<software> --help +``` + +Publish to PyPI when ready: +```bash +python -m build +twine upload dist/* +``` + +--- + +## Using a Generated CLI + +```bash +# Interactive REPL (default when no subcommand given) +cli-anything-<software> + +# One-shot subcommand with JSON output for agent consumption +cli-anything-<software> --json <command> [args] + +# Help +cli-anything-<software> --help +cli-anything-<software> <command> --help +``` + +--- + +## Refining an Existing Harness + +After initial generation, run a gap analysis: + +```bash +# Broad refinement +/cli-anything:refine ./<software> + +# Focused refinement on specific capabilities +/cli-anything:refine ./<software> "batch processing and filters" +``` + +Then re-run tests: `/cli-anything:test ./<software>` + +--- + +## Supported Applications (Pre-built) + +CLI-Anything has verified harnesses for 26+ applications: + +| Category | Applications | +|---|---| +| Creative | GIMP, Blender, Inkscape, Krita, MuseScore | +| Office | LibreOffice, Zotero | +| Media | Audacity, OBS Studio, Kdenlive, Shotcut, VideoCaptioner | +| Diagramming | Draw.io, Mermaid | +| AI/ML | ComfyUI, Ollama, NotebookLM | +| Web/Cloud | Zoom, AdGuard Home, Exa | +| Dev Tools | Godot Engine, RenderDoc | + +--- + +## Architecture Pitfalls + +**The Rendering Gap** — project files may reference filters/effects that simple file readers ignore. Solution priority: +1. Use the app's native renderer +2. Build a translation layer for effect conversion +3. Generate a render script as fallback + +**Testing with missing software** — tests MUST NOT skip or fake results when the target software is missing. They should fail loudly so the absence is visible. 
diff --git a/skills/cli-anything/_meta.json b/skills/cli-anything/_meta.json new file mode 100644 index 00000000..af8c9adc --- /dev/null +++ b/skills/cli-anything/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn70pywhg0fyz996kpa8xj89s57yhv26", + "slug": "cli-anything", + "version": "1.0.0", + "publishedAt": 1744574400000 +} From 5d121ea43281b733e609516a8cc4f019a6dcdecf Mon Sep 17 00:00:00 2001 From: Korivi Date: Tue, 14 Apr 2026 08:11:14 +0900 Subject: [PATCH 02/30] Added name character limit to 20 Added name character limit to 20 --- app/onboarding/interfaces/steps.py | 3 ++- .../browser/frontend/src/pages/Onboarding/OnboardingPage.tsx | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/app/onboarding/interfaces/steps.py b/app/onboarding/interfaces/steps.py index e8899440..1e95dc27 100644 --- a/app/onboarding/interfaces/steps.py +++ b/app/onboarding/interfaces/steps.py @@ -209,7 +209,8 @@ def get_options(self) -> List[StepOption]: return [] def validate(self, value: Any) -> tuple[bool, Optional[str]]: - # Optional, any string is valid + if value and len(str(value)) > 20: + return False, "Agent name must be 20 characters or fewer" return True, None def get_default(self) -> str: diff --git a/app/ui_layer/browser/frontend/src/pages/Onboarding/OnboardingPage.tsx b/app/ui_layer/browser/frontend/src/pages/Onboarding/OnboardingPage.tsx index 46bf5e23..e1b55b0d 100644 --- a/app/ui_layer/browser/frontend/src/pages/Onboarding/OnboardingPage.tsx +++ b/app/ui_layer/browser/frontend/src/pages/Onboarding/OnboardingPage.tsx @@ -510,6 +510,7 @@ export function OnboardingPage() { value={textValue} onChange={e => setTextValue(e.target.value)} placeholder={isApiKey ? 'Enter your API key' : 'Enter a name'} + maxLength={isApiKey ? 
undefined : 20} autoFocus onKeyDown={e => { if (e.key === 'Enter' && canSubmit) handleSubmit() }} /> From f85346bebf313934a305e77aaccdbad73b4bef6c Mon Sep 17 00:00:00 2001 From: Korivi Date: Tue, 14 Apr 2026 08:31:16 +0900 Subject: [PATCH 03/30] Add CLI-Anything integration to crafbot --- app/config/skills_config.json | 1 + 1 file changed, 1 insertion(+) diff --git a/app/config/skills_config.json b/app/config/skills_config.json index 9f6df29a..0975a5d4 100644 --- a/app/config/skills_config.json +++ b/app/config/skills_config.json @@ -1,6 +1,7 @@ { "auto_load": true, "enabled_skills": [ + "cli-anything", "docx", "pdf", "playwright-mcp", From 139eae850b5714caa4e7b0aa47bb592052339d35 Mon Sep 17 00:00:00 2001 From: Korivi Date: Tue, 14 Apr 2026 09:59:47 +0900 Subject: [PATCH 04/30] Grok KV caching issue fixed! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were two bugs fixed: Bug 1: `prompt_cache_key` is an OpenAI-specific routing hint in `extra_body`. xAI’s API ignores it, so it doesn’t help with cache routing for Grok. So I skipped it when `self.provider == "grok"`. Bug 2: Wrong field was used for reading cached tokens: * OpenAI → `usage.prompt_tokens_details.cached_tokens` * Grok (xAI) → `usage.prompt_cache_hit_tokens` ---------------- The code was always reading the OpenAI field, so Grok always returned 0 cached tokens, making it look like every call was a full cache miss. I fixed this by branching on `self.provider == "grok"` to read the correct field. Additionally, I updated the cache metrics log to show the actual provider name (grok, openai, etc.). 
---------------- I updated the fixed in the same branch " feature/CLI" --- agent_core/core/impl/llm/interface.py | 28 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/agent_core/core/impl/llm/interface.py b/agent_core/core/impl/llm/interface.py index 94b7923d..5114cfae 100644 --- a/agent_core/core/impl/llm/interface.py +++ b/agent_core/core/impl/llm/interface.py @@ -1155,9 +1155,10 @@ def _generate_openai( # Always enforce JSON output format request_kwargs["response_format"] = {"type": "json_object"} - # Add prompt_cache_key when call_type is provided for better cache routing - # This helps when alternating between different call types (reasoning, action_selection) - if call_type and system_prompt and len(system_prompt) >= config.min_cache_tokens: + # Add prompt_cache_key for OpenAI/DeepSeek cache routing. + # Grok (xAI) does not support prompt_cache_key — it uses automatic + # prefix caching and ignores this parameter, so skip it for Grok. + if self.provider != "grok" and call_type and system_prompt and len(system_prompt) >= config.min_cache_tokens: prompt_hash = hashlib.sha256(system_prompt.encode()).hexdigest()[:16] cache_key = f"{call_type}_{prompt_hash}" request_kwargs["extra_body"] = {"prompt_cache_key": cache_key} @@ -1168,21 +1169,26 @@ def _generate_openai( token_count_input = response.usage.prompt_tokens token_count_output = response.usage.completion_tokens - # Extract cached tokens from prompt_tokens_details (OpenAI automatic caching) - # Available for prompts ≥1024 tokens - prompt_tokens_details = getattr(response.usage, "prompt_tokens_details", None) - if prompt_tokens_details: - cached_tokens = getattr(prompt_tokens_details, "cached_tokens", 0) or 0 + # Extract cached tokens — field name differs by provider: + # - OpenAI: response.usage.prompt_tokens_details.cached_tokens + # - Grok (xAI): response.usage.prompt_cache_hit_tokens + if self.provider == "grok": + cached_tokens = getattr(response.usage, 
"prompt_cache_hit_tokens", 0) or 0 + else: + prompt_tokens_details = getattr(response.usage, "prompt_tokens_details", None) + if prompt_tokens_details: + cached_tokens = getattr(prompt_tokens_details, "cached_tokens", 0) or 0 # Record cache metrics + provider_label = self.provider # "openai", "grok", "deepseek", etc. metrics = get_cache_metrics() if cached_tokens > 0: - logger.info(f"[CACHE] OpenAI {cache_type} cache hit: {cached_tokens}/{token_count_input} tokens from cache") - metrics.record_hit("openai", cache_type, cached_tokens=cached_tokens, total_tokens=token_count_input) + logger.info(f"[CACHE] {provider_label} {cache_type} cache hit: {cached_tokens}/{token_count_input} tokens from cache") + metrics.record_hit(provider_label, cache_type, cached_tokens=cached_tokens, total_tokens=token_count_input) elif system_prompt and len(system_prompt) >= config.min_cache_tokens: # Caching should have been attempted (prompt long enough) # This is a miss - either first call or cache expired - metrics.record_miss("openai", cache_type, total_tokens=token_count_input) + metrics.record_miss(provider_label, cache_type, total_tokens=token_count_input) status = "success" except Exception as exc: From 26da4b9d85ca26ae1651f011889121527cba27ea Mon Sep 17 00:00:00 2001 From: korivi-CraftOS Date: Tue, 14 Apr 2026 17:31:24 +0900 Subject: [PATCH 05/30] Delete craftbot.pid --- craftbot.pid | 1 - 1 file changed, 1 deletion(-) delete mode 100644 craftbot.pid diff --git a/craftbot.pid b/craftbot.pid deleted file mode 100644 index b86a3065..00000000 --- a/craftbot.pid +++ /dev/null @@ -1 +0,0 @@ -10948 \ No newline at end of file From 403398fb4ef39a81150b0397ad64d5a04775bd50 Mon Sep 17 00:00:00 2001 From: korivi-CraftOS Date: Tue, 14 Apr 2026 17:31:34 +0900 Subject: [PATCH 06/30] Delete craftbot.log --- craftbot.log | 299 --------------------------------------------------- 1 file changed, 299 deletions(-) delete mode 100644 craftbot.log diff --git a/craftbot.log b/craftbot.log deleted file 
mode 100644 index fe1ee0ea..00000000 --- a/craftbot.log +++ /dev/null @@ -1,299 +0,0 @@ - -============================================================ -CraftBot service started at 2026-04-08 14:51:22 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ -Traceback (most recent call last): - File "C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py", line 1074, in - print_browser_header() - ~~~~~~~~~~~~~~~~~~~~^^ - File "C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py", line 610, in print_browser_header - print("\n\U0001f916 CraftBot") - ~~~~~^^^^^^^^^^^^^^^^^ - File "C:\Python314\Lib\encodings\cp1252.py", line 19, in encode - return codecs.charmap_encode(input,self.errors,encoding_table)[0] - ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f916' in position 2: character maps to - -============================================================ -CraftBot service started at 2026-04-08 14:59:15 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ -Traceback (most recent call last): - File "C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py", line 1074, in - print_browser_header() - ~~~~~~~~~~~~~~~~~~~~^^ - File "C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py", line 610, in print_browser_header - print("\n\U0001f916 CraftBot") - ~~~~~^^^^^^^^^^^^^^^^^ - File "C:\Python314\Lib\encodings\cp1252.py", line 19, in encode - return 
codecs.charmap_encode(input,self.errors,encoding_table)[0] - ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f916' in position 2: character maps to - -============================================================ -CraftBot service started at 2026-04-08 15:07:33 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ -Traceback (most recent call last): - File "C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py", line 1074, in - print_browser_header() - ~~~~~~~~~~~~~~~~~~~~^^ - File "C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py", line 610, in print_browser_header - print("\n\U0001f916 CraftBot") - ~~~~~^^^^^^^^^^^^^^^^^ - File "C:\Python314\Lib\encodings\cp1252.py", line 19, in encode - return codecs.charmap_encode(input,self.errors,encoding_table)[0] - ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f916' in position 2: character maps to - -============================================================ -CraftBot service started at 2026-04-08 15:18:54 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... 
✓ - - ---- Cleanup Initiated (Exit Status: 1073807364) --- -[*] Skipping Docker cleanup (not started in CLI mode). - -============================================================ -CraftBot service started at 2026-04-08 16:27:25 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 16:51:37 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 17:18:37 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... 
✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 17:38:37 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 17:52:20 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... 
✓ - -============================================================ -CraftBot service started at 2026-04-08 17:59:29 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 18:05:16 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 18:16:07 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... 
✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 20:52:14 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 20:57:52 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-08 21:06:10 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... 
✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ - -============================================================ -CraftBot service started at 2026-04-09 00:58:46 -Command: C:\Python314\pythonw.exe C:\Users\ganiy\OneDrive\Desktop\OneDrive\Korivi Important Data\Aether\CraftOS\CraftBot\CraftBot\run.py --no-open-browser -============================================================ - -🤖 CraftBot -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -Mode: Browser - - [ 1/8] Starting frontend server... ✓ - [ 2/8] Starting agent backend... ✓ - [ 3/8] Initializing agent... ✓ - [ 4/8] Connecting to MCP servers... ✓ - [ 5/8] Loading skills... ✓ - [ 6/8] Loading libraries... ✓ - [ 7/8] Starting scheduler... ✓ - [ 8/8] Starting communications... ✓ From 25f21c18e4fcb938ee953b7890b4f6dbdf7fbc2d Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Tue, 14 Apr 2026 20:04:15 +0530 Subject: [PATCH 07/30] feat: add OCR and video analysis actions (#155) --- agent_core/core/impl/action/router.py | 2 + agent_core/core/impl/vlm/interface.py | 173 +++++ agent_core/core/llm/google_gemini_client.py | 69 ++ agent_core/decorators/log_events.py | 1 + agent_core/decorators/profiler.py | 1 + app/data/action/perform_ocr.py | 82 +++ app/data/action/understand_video.py | 92 +++ app/internal_action_interface.py | 65 ++ requirements.txt | 1 + tests/test_step1_vlm_interface.py | 563 ++++++++++++++++ tests/test_step2_iai_methods.py | 76 +++ tests/test_step2_internal_action_interface.py | 599 ++++++++++++++++++ tests/test_step3_perform_ocr_action.py | 129 ++++ tests/test_step4_understand_video_action.py | 116 ++++ 14 files changed, 1969 insertions(+) create mode 100644 app/data/action/perform_ocr.py create mode 100644 app/data/action/understand_video.py create mode 100644 tests/test_step1_vlm_interface.py create mode 100644 
tests/test_step2_iai_methods.py create mode 100644 tests/test_step2_internal_action_interface.py create mode 100644 tests/test_step3_perform_ocr_action.py create mode 100644 tests/test_step4_understand_video_action.py diff --git a/agent_core/core/impl/action/router.py b/agent_core/core/impl/action/router.py index 12f1fef9..210c2458 100644 --- a/agent_core/core/impl/action/router.py +++ b/agent_core/core/impl/action/router.py @@ -6,6 +6,8 @@ based on user queries using LLM reasoning. """ +from __future__ import annotations + import json import ast from typing import Optional, List, Dict, Any, Tuple diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index dce58675..455de4af 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -286,6 +286,112 @@ async def generate_response_async( log_response, ) + def describe_image_ocr( + self, + image_path: str, + user_prompt: str | None = None, + ) -> str: + """ + Run OCR on an image. Returns raw extracted text, not a description. + Uses a structured extraction system prompt regardless of provider. + """ + if not os.path.isfile(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + + with open(image_path, "rb") as f: + image_bytes = f.read() + + system_prompt = ( + "You are a precise OCR engine. Extract ALL text from this image exactly as it appears. " + "Preserve line breaks, indentation, and formatting. " + "Do NOT add commentary, interpretation, or markdown. " + "Output only the raw extracted text. If no text is present, output an empty string." + ) + effective_user = user_prompt or "Extract all text from this image." 
+ + logger.info(f"[LLM SEND] OCR request | path={image_path}") + + if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): + response = self._openai_describe_bytes_plain(image_bytes, system_prompt, effective_user) + elif self.provider == "remote": + response = self._ollama_describe_bytes(image_bytes, system_prompt, effective_user) + elif self.provider == "gemini": + response = self._gemini_describe_bytes(image_bytes, system_prompt, effective_user) + elif self.provider == "byteplus": + response = self._byteplus_describe_bytes(image_bytes, system_prompt, effective_user) + elif self.provider == "anthropic": + response = self._anthropic_describe_bytes(image_bytes, system_prompt, effective_user) + else: + raise RuntimeError(f"Unknown provider {self.provider!r}") + + cleaned = re.sub(self._CODE_BLOCK_RE, "", response.get("content", "").strip()) + + tokens_used = response.get("tokens_used", 0) + if tokens_used: + self._set_token_count(self._get_token_count() + tokens_used) + + logger.info(f"[LLM RECV OCR] {cleaned[:120]}...") + return cleaned + + def describe_video_frames( + self, + video_path: str, + query: str | None = None, + max_frames: int = 8, + ) -> str: + """ + Analyse video by extracting evenly-spaced keyframes and sending to VLM. + Falls back to graceful error if OpenCV is unavailable. + """ + try: + import cv2 + except ImportError: + raise RuntimeError( + "opencv-python-headless is required for video analysis. 
" + "Install with: pip install opencv-python-headless" + ) + + if not os.path.isfile(video_path): + raise FileNotFoundError(f"Video file not found: {video_path}") + + cap = cv2.VideoCapture(video_path) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + if total_frames == 0: + cap.release() + raise ValueError("Video has 0 frames or could not be read.") + + indices = [int(i * total_frames / max_frames) for i in range(max_frames)] + frame_bytes_list: list[bytes] = [] + + for idx in indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if ret: + success, buf = cv2.imencode(".jpg", frame) + if success: + frame_bytes_list.append(buf.tobytes()) + cap.release() + + if not frame_bytes_list: + raise ValueError("Could not extract any frames from the video.") + + system_prompt = ( + f"You are analysing a video represented by {len(frame_bytes_list)} evenly-spaced keyframes. " + "Provide: 1) An overall narrative summary of what is happening, " + "2) Any visible text or titles, " + "3) Key objects, people, or scenes, " + "4) Notable transitions between frames." + ) + effective_user = query or "Summarise the content of this video." 
+ + # For multi-frame, send frames sequentially (all providers support single-image per call) + # Gemini 1.5 Pro supports native multi-image; others receive concatenated descriptions + if self.provider == "gemini" and len(frame_bytes_list) > 1: + return self._gemini_describe_video_frames(frame_bytes_list, system_prompt, effective_user) + else: + # Universal fallback: describe each frame, then synthesise + return self._multi_frame_describe_fallback(frame_bytes_list, system_prompt, effective_user) + # ───────────────────── Provider Helpers ───────────────────── def _report_usage_async( @@ -317,6 +423,73 @@ def _report_usage_async( except Exception as e: logger.warning(f"[VLM] Failed to report usage: {e}") + def _openai_describe_bytes_plain(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: + """OpenAI vision request WITHOUT json_object enforcement — for raw text output (OCR).""" + img_b64 = base64.b64encode(image_bytes).decode() + messages: list[Dict[str, Any]] = [] + if sys: + messages.append({"role": "system", "content": sys}) + messages.append({ + "role": "user", + "content": [ + {"type": "text", "text": usr}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + ], + }) + response = self.client.chat.completions.create( + model=self.model, + messages=messages, + temperature=self.temperature, + max_tokens=4096, # OCR may return large amounts of text + # NOTE: No response_format — OCR returns plain text + ) + content = response.choices[0].message.content.strip() + total_tokens = response.usage.prompt_tokens + response.usage.completion_tokens + return {"tokens_used": total_tokens, "content": content} + + def _gemini_describe_video_frames( + self, frame_bytes_list: list[bytes], sys: str | None, usr: str + ) -> str: + """Gemini-specific multi-image frame analysis in a single API call.""" + result = self._gemini_client.generate_multimodal_multi_image( + self.model, + text=usr, + image_bytes_list=frame_bytes_list, + 
system_prompt=sys, + temperature=self.temperature, + json_mode=False, + ) + tokens_used = result.get("tokens_used", 0) + if tokens_used: + self._set_token_count(self._get_token_count() + tokens_used) + return re.sub(self._CODE_BLOCK_RE, "", result.get("content", "").strip()) + + def _multi_frame_describe_fallback( + self, frame_bytes_list: list[bytes], system_prompt: str, user_prompt: str + ) -> str: + """Describe each frame individually, then synthesise into a narrative.""" + frame_descriptions = [] + for i, fb in enumerate(frame_bytes_list): + desc = self.describe_image_bytes( + fb, + system_prompt=f"Frame {i+1} of {len(frame_bytes_list)}: Describe what you see.", + user_prompt=user_prompt, + log_response=False, + ) + frame_descriptions.append(f"[Frame {i+1}]: {desc}") + + synthesis_prompt = ( + "You received descriptions of video keyframes. Write a coherent video summary:\n\n" + + "\n".join(frame_descriptions) + ) + synthesis = self.describe_image_bytes( + frame_bytes_list[-1], # anchor with last frame for context + system_prompt=system_prompt, + user_prompt=synthesis_prompt, + log_response=True, + ) + return synthesis + def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: """OpenAI vision request with automatic prompt caching metrics.""" img_b64 = base64.b64encode(image_bytes).decode() diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index f6d1688b..3cbffe44 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -236,6 +236,75 @@ def generate_multimodal( "cached_tokens": cached_tokens, } + def generate_multimodal_multi_image( + self, + model: str, + *, + text: str, + image_bytes_list: List[bytes], + system_prompt: Optional[str] = None, + temperature: Optional[float] = None, + json_mode: bool = False, + ) -> Dict[str, Any]: + """Generate text from a prompt that contains multiple inline images. 
+ + Args: + model: Model identifier + text: The text prompt + image_bytes_list: List of PNG/JPEG image data + system_prompt: Optional system instruction + temperature: Sampling temperature + json_mode: If True, enforce JSON output format + + Returns: + Dict with generation results and token counts + """ + parts: List[Dict[str, Any]] = [{"text": text}] + + for image_bytes in image_bytes_list: + inline_data = { + "mimeType": "image/jpeg", + "data": base64.b64encode(image_bytes).decode("utf-8"), + } + parts.append({"inlineData": inline_data}) + + contents = [{"role": "user", "parts": parts}] + + payload: Dict[str, Any] = {"contents": contents} + if system_prompt: + payload["systemInstruction"] = { + "parts": [{"text": system_prompt}], + } + + generation_config: Dict[str, Any] = {} + if temperature is not None: + generation_config["temperature"] = temperature + if json_mode: + generation_config["responseMimeType"] = "application/json" + if generation_config: + payload["generationConfig"] = generation_config + + response = self._post_json( + f"{_normalise_model_name(model)}:generateContent", payload + ) + + # Extract token usage from usageMetadata + usage_metadata = response.get("usageMetadata", {}) + total_tokens = usage_metadata.get("totalTokenCount", 0) + prompt_tokens = usage_metadata.get("promptTokenCount", 0) + completion_tokens = usage_metadata.get("candidatesTokenCount", 0) + cached_tokens = usage_metadata.get("cachedContentTokenCount", 0) + + content = self._extract_text(response) + + return { + "tokens_used": total_tokens, + "content": content, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "cached_tokens": cached_tokens, + } + def embed_text(self, model: str, *, text: str) -> List[float]: """Fetch an embedding vector for the supplied text. 
diff --git a/agent_core/decorators/log_events.py b/agent_core/decorators/log_events.py index ab9a7cfe..3e6d1571 100644 --- a/agent_core/decorators/log_events.py +++ b/agent_core/decorators/log_events.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import annotations """ Flexible function-level logging: - logs start diff --git a/agent_core/decorators/profiler.py b/agent_core/decorators/profiler.py index 38e5e77c..78fc4f5b 100644 --- a/agent_core/decorators/profiler.py +++ b/agent_core/decorators/profiler.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import annotations """ Profiler Module - Comprehensive performance tracking for the agent. diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py new file mode 100644 index 00000000..3c1d01d9 --- /dev/null +++ b/app/data/action/perform_ocr.py @@ -0,0 +1,82 @@ +from agent_core import action + +@action( + name="perform_ocr", + description="Extracts all text from an image using OCR via a Vision Language Model. Use this when the user wants to read text from a screenshot, scanned document, photo of a receipt, whiteboard, sign, or any image containing text. Returns extracted text saved to a file in workspace.", + mode="CLI", + action_sets=["document_processing, image"], + input_schema={ + "image_path": { + "type": "string", + "example": "C:\\Users\\user\\Pictures\\receipt.jpg", + "description": "Absolute path to the image file containing text to extract." + }, + "user_prompt": { + "type": "string", + "example": "Extract all text including prices and product names.", + "description": "Optional: extra instruction to guide the OCR (e.g. focus on specific regions or text types)." + } + }, + output_schema={ + "status": { + "type": "string", + "example": "success", + "description": "'success' if OCR completed, 'error' otherwise." 
+ }, + "summary": { + "type": "string", + "example": "OCR complete: 42 lines, 1250 characters extracted.", + "description": "Brief summary of extraction results." + }, + "file_path": { + "type": "string", + "example": "/workspace/ocr_result_20260414_153000.txt", + "description": "Absolute path to the .txt file containing full extracted text." + }, + "file_saved": { + "type": "boolean", + "example": True, + "description": "True if the extracted text was saved to disk." + }, + "message": { + "type": "string", + "example": "File not found.", + "description": "Error message if applicable." + } + }, + test_payload={ + "image_path": "C:\\Users\\user\\Pictures\\sample.jpg", + "user_prompt": "Extract all visible text.", + "simulated_mode": True + } +) +def perform_ocr(input_data: dict) -> dict: + import os + + image_path = str(input_data.get('image_path', '')).strip() + user_prompt = str(input_data.get('user_prompt', '')).strip() or None + simulated_mode = input_data.get('simulated_mode', False) + + if simulated_mode: + return { + 'status': 'success', + 'summary': 'OCR complete: 5 lines, 120 characters extracted.', + 'file_path': '/workspace/ocr_result_simulated.txt', + 'file_saved': True, + 'message': '' + } + + if not image_path: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'image_path is required.'} + + if not os.path.isfile(image_path): + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} + + try: + import app.internal_action_interface as iai + result = iai.InternalActionInterface.perform_ocr(image_path, user_prompt=user_prompt) + return {**result, 'message': ''} + except Exception as e: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} + +execute = perform_ocr diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py new file mode 100644 index 00000000..d40b4dfb --- /dev/null +++ 
b/app/data/action/understand_video.py @@ -0,0 +1,92 @@ +from agent_core import action + +@action( + name="understand_video", + description="Analyses a video file by sampling keyframes and generating a narrative summary using a Vision Language Model. Use when the user shares a video and wants to know what happens in it, extract visible text, or answer a specific question about video content.", + mode="CLI", + action_sets=["document_processing, image"], + input_schema={ + "video_path": { + "type": "string", + "example": "C:\\Users\\user\\Videos\\meeting.mp4", + "description": "Absolute path to the video file (MP4, AVI, MOV supported)." + }, + "query": { + "type": "string", + "example": "What is being presented on the slides?", + "description": "Optional: specific question to answer about the video." + }, + "max_frames": { + "type": "integer", + "example": 8, + "description": "Number of evenly-spaced keyframes to sample (default: 8, max recommended: 16)." + } + }, + output_schema={ + "status": { + "type": "string", + "example": "success", + "description": "'success' if analysis completed, 'error' otherwise." + }, + "summary": { + "type": "string", + "example": "The video shows a person presenting slides about quarterly sales...", + "description": "First 500 characters of the video summary. Full summary saved to file." + }, + "file_path": { + "type": "string", + "example": "/workspace/video_summary_20260414_153000.txt", + "description": "Absolute path to the .txt file containing the full video summary." + }, + "file_saved": { + "type": "boolean", + "example": True, + "description": "True if the full summary was saved to disk." + }, + "message": { + "type": "string", + "example": "File not found.", + "description": "Error message if applicable." 
+ } + }, + test_payload={ + "video_path": "C:\\Users\\user\\Videos\\sample.mp4", + "query": "Summarise the video content.", + "max_frames": 8, + "simulated_mode": True + } +) +def understand_video(input_data: dict) -> dict: + import os + + video_path = str(input_data.get('video_path', '')).strip() + query = str(input_data.get('query', '')).strip() or None + max_frames = int(input_data.get('max_frames', 8)) + simulated_mode = input_data.get('simulated_mode', False) + + if simulated_mode: + return { + 'status': 'success', + 'summary': 'The video shows a simulated presentation with 3 speakers.', + 'file_path': '/workspace/video_summary_simulated.txt', + 'file_saved': True, + 'message': '' + } + + if not video_path: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'video_path is required.'} + + if not os.path.isfile(video_path): + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} + + try: + import app.internal_action_interface as iai + result = iai.InternalActionInterface.understand_video(video_path, query=query, max_frames=max_frames) + return {**result, 'message': ''} + except RuntimeError as e: + # Catches missing opencv gracefully + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} + except Exception as e: + return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} + +execute = understand_video diff --git a/app/internal_action_interface.py b/app/internal_action_interface.py index a1486f1b..45cb7c8a 100644 --- a/app/internal_action_interface.py +++ b/app/internal_action_interface.py @@ -5,6 +5,8 @@ framework internal functions. 
""" +from __future__ import annotations + from typing import Dict, Any, Optional, List, TYPE_CHECKING from app.llm import LLMInterface, LLMCallType from app.vlm_interface import VLMInterface @@ -98,6 +100,69 @@ def describe_image(cls, image_path: str, prompt: Optional[str] = None) -> str: raise RuntimeError("InternalActionInterface not initialized with VLMInterface.") return cls.vlm_interface.describe_image(image_path, user_prompt=prompt) + @classmethod + def perform_ocr(cls, image_path: str, user_prompt: Optional[str] = None) -> dict: + """ + Run OCR on an image and persist the extracted text to workspace. + Returns a concise status dict + saved file path to avoid TUI flooding. + """ + if cls.vlm_interface is None: + raise RuntimeError("InternalActionInterface not initialized with VLMInterface.") + + import os + from datetime import datetime + + raw_text = cls.vlm_interface.describe_image_ocr(image_path, user_prompt=user_prompt) + + # Persist to workspace to prevent token ballooning in the agent context + ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"ocr_result_{ts}.txt") + with open(out_path, "w", encoding="utf-8") as f: + f.write(raw_text) + + line_count = raw_text.count("\n") + 1 + char_count = len(raw_text) + return { + "status": "success", + "summary": f"OCR complete: {line_count} lines, {char_count} characters extracted.", + "text": raw_text, + "file_path": out_path, + "file_saved": True, + } + + @classmethod + def understand_video( + cls, + video_path: str, + query: Optional[str] = None, + max_frames: int = 8, + ) -> dict: + """ + Analyse a video by extracting keyframes and querying the VLM. + Persists the summary to workspace to avoid TUI/context flooding. 
+ """ + if cls.vlm_interface is None: + raise RuntimeError("InternalActionInterface not initialized with VLMInterface.") + + import os + from datetime import datetime + + summary = cls.vlm_interface.describe_video_frames( + video_path, query=query, max_frames=max_frames + ) + + ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"video_summary_{ts}.txt") + with open(out_path, "w", encoding="utf-8") as f: + f.write(summary) + + return { + "status": "success", + "summary": summary[:500] + ("..." if len(summary) > 500 else ""), + "file_path": out_path, + "file_saved": True, + } + # ───────────────── Memory Search ───────────────── @classmethod diff --git a/requirements.txt b/requirements.txt index bd6fdd9f..53eda7dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,3 +45,4 @@ watchdog telethon croniter>=2.0.0 # Cron expression parsing for scheduler playwright # WhatsApp Web browser automation +opencv-python-headless # Video analysis keyframe extraction diff --git a/tests/test_step1_vlm_interface.py b/tests/test_step1_vlm_interface.py new file mode 100644 index 00000000..c1bf516f --- /dev/null +++ b/tests/test_step1_vlm_interface.py @@ -0,0 +1,563 @@ +# -*- coding: utf-8 -*- +""" +Step 1 Verification Suite — VLM Interface Extensions +Tests for: describe_image_ocr, describe_video_frames, _openai_describe_bytes_plain, + _gemini_describe_video_frames, _multi_frame_describe_fallback, + GeminiClient.generate_multimodal_multi_image + +Run with: + python -m pytest tests/test_step1_vlm_interface.py -v + +ALL tests must pass. Zero real API calls are made. +Zero imports of app.* are required — only agent_core. 
+""" + +from __future__ import annotations + +import base64 +import io +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch, call + +# ───────────────────────────────────────────────────────────────── +# SECTION A: GeminiClient.generate_multimodal_multi_image +# ───────────────────────────────────────────────────────────────── + +class TestGeminiClientMultiImage(unittest.TestCase): + """ + VERIFY: GeminiClient.generate_multimodal_multi_image exists and + constructs the correct payload (one inlineData part per frame). + """ + + def _make_client(self): + from agent_core.core.llm.google_gemini_client import GeminiClient + client = GeminiClient.__new__(GeminiClient) + client._api_key = "fake-key" + client._api_base = "https://generativelanguage.googleapis.com" + client._api_version = "v1beta" + client._timeout = 30 + return client + + def test_method_exists(self): + """generate_multimodal_multi_image must exist on GeminiClient.""" + from agent_core.core.llm.google_gemini_client import GeminiClient + self.assertTrue( + hasattr(GeminiClient, "generate_multimodal_multi_image"), + "FAIL: GeminiClient.generate_multimodal_multi_image not found. 
" + "Add it to agent_core/core/llm/google_gemini_client.py" + ) + + def test_payload_contains_multiple_inline_data_parts(self): + """The API payload must contain one inlineData entry per frame passed in.""" + client = self._make_client() + fake_response = { + "candidates": [{"content": {"parts": [{"text": "video summary"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 100, "promptTokenCount": 80, "candidatesTokenCount": 20}, + } + + captured_payload = {} + + def fake_post(path, payload): + captured_payload.update(payload) + return fake_response + + client._post_json = fake_post + + frame_bytes = [b"frame1_bytes", b"frame2_bytes", b"frame3_bytes"] + result = client.generate_multimodal_multi_image( + "gemini-2.5-flash", + text="What is happening?", + image_bytes_list=frame_bytes, + system_prompt="Analyse these frames.", + temperature=0.5, + json_mode=False, + ) + + # Assert return shape + self.assertIn("content", result) + self.assertIn("tokens_used", result) + self.assertEqual(result["content"], "video summary") + + # Assert payload structure: must have text part + 3 inlineData parts + parts = captured_payload["contents"][0]["parts"] + inline_parts = [p for p in parts if "inlineData" in p] + text_parts = [p for p in parts if "text" in p] + + self.assertEqual(len(inline_parts), 3, + f"Expected 3 inlineData parts, got {len(inline_parts)}") + self.assertEqual(len(text_parts), 1, + f"Expected 1 text part, got {len(text_parts)}") + + # Assert each frame is correctly base64-encoded in the payload + for i, (part, raw) in enumerate(zip(inline_parts, frame_bytes)): + expected_b64 = base64.b64encode(raw).decode() + actual_b64 = part["inlineData"]["data"] + self.assertEqual(actual_b64, expected_b64, + f"Frame {i+1}: base64 mismatch in payload") + + def test_system_prompt_is_included(self): + """systemInstruction must be present in payload when system_prompt is given.""" + client = self._make_client() + fake_response = { + "candidates": [{"content": 
{"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 10, "promptTokenCount": 8, "candidatesTokenCount": 2}, + } + captured = {} + client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] + + client.generate_multimodal_multi_image( + "gemini-2.5-flash", + text="Describe", + image_bytes_list=[b"img"], + system_prompt="You are an expert.", + ) + self.assertIn("systemInstruction", captured, + "FAIL: systemInstruction missing from payload when system_prompt is provided") + + def test_no_system_prompt_omits_key(self): + """systemInstruction must be absent when system_prompt is None.""" + client = self._make_client() + fake_response = { + "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 5}, + } + captured = {} + client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] + + client.generate_multimodal_multi_image( + "gemini-2.5-flash", + text="Describe", + image_bytes_list=[b"img"], + system_prompt=None, + ) + self.assertNotIn("systemInstruction", captured, + "FAIL: systemInstruction should be absent when no system_prompt is given") + + +# ───────────────────────────────────────────────────────────────── +# SECTION B: VLMInterface._openai_describe_bytes_plain +# ───────────────────────────────────────────────────────────────── + +class TestOpenAIDescribeBytesPlain(unittest.TestCase): + """ + VERIFY: _openai_describe_bytes_plain exists and does NOT set + response_format=json_object (that would break raw OCR text output). 
+ """ + + def _make_vlm(self): + """Instantiate VLMInterface in deferred mode so no real API calls are made.""" + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": "openai", + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider="openai", deferred=True) + vlm.provider = "openai" + return vlm + + def test_method_exists(self): + """_openai_describe_bytes_plain must exist on VLMInterface.""" + from agent_core.core.impl.vlm.interface import VLMInterface + self.assertTrue( + hasattr(VLMInterface, "_openai_describe_bytes_plain"), + "FAIL: _openai_describe_bytes_plain not found on VLMInterface. " + "Add it to agent_core/core/impl/vlm/interface.py" + ) + + def test_no_response_format_json_object(self): + """ + CRITICAL: _openai_describe_bytes_plain must NOT pass + response_format={'type': 'json_object'} to the OpenAI client. + OCR returns raw text — json_object enforces a JSON wrapper and breaks it. + """ + vlm = self._make_vlm() + + mock_choice = MagicMock() + mock_choice.message.content = "Hello World\nLine 2" + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 50 + mock_response.usage.completion_tokens = 20 + + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + vlm._openai_describe_bytes_plain(b"fake_image_bytes", "sys prompt", "Extract text") + + call_kwargs = vlm.client.chat.completions.create.call_args[1] + self.assertNotIn("response_format", call_kwargs, + "FAIL: response_format is present in _openai_describe_bytes_plain. 
" + "Remove it — OCR must return raw text, not JSON.") + + def test_returns_dict_with_content_and_tokens(self): + """Must return dict with 'content' and 'tokens_used' keys.""" + vlm = self._make_vlm() + + mock_choice = MagicMock() + mock_choice.message.content = "Extracted: Invoice #1234" + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 40 + mock_response.usage.completion_tokens = 15 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + result = vlm._openai_describe_bytes_plain(b"img", None, "Extract text") + + self.assertIsInstance(result, dict) + self.assertIn("content", result) + self.assertIn("tokens_used", result) + self.assertEqual(result["content"], "Extracted: Invoice #1234") + self.assertEqual(result["tokens_used"], 55) + + def test_max_tokens_is_at_least_4096(self): + """ + OCR may produce large amounts of text. max_tokens must be >= 4096. + """ + vlm = self._make_vlm() + mock_choice = MagicMock() + mock_choice.message.content = "text" + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + vlm._openai_describe_bytes_plain(b"img", None, "Extract text") + + call_kwargs = vlm.client.chat.completions.create.call_args[1] + max_tokens = call_kwargs.get("max_tokens", call_kwargs.get("max_completion_tokens", 0)) + self.assertGreaterEqual(max_tokens, 4096, + f"FAIL: max_tokens={max_tokens}. 
OCR needs at least 4096 to handle large text blocks.") + + +# ───────────────────────────────────────────────────────────────── +# SECTION C: VLMInterface.describe_image_ocr +# ───────────────────────────────────────────────────────────────── + +class TestDescribeImageOcr(unittest.TestCase): + """ + VERIFY: describe_image_ocr exists, routes to the correct provider branch, + uses an OCR-specific system prompt, and handles FileNotFoundError. + """ + + def _make_vlm_patched(self, provider="openai"): + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": provider, + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider=provider, deferred=True) + vlm.provider = provider + return vlm + + def test_method_exists(self): + from agent_core.core.impl.vlm.interface import VLMInterface + self.assertTrue( + hasattr(VLMInterface, "describe_image_ocr"), + "FAIL: describe_image_ocr not found on VLMInterface. " + "Add it to agent_core/core/impl/vlm/interface.py" + ) + + def test_raises_file_not_found_for_missing_path(self): + """Must raise FileNotFoundError when the image path does not exist.""" + vlm = self._make_vlm_patched() + with self.assertRaises(FileNotFoundError): + vlm.describe_image_ocr("/nonexistent/path/image.png") + + def test_routes_to_plain_method_for_openai(self): + """ + For provider='openai', describe_image_ocr must call + _openai_describe_bytes_plain (not _openai_describe_bytes). + This ensures json_object response format is not applied. 
+ """ + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png_data") + tmp_path = f.name + + try: + vlm._openai_describe_bytes_plain = MagicMock( + return_value={"content": "INVOICE\nTotal: $100", "tokens_used": 30} + ) + vlm._openai_describe_bytes = MagicMock() + + result = vlm.describe_image_ocr(tmp_path) + + vlm._openai_describe_bytes_plain.assert_called_once() + vlm._openai_describe_bytes.assert_not_called() + self.assertEqual(result, "INVOICE\nTotal: $100") + finally: + os.unlink(tmp_path) + + def test_system_prompt_contains_ocr_keywords(self): + """ + The system prompt passed to the provider must contain OCR-specific + language ('OCR', 'extract', 'text') — not a generic description prompt. + """ + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png_data") + tmp_path = f.name + + try: + captured_sys_prompt = {} + + def capture_plain(image_bytes, sys_prompt, user_prompt): + captured_sys_prompt["sys"] = sys_prompt or "" + return {"content": "Hello", "tokens_used": 10} + + vlm._openai_describe_bytes_plain = capture_plain + vlm.describe_image_ocr(tmp_path) + + sys_lower = captured_sys_prompt.get("sys", "").lower() + self.assertTrue( + "ocr" in sys_lower or "extract" in sys_lower or "text" in sys_lower, + f"FAIL: OCR system prompt does not mention OCR/extraction. 
Got: '{captured_sys_prompt.get('sys')}'" + ) + finally: + os.unlink(tmp_path) + + def test_returns_string(self): + """describe_image_ocr must return a string, not a dict.""" + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png_data") + tmp_path = f.name + + try: + vlm._openai_describe_bytes_plain = MagicMock( + return_value={"content": "TEXT FROM IMAGE", "tokens_used": 20} + ) + result = vlm.describe_image_ocr(tmp_path) + self.assertIsInstance(result, str) + finally: + os.unlink(tmp_path) + + +# ───────────────────────────────────────────────────────────────── +# SECTION D: VLMInterface.describe_video_frames +# ───────────────────────────────────────────────────────────────── + +class TestDescribeVideoFrames(unittest.TestCase): + """ + VERIFY: describe_video_frames exists, handles missing file, + handles missing opencv gracefully, and calls the correct + provider path (Gemini native vs. fallback). + """ + + def _make_vlm_patched(self, provider="openai"): + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": provider, + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider=provider, deferred=True) + vlm.provider = provider + return vlm + + def test_method_exists(self): + from agent_core.core.impl.vlm.interface import VLMInterface + self.assertTrue( + hasattr(VLMInterface, "describe_video_frames"), + "FAIL: describe_video_frames not found on VLMInterface." 
+ ) + + def test_raises_file_not_found_for_missing_video(self): + """Must raise FileNotFoundError when the video path does not exist.""" + vlm = self._make_vlm_patched() + with self.assertRaises(FileNotFoundError): + vlm.describe_video_frames("/nonexistent/video.mp4") + + def test_raises_runtime_error_when_opencv_missing(self): + """ + When opencv is not installed, describe_video_frames must raise + a RuntimeError with an actionable install message — not an ImportError. + This ensures a clean error surface for the user. + """ + vlm = self._make_vlm_patched() + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4_data") + tmp_path = f.name + + try: + with patch.dict(sys.modules, {"cv2": None}): + with self.assertRaises(RuntimeError) as ctx: + vlm.describe_video_frames(tmp_path) + self.assertIn("opencv", str(ctx.exception).lower(), + "FAIL: RuntimeError message must mention 'opencv' to guide the user.") + finally: + os.unlink(tmp_path) + + def test_gemini_uses_native_multi_image_method(self): + """ + For provider='gemini', describe_video_frames must call + _gemini_describe_video_frames (native multi-image path). + It must NOT fall back to the sequential per-frame fallback. 
+ """ + vlm = self._make_vlm_patched(provider="gemini") + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4_data") + tmp_path = f.name + + try: + mock_cv2 = MagicMock() + mock_cap = MagicMock() + mock_cap.get.return_value = 30.0 + mock_cap.read.return_value = (True, MagicMock()) + mock_cv2.VideoCapture.return_value = mock_cap + mock_cv2.imencode.return_value = (True, MagicMock(tobytes=lambda: b"frame")) + + vlm._gemini_describe_video_frames = MagicMock(return_value="Gemini video summary") + vlm._multi_frame_describe_fallback = MagicMock(return_value="fallback summary") + + with patch.dict(sys.modules, {"cv2": mock_cv2}): + result = vlm.describe_video_frames(tmp_path, max_frames=2) + + vlm._gemini_describe_video_frames.assert_called_once() + vlm._multi_frame_describe_fallback.assert_not_called() + self.assertEqual(result, "Gemini video summary") + finally: + os.unlink(tmp_path) + + def test_non_gemini_uses_fallback(self): + """ + For provider='openai', describe_video_frames must call + _multi_frame_describe_fallback (sequential frame path). 
+ """ + vlm = self._make_vlm_patched(provider="openai") + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4_data") + tmp_path = f.name + + try: + mock_cv2 = MagicMock() + mock_cap = MagicMock() + mock_cap.get.return_value = 30.0 + mock_cap.read.return_value = (True, MagicMock()) + mock_cv2.VideoCapture.return_value = mock_cap + mock_cv2.imencode.return_value = (True, MagicMock(tobytes=lambda: b"frame")) + + vlm._gemini_describe_video_frames = MagicMock(return_value="should not be called") + vlm._multi_frame_describe_fallback = MagicMock(return_value="OpenAI fallback summary") + + with patch.dict(sys.modules, {"cv2": mock_cv2}): + result = vlm.describe_video_frames(tmp_path, max_frames=2) + + vlm._multi_frame_describe_fallback.assert_called_once() + vlm._gemini_describe_video_frames.assert_not_called() + self.assertEqual(result, "OpenAI fallback summary") + finally: + os.unlink(tmp_path) + + +# ───────────────────────────────────────────────────────────────── +# SECTION E: Regression — existing describe_image still works +# ───────────────────────────────────────────────────────────────── + +class TestRegressionDescribeImage(unittest.TestCase): + """ + REGRESSION GUARD: Ensure existing describe_image and describe_image_bytes + are untouched and still produce the same output contract. + This confirms Step 1 did not break any existing functionality. 
+ """ + + def _make_vlm_patched(self): + with patch("app.models.factory.ModelFactory.create") as mock_create: + mock_create.return_value = { + "model": "gpt-4o", + "client": MagicMock(), + "gemini_client": None, + "remote_url": None, + "anthropic_client": None, + "initialized": True, + "byteplus": None, + "provider": "openai", + } + from agent_core.core.impl.vlm.interface import VLMInterface + vlm = VLMInterface(provider="openai", deferred=True) + vlm.provider = "openai" + return vlm + + def test_describe_image_still_raises_on_missing_file(self): + """describe_image must still raise FileNotFoundError (unchanged).""" + vlm = self._make_vlm_patched() + with self.assertRaises(FileNotFoundError): + vlm.describe_image("/does/not/exist.png") + + def test_describe_image_bytes_returns_string(self): + """describe_image_bytes must still return a plain string.""" + vlm = self._make_vlm_patched() + + mock_choice = MagicMock() + mock_choice.message.content = '{"content": "A cat"}' + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + result = vlm.describe_image_bytes(b"fake_image", user_prompt="Describe this image.") + self.assertIsInstance(result, str) + + def test_describe_image_bytes_uses_json_response_format(self): + """ + REGRESSION: The ORIGINAL describe_image_bytes must still use + response_format=json_object (this is the existing contract). + It should NOT be affected by the plain-text OCR variant. 
+ """ + vlm = self._make_vlm_patched() + + mock_choice = MagicMock() + mock_choice.message.content = '{"content": "A dog"}' + mock_response = MagicMock() + mock_response.choices = [mock_choice] + mock_response.usage.prompt_tokens = 10 + mock_response.usage.completion_tokens = 5 + vlm.client = MagicMock() + vlm.client.chat.completions.create.return_value = mock_response + + vlm.describe_image_bytes(b"fake_image", user_prompt="Describe this.") + + call_kwargs = vlm.client.chat.completions.create.call_args[1] + # Original describe_image_bytes should still request json_object + self.assertIn("response_format", call_kwargs, + "REGRESSION: describe_image_bytes lost response_format=json_object. " + "Only the new _openai_describe_bytes_plain should omit it.") + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_step2_iai_methods.py b/tests/test_step2_iai_methods.py new file mode 100644 index 00000000..415689eb --- /dev/null +++ b/tests/test_step2_iai_methods.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +import unittest +from unittest.mock import MagicMock, patch, mock_open +import os +from datetime import datetime +import asyncio + +# Mocking the constants before import if necessary, but app.config should be fine +import sys +from unittest.mock import PropertyMock + +class TestStep2InternalInterface(unittest.TestCase): + def setUp(self): + # We need to mock InternalActionInterface dependencies + self.iai_patcher = patch('app.internal_action_interface.InternalActionInterface', autospec=True) + # However, we want to test the ACTUAL methods on InternalActionInterface + # So we import it and patch its class attributes + + from app.internal_action_interface import InternalActionInterface + self.iai = InternalActionInterface + self.iai.vlm_interface = MagicMock() + self.iai.state_manager = MagicMock() + self.iai.ui_adapter = MagicMock() + + @patch('os.path.join', side_effect=lambda *args: "/".join(args)) + @patch('builtins.open', 
new_callable=mock_open) + @patch('app.internal_action_interface.AGENT_WORKSPACE_ROOT', "/mock/workspace") + def test_perform_ocr_saves_file_and_returns_dict(self, mock_file, mock_join): + # Setup + self.iai.vlm_interface.describe_image_ocr.return_value = "Extracted Text Content" + + # Execute + result = self.iai.perform_ocr("some_image.jpg", user_prompt="Test Prompt") + + # Verify call to VLM + self.iai.vlm_interface.describe_image_ocr.assert_called_once_with("some_image.jpg", user_prompt="Test Prompt") + + # Verify file saving + mock_file.assert_called_once() + handle = mock_file() + handle.write.assert_called_once_with("Extracted Text Content") + + # Verify return dict + self.assertEqual(result['status'], 'success') + self.assertTrue(result['file_saved']) + self.assertIn('ocr_result_', result['file_path']) + self.assertIn('OCR complete', result['summary']) + + @patch('os.path.join', side_effect=lambda *args: "/".join(args)) + @patch('builtins.open', new_callable=mock_open) + @patch('app.internal_action_interface.AGENT_WORKSPACE_ROOT', "/mock/workspace") + def test_understand_video_saves_file_and_returns_dict(self, mock_file, mock_join): + # Setup + self.iai.vlm_interface.describe_video_frames.return_value = "Video Summary Content" + + # Execute + result = self.iai.understand_video("some_video.mp4", query="What happens?") + + # Verify call to VLM + self.iai.vlm_interface.describe_video_frames.assert_called_once_with( + "some_video.mp4", query="What happens?", max_frames=8 + ) + + # Verify file saving + mock_file.assert_called_once() + handle = mock_file() + handle.write.assert_called_once_with("Video Summary Content") + + # Verify return dict + self.assertEqual(result['status'], 'success') + self.assertTrue(result['file_saved']) + self.assertIn('video_summary_', result['file_path']) + self.assertEqual(result['summary'], "Video Summary Content") + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_step2_internal_action_interface.py 
b/tests/test_step2_internal_action_interface.py new file mode 100644 index 00000000..8e8e8d0c --- /dev/null +++ b/tests/test_step2_internal_action_interface.py @@ -0,0 +1,599 @@ +# tests/test_step2_internal_action_interface.py +# -*- coding: utf-8 -*- +""" +Step 2 Verification Suite — InternalActionInterface Extensions +Tests for: perform_ocr() and understand_video() classmethods + +Run with: + python -m pytest tests/test_step2_internal_action_interface.py -v + +ALL tests must pass. Zero real API calls. Zero real file system dependency +outside of tempfile — all workspace writes use a patched AGENT_WORKSPACE_ROOT. + +PREREQUISITE: Step 1 tests must already be passing before running these. +""" + +from __future__ import annotations + +import os +import sys +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch, PropertyMock + + +# ───────────────────────────────────────────────────────────────── +# HELPERS +# ───────────────────────────────────────────────────────────────── + +def _reset_iai(): + """Reset InternalActionInterface class-level state between tests.""" + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.vlm_interface = None + InternalActionInterface.llm_interface = None + InternalActionInterface.task_manager = None + InternalActionInterface.state_manager = None + + +def _inject_mock_vlm(mock_vlm=None): + """Inject a mock VLMInterface into InternalActionInterface.""" + from app.internal_action_interface import InternalActionInterface + if mock_vlm is None: + mock_vlm = MagicMock() + InternalActionInterface.vlm_interface = mock_vlm + return mock_vlm + + +# ───────────────────────────────────────────────────────────────── +# SECTION A: Method Existence & Signatures +# ───────────────────────────────────────────────────────────────── + +class TestMethodExistence(unittest.TestCase): + """ + VERIFY: Both new classmethods exist and are classmethods (not staticmethods + or 
instance methods), matching the pattern of describe_image(). + """ + + def test_perform_ocr_exists(self): + from app.internal_action_interface import InternalActionInterface + self.assertTrue( + hasattr(InternalActionInterface, "perform_ocr"), + "FAIL: InternalActionInterface.perform_ocr not found. " + "Add it to app/internal_action_interface.py" + ) + + def test_understand_video_exists(self): + from app.internal_action_interface import InternalActionInterface + self.assertTrue( + hasattr(InternalActionInterface, "understand_video"), + "FAIL: InternalActionInterface.understand_video not found. " + "Add it to app/internal_action_interface.py" + ) + + def test_perform_ocr_is_classmethod(self): + """perform_ocr must be a classmethod, not a staticmethod or instance method.""" + from app.internal_action_interface import InternalActionInterface + method = InternalActionInterface.__dict__.get("perform_ocr") + self.assertIsInstance( + method, classmethod, + "FAIL: perform_ocr must be a @classmethod (matching describe_image pattern)." + ) + + def test_understand_video_is_classmethod(self): + """understand_video must be a classmethod.""" + from app.internal_action_interface import InternalActionInterface + method = InternalActionInterface.__dict__.get("understand_video") + self.assertIsInstance( + method, classmethod, + "FAIL: understand_video must be a @classmethod." + ) + + def test_perform_ocr_accepts_image_path(self): + """perform_ocr must accept image_path as its first positional argument.""" + import inspect + from app.internal_action_interface import InternalActionInterface + sig = inspect.signature(InternalActionInterface.perform_ocr) + params = list(sig.parameters.keys()) + self.assertIn("image_path", params, + f"FAIL: perform_ocr must accept 'image_path'. 
Got params: {params}") + + def test_understand_video_accepts_video_path_and_query(self): + """understand_video must accept video_path and query parameters.""" + import inspect + from app.internal_action_interface import InternalActionInterface + sig = inspect.signature(InternalActionInterface.understand_video) + params = list(sig.parameters.keys()) + self.assertIn("video_path", params, + f"FAIL: understand_video must accept 'video_path'. Got: {params}") + self.assertIn("query", params, + f"FAIL: understand_video must accept 'query'. Got: {params}") + + def tearDown(self): + _reset_iai() + + +# ───────────────────────────────────────────────────────────────── +# SECTION B: VLM Guard — RuntimeError when not initialized +# ───────────────────────────────────────────────────────────────── + +class TestVLMGuard(unittest.TestCase): + """ + VERIFY: Both methods raise RuntimeError when vlm_interface is None, + matching the guard pattern of describe_image() and describe_screen(). + """ + + def setUp(self): + _reset_iai() + + def test_perform_ocr_raises_when_vlm_not_initialized(self): + from app.internal_action_interface import InternalActionInterface + # vlm_interface is None (default state) + with self.assertRaises(RuntimeError) as ctx: + InternalActionInterface.perform_ocr("/some/image.png") + self.assertIn( + "VLMInterface", str(ctx.exception), + "FAIL: RuntimeError message must mention 'VLMInterface' to match " + "existing error message pattern in describe_image/describe_screen." + ) + + def test_understand_video_raises_when_vlm_not_initialized(self): + from app.internal_action_interface import InternalActionInterface + with self.assertRaises(RuntimeError) as ctx: + InternalActionInterface.understand_video("/some/video.mp4") + self.assertIn( + "VLMInterface", str(ctx.exception), + "FAIL: RuntimeError message must mention 'VLMInterface'." 
+ ) + + def tearDown(self): + _reset_iai() + + +# ───────────────────────────────────────────────────────────────── +# SECTION C: perform_ocr — Return Contract +# ───────────────────────────────────────────────────────────────── + +class TestPerformOcrReturnContract(unittest.TestCase): + """ + VERIFY: perform_ocr returns a dict with the correct keys, + correct types, and saves extracted text to AGENT_WORKSPACE_ROOT. + """ + + def setUp(self): + _reset_iai() + self.tmp_workspace = tempfile.mkdtemp() + + def _run_perform_ocr(self, ocr_text="Hello World\nLine 2\nLine 3"): + """Helper: run perform_ocr with a temp image and mocked VLM.""" + mock_vlm = MagicMock() + mock_vlm.describe_image_ocr.return_value = ocr_text + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png") + image_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + result = InternalActionInterface.perform_ocr(image_path) + finally: + os.unlink(image_path) + + return result, mock_vlm + + def test_returns_dict(self): + result, _ = self._run_perform_ocr() + self.assertIsInstance(result, dict, + "FAIL: perform_ocr must return a dict, not a plain string.") + + def test_return_dict_has_required_keys(self): + """Must have: status, summary, file_path, file_saved.""" + result, _ = self._run_perform_ocr() + for key in ("status", "summary", "file_path", "file_saved"): + self.assertIn(key, result, + f"FAIL: perform_ocr return dict is missing key '{key}'.") + + def test_status_is_success_on_happy_path(self): + result, _ = self._run_perform_ocr() + self.assertEqual(result["status"], "success", + "FAIL: status must be 'success' on happy path.") + + def test_file_saved_is_true(self): + result, _ = self._run_perform_ocr() + self.assertTrue(result["file_saved"], + "FAIL: file_saved must be True after successful OCR.") + + def 
test_file_path_exists_on_disk(self): + """The file_path in the result must be a real file that was written.""" + result, _ = self._run_perform_ocr("Invoice #1234\nTotal: $99.99") + self.assertTrue( + os.path.isfile(result["file_path"]), + f"FAIL: file_path '{result['file_path']}' does not exist on disk. " + "perform_ocr must write the extracted text to workspace." + ) + + def test_file_content_matches_ocr_output(self): + """The saved file must contain the raw OCR text exactly as returned by VLM.""" + ocr_text = "CONFIDENTIAL\nProject Alpha\nBudget: $1,000,000" + result, _ = self._run_perform_ocr(ocr_text) + + with open(result["file_path"], "r", encoding="utf-8") as f: + saved_content = f.read() + + self.assertEqual(saved_content, ocr_text, + "FAIL: Saved file content does not match OCR output. " + "The raw text must be written verbatim — no modification.") + + def test_file_saved_to_agent_workspace_root(self): + """The saved file must be inside AGENT_WORKSPACE_ROOT, not a temp dir.""" + result, _ = self._run_perform_ocr() + self.assertTrue( + result["file_path"].startswith(self.tmp_workspace), + f"FAIL: File saved to '{result['file_path']}' but expected " + f"it to be inside AGENT_WORKSPACE_ROOT='{self.tmp_workspace}'. " + "Do not hardcode paths — use AGENT_WORKSPACE_ROOT from app.config." + ) + + def test_file_has_txt_extension(self): + """Output file must be a .txt file (readable by do_chat_with_attachments).""" + result, _ = self._run_perform_ocr() + self.assertTrue( + result["file_path"].endswith(".txt"), + f"FAIL: Output file must have .txt extension. Got: '{result['file_path']}'" + ) + + def test_summary_does_not_contain_full_text(self): + """ + Summary must be a SHORT description, not the full OCR text. + The whole point of saving to file is to keep the agent context lean. + If summary == full text, the TUI flooding problem is not solved. 
+ """ + long_text = "Line\n" * 200 # 200 lines, definitely not a summary + result, _ = self._run_perform_ocr(long_text) + self.assertLess( + len(result["summary"]), len(long_text), + "FAIL: summary contains the full OCR text. It must be a short " + "description (e.g. 'OCR complete: 200 lines, 1000 characters') " + "to prevent context window flooding." + ) + + def test_summary_mentions_line_or_char_count(self): + """Summary must be informative — mention lines or characters extracted.""" + result, _ = self._run_perform_ocr("Hello\nWorld") + summary_lower = result["summary"].lower() + has_count_info = ( + "line" in summary_lower or + "char" in summary_lower or + "word" in summary_lower or + "extracted" in summary_lower + ) + self.assertTrue(has_count_info, + f"FAIL: summary '{result['summary']}' is not informative. " + "It must mention lines/characters extracted so the agent knows what happened.") + + def test_calls_describe_image_ocr_not_describe_image(self): + """ + CRITICAL: Must call vlm_interface.describe_image_ocr(), NOT + vlm_interface.describe_image(). Using describe_image is exactly + the existing bug that Issue #155 was filed for. 
+ """ + mock_vlm = MagicMock() + mock_vlm.describe_image_ocr.return_value = "Some text" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png") + image_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.perform_ocr(image_path) + finally: + os.unlink(image_path) + + mock_vlm.describe_image_ocr.assert_called_once() + mock_vlm.describe_image.assert_not_called() + + def test_user_prompt_forwarded_to_vlm(self): + """Optional user_prompt must be passed through to vlm.describe_image_ocr.""" + mock_vlm = MagicMock() + mock_vlm.describe_image_ocr.return_value = "text" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake_png") + image_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.perform_ocr(image_path, user_prompt="Focus on prices only.") + finally: + os.unlink(image_path) + + call_kwargs = mock_vlm.describe_image_ocr.call_args + # Check the user_prompt was forwarded (positional or keyword) + all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) + self.assertIn("Focus on prices only.", all_args, + "FAIL: user_prompt was not forwarded to vlm_interface.describe_image_ocr(). 
" + "The OCR method must pass user_prompt through.") + + def tearDown(self): + _reset_iai() + import shutil + shutil.rmtree(self.tmp_workspace, ignore_errors=True) + + +# ───────────────────────────────────────────────────────────────── +# SECTION D: understand_video — Return Contract +# ───────────────────────────────────────────────────────────────── + +class TestUnderstandVideoReturnContract(unittest.TestCase): + """ + VERIFY: understand_video returns a correct dict, saves summary to + workspace, truncates summary to prevent TUI flooding, and + forwards all parameters correctly to vlm_interface. + """ + + def setUp(self): + _reset_iai() + self.tmp_workspace = tempfile.mkdtemp() + + def _run_understand_video(self, summary_text="The video shows a presentation.", query=None, max_frames=8): + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = summary_text + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + result = InternalActionInterface.understand_video( + video_path, query=query, max_frames=max_frames + ) + finally: + os.unlink(video_path) + + return result, mock_vlm + + def test_returns_dict(self): + result, _ = self._run_understand_video() + self.assertIsInstance(result, dict, + "FAIL: understand_video must return a dict.") + + def test_return_dict_has_required_keys(self): + result, _ = self._run_understand_video() + for key in ("status", "summary", "file_path", "file_saved"): + self.assertIn(key, result, + f"FAIL: understand_video return dict is missing key '{key}'.") + + def test_status_is_success_on_happy_path(self): + result, _ = self._run_understand_video() + self.assertEqual(result["status"], "success") + + def test_file_saved_is_true(self): + result, _ = 
self._run_understand_video() + self.assertTrue(result["file_saved"]) + + def test_file_path_exists_on_disk(self): + result, _ = self._run_understand_video("A meeting recording with 3 participants.") + self.assertTrue( + os.path.isfile(result["file_path"]), + f"FAIL: file_path '{result['file_path']}' does not exist. " + "understand_video must write the full summary to workspace." + ) + + def test_full_summary_saved_to_file(self): + """The full, untruncated summary must be in the saved file.""" + long_summary = "Frame description. " * 100 # deliberately long + result, _ = self._run_understand_video(long_summary) + + with open(result["file_path"], "r", encoding="utf-8") as f: + saved = f.read() + + self.assertEqual(saved, long_summary, + "FAIL: The saved file must contain the FULL summary. " + "Truncation only applies to the return dict's 'summary' key.") + + def test_summary_in_return_dict_is_truncated_for_long_content(self): + """ + For long video summaries, the 'summary' key in the returned dict + must be truncated (<=500 chars + ellipsis) to prevent context flooding. + The full content is in the file — the dict summary is just a preview. + """ + long_summary = "X" * 2000 + result, _ = self._run_understand_video(long_summary) + self.assertLessEqual( + len(result["summary"]), 510, # 500 + len("...") + f"FAIL: summary in return dict is {len(result['summary'])} chars. " + "Must be truncated to ~500 chars to prevent agent context flooding." + ) + + def test_short_summary_not_truncated(self): + """Short summaries (<=500 chars) must be returned as-is without ellipsis.""" + short_summary = "A quick 30-second tutorial on Python loops." 
+ result, _ = self._run_understand_video(short_summary) + self.assertEqual(result["summary"], short_summary, + "FAIL: Short summary was unexpectedly truncated or modified.") + + def test_file_saved_to_agent_workspace_root(self): + result, _ = self._run_understand_video() + self.assertTrue( + result["file_path"].startswith(self.tmp_workspace), + f"FAIL: File saved to wrong location. Expected inside " + f"AGENT_WORKSPACE_ROOT='{self.tmp_workspace}'." + ) + + def test_file_has_txt_extension(self): + result, _ = self._run_understand_video() + self.assertTrue(result["file_path"].endswith(".txt"), + "FAIL: Output file must be .txt") + + def test_video_filename_distinct_from_ocr_filename(self): + """ + Video summary files must have a distinct filename prefix from OCR files + to avoid confusion in workspace (e.g. 'video_summary_' vs 'ocr_result_'). + """ + result, _ = self._run_understand_video() + filename = os.path.basename(result["file_path"]) + self.assertFalse( + filename.startswith("ocr_"), + f"FAIL: Video summary file '{filename}' starts with 'ocr_'. " + "Video and OCR output files must have distinct prefixes." 
+ ) + + def test_calls_describe_video_frames_not_describe_image(self): + """Must delegate to vlm_interface.describe_video_frames(), not describe_image().""" + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = "summary" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.understand_video(video_path) + finally: + os.unlink(video_path) + + mock_vlm.describe_video_frames.assert_called_once() + mock_vlm.describe_image.assert_not_called() + + def test_query_forwarded_to_vlm(self): + """The query parameter must be forwarded to describe_video_frames.""" + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = "answer" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.understand_video(video_path, query="What is on slide 3?") + finally: + os.unlink(video_path) + + call_kwargs = mock_vlm.describe_video_frames.call_args + all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) + self.assertIn("What is on slide 3?", all_args, + "FAIL: query not forwarded to describe_video_frames.") + + def test_max_frames_forwarded_to_vlm(self): + """max_frames must be forwarded to describe_video_frames.""" + mock_vlm = MagicMock() + mock_vlm.describe_video_frames.return_value = "summary" + _inject_mock_vlm(mock_vlm) + + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(b"fake_mp4") + video_path = f.name + + try: + with 
patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): + from app.internal_action_interface import InternalActionInterface + InternalActionInterface.understand_video(video_path, max_frames=12) + finally: + os.unlink(video_path) + + call_kwargs = mock_vlm.describe_video_frames.call_args + all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) + self.assertIn(12, all_args, + "FAIL: max_frames=12 was not forwarded to describe_video_frames.") + + def tearDown(self): + _reset_iai() + import shutil + shutil.rmtree(self.tmp_workspace, ignore_errors=True) + + +# ───────────────────────────────────────────────────────────────── +# SECTION E: Regression — existing methods untouched +# ───────────────────────────────────────────────────────────────── + +class TestRegressionExistingMethods(unittest.TestCase): + """ + REGRESSION GUARD: Ensure describe_image(), describe_screen(), + and initialize() still work exactly as before Step 2. + """ + + def setUp(self): + _reset_iai() + self.tmp_workspace = tempfile.mkdtemp() + + def test_describe_image_still_raises_when_vlm_none(self): + from app.internal_action_interface import InternalActionInterface + with self.assertRaises(RuntimeError): + InternalActionInterface.describe_image("/any/path.png") + + def test_describe_image_still_returns_string(self): + """describe_image must still return str (not dict) — contract unchanged.""" + mock_vlm = MagicMock() + mock_vlm.describe_image.return_value = "A photo of a cat." 
+ _inject_mock_vlm(mock_vlm) + + from app.internal_action_interface import InternalActionInterface + result = InternalActionInterface.describe_image("/fake/path.png") + self.assertIsInstance(result, str, + "REGRESSION: describe_image must still return str, not dict.") + self.assertEqual(result, "A photo of a cat.") + + def test_initialize_still_sets_vlm_interface(self): + """initialize() must still correctly set vlm_interface class attribute.""" + from app.internal_action_interface import InternalActionInterface + + mock_vlm = MagicMock() + mock_llm = MagicMock() + mock_task = MagicMock() + mock_state = MagicMock() + + InternalActionInterface.initialize( + llm_interface=mock_llm, + task_manager=mock_task, + state_manager=mock_state, + vlm_interface=mock_vlm, + ) + + self.assertIs(InternalActionInterface.vlm_interface, mock_vlm, + "REGRESSION: initialize() no longer sets vlm_interface correctly.") + + def test_new_methods_do_not_shadow_describe_image(self): + """ + perform_ocr and understand_video must not accidentally override + or shadow describe_image on the class. 
+ """ + from app.internal_action_interface import InternalActionInterface + # All three must coexist independently + self.assertTrue(hasattr(InternalActionInterface, "describe_image")) + self.assertTrue(hasattr(InternalActionInterface, "perform_ocr")) + self.assertTrue(hasattr(InternalActionInterface, "understand_video")) + + # describe_image must still delegate to vlm.describe_image + mock_vlm = MagicMock() + mock_vlm.describe_image.return_value = "original image description" + _inject_mock_vlm(mock_vlm) + + result = InternalActionInterface.describe_image("/fake.png") + mock_vlm.describe_image.assert_called_once() + # describe_image_ocr must NOT have been called + mock_vlm.describe_image_ocr.assert_not_called() + + def tearDown(self): + _reset_iai() + import shutil + shutil.rmtree(self.tmp_workspace, ignore_errors=True) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_step3_perform_ocr_action.py b/tests/test_step3_perform_ocr_action.py new file mode 100644 index 00000000..31a55f44 --- /dev/null +++ b/tests/test_step3_perform_ocr_action.py @@ -0,0 +1,129 @@ +# tests/test_step3_perform_ocr_action.py +""" +Step 3 Verification: perform_ocr action layer tests. +Tests input validation, simulated mode, schema contract, +and bridge delegation — without making real VLM calls. +""" +import os +import pytest +from unittest.mock import patch, MagicMock + + +# ── Helpers ──────────────────────────────────────────────────────────────── + +def load_action(image_path: str, simulated: bool = False) -> dict: + """Import and invoke the action directly.""" + from app.data.action.perform_ocr import execute + return execute({"image_path": image_path, "simulated_mode": simulated}) + + +# ── 1. 
Input Validation ──────────────────────────────────────────────────── + +class TestInputValidation: + + def test_missing_image_path_key(self): + from app.data.action.perform_ocr import execute + result = execute({}) + assert result["status"] == "error" + assert "image_path" in result["message"].lower() + + def test_empty_image_path_string(self): + result = load_action("") + assert result["status"] == "error" + + def test_nonexistent_file_path(self): + result = load_action("/tmp/does_not_exist_12345.png") + assert result["status"] == "error" + assert "not found" in result["message"].lower() or \ + "does not exist" in result["message"].lower() or \ + result["status"] == "error" + + def test_path_is_directory_not_file(self, tmp_path): + result = load_action(str(tmp_path)) # directory, not a file + assert result["status"] == "error" + + +# ── 2. Simulated Mode ────────────────────────────────────────────────────── + +class TestSimulatedMode: + + def test_simulated_mode_returns_success(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + result = load_action(str(fake_image), simulated=True) + assert result["status"] == "success" + + def test_simulated_mode_makes_no_vlm_call(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr") as mock_ocr: + load_action(str(fake_image), simulated=True) + mock_ocr.assert_not_called() + + def test_simulated_mode_result_is_string(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + result = load_action(str(fake_image), simulated=True) + # In simulated mode, summary or message might be the string + assert isinstance(result.get("summary") or result.get("message"), str) + + +# ── 3. 
Schema Contract ───────────────────────────────────────────────────── + +class TestSchemaContract: + + def test_success_response_has_required_keys(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + mock_return = {"status": "success", "text": "Invoice #1234", "file_path": "/tmp/ocr.txt"} + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + return_value=mock_return): + result = load_action(str(fake_image)) + assert "status" in result + assert result["status"] in ("success", "error") + + def test_error_response_has_message(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + side_effect=RuntimeError("VLM unavailable")): + result = load_action(str(fake_image)) + assert result["status"] == "error" + assert "message" in result + assert len(result["message"]) > 0 + + def test_success_exposes_extracted_text(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + mock_return = {"status": "success", "text": "Hello World", "file_path": "/tmp/ocr.txt"} + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + return_value=mock_return): + result = load_action(str(fake_image)) + # The action must surface the text somewhere — either in result["text"], + # result["result"], or result["message"] + combined = str(result) + assert "Hello World" in combined + + +# ── 4. 
Bridge Delegation ─────────────────────────────────────────────────── + +class TestBridgeDelegation: + + def test_delegates_correct_image_path_to_bridge(self, tmp_path): + fake_image = tmp_path / "receipt.png" + fake_image.write_bytes(b"fake_png_bytes") + mock_return = {"status": "success", "text": "some text", "file_path": "/tmp/x.txt"} + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + return_value=mock_return) as mock_bridge: + load_action(str(fake_image)) + called_path = mock_bridge.call_args[0][0] + assert called_path == str(fake_image) + + def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): + fake_image = tmp_path / "test.png" + fake_image.write_bytes(b"fake_png_bytes") + with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", + side_effect=RuntimeError("InternalActionInterface not initialized with VLMInterface.")): + result = load_action(str(fake_image)) + assert result["status"] == "error" + assert "message" in result diff --git a/tests/test_step4_understand_video_action.py b/tests/test_step4_understand_video_action.py new file mode 100644 index 00000000..619dacc0 --- /dev/null +++ b/tests/test_step4_understand_video_action.py @@ -0,0 +1,116 @@ +# tests/test_step4_understand_video_action.py + +import pytest +from unittest.mock import patch + +def load_action(video_path: str, query: str = "", simulated: bool = False) -> dict: + from app.data.action.understand_video import execute + return execute({ + "video_path": video_path, + "query": query, + "simulated_mode": simulated, + }) + + +class TestInputValidation: + + def test_missing_video_path_key(self): + from app.data.action.understand_video import execute + result = execute({}) + assert result["status"] == "error" + assert "video_path" in result["message"].lower() + + def test_empty_video_path_string(self): + result = load_action("") + assert result["status"] == "error" + + def test_nonexistent_file_path(self): + result = 
load_action("/tmp/does_not_exist_98765.mp4") + assert result["status"] == "error" + + def test_path_is_directory_not_file(self, tmp_path): + result = load_action(str(tmp_path)) + assert result["status"] == "error" + + +class TestSimulatedMode: + + def test_simulated_mode_returns_success(self, tmp_path): + fake_video = tmp_path / "test.mp4" + fake_video.write_bytes(b"fake_video_bytes") + result = load_action(str(fake_video), simulated=True) + assert result["status"] == "success" + + def test_simulated_mode_makes_no_vlm_call(self, tmp_path): + fake_video = tmp_path / "test.mp4" + fake_video.write_bytes(b"fake_video_bytes") + with patch("app.internal_action_interface.InternalActionInterface.understand_video") as mock_bridge: + load_action(str(fake_video), simulated=True) + mock_bridge.assert_not_called() + + +class TestSchemaContract: + + def test_success_response_has_required_keys(self, tmp_path): + fake_video = tmp_path / "clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + mock_return = { + "status": "success", + "summary": "A person walks into a room.", + "preview": "A person walks...", + "file_path": "/tmp/video_summary.txt", + } + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + return_value=mock_return): + result = load_action(str(fake_video), query="What happens?") + + assert result["status"] == "success" + for key in ("summary", "file_path"): + assert key in result + + def test_error_response_has_message(self, tmp_path): + fake_video = tmp_path / "clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + side_effect=RuntimeError("VLM unavailable")): + result = load_action(str(fake_video)) + + assert result["status"] == "error" + assert "message" in result + assert len(result["message"]) > 0 + + +class TestBridgeDelegation: + + def test_delegates_correct_video_path_and_query(self, tmp_path): + fake_video = tmp_path / 
"scene.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + mock_return = { + "status": "success", + "summary": "Some summary", + "preview": "Some...", + "file_path": "/tmp/video_summary.txt", + } + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + return_value=mock_return) as mock_bridge: + load_action(str(fake_video), query="Who is present?") + + # Verify bridge call arguments + # In some versions of mock, call_args[0] is positional args + called_args = mock_bridge.call_args[0] + assert called_args[0] == str(fake_video) + assert mock_bridge.call_args[1].get('query') == "Who is present?" or called_args[1] == "Who is present?" + + def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): + fake_video = tmp_path / "scene.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + with patch("app.internal_action_interface.InternalActionInterface.understand_video", + side_effect=RuntimeError("InternalActionInterface not initialized with VLMInterface.")): + result = load_action(str(fake_video)) + + assert result["status"] == "error" + assert "message" in result From 5feaa80c1d80118ced849846a50b10a2b50747ad Mon Sep 17 00:00:00 2001 From: Korivi Date: Wed, 15 Apr 2026 10:48:47 +0900 Subject: [PATCH 08/30] CLI SKILLS Improvements --- skills/cli-anything/SKILL.md | 273 +++++++++++++++++------------------ 1 file changed, 131 insertions(+), 142 deletions(-) diff --git a/skills/cli-anything/SKILL.md b/skills/cli-anything/SKILL.md index 5194429b..e0b3b509 100644 --- a/skills/cli-anything/SKILL.md +++ b/skills/cli-anything/SKILL.md @@ -1,187 +1,176 @@ --- name: cli-anything -description: "Generate agent-native CLI harnesses for any GUI application using the CLI-Anything methodology, or discover and install pre-built CLIs via CLI-Hub." -metadata: {"clawdbot":{"emoji":"⚡","os":["darwin","linux","windows"],"requires":{"bins":["python"]}}} +description: "Use any supported GUI application (GIMP, Blender, LibreOffice, Audacity, OBS, etc.) 
on behalf of the user. Auto-installs the app and CLI harness, then executes the task directly." +action-sets: ["shell", "file_operations"] --- # CLI-Anything Skill -CLI-Anything transforms any GUI application into an agent-native command-line interface. Use this skill when the user asks to: -- Generate a CLI harness for any software (GIMP, Blender, LibreOffice, etc.) -- Install or discover CLIs via CLI-Hub -- Refine or test an existing generated harness +**Core rule: Do everything yourself. Never give the user a command to run. Never explain steps. Just execute the task and report the result.** --- -## Quick Install (CLI-Hub) - -For software that already has a pre-built harness: - -```bash -pip install cli-anything-hub -cli-hub install -``` - -Browse the full catalog: https://hkuds.github.io/CLI-Anything/ +## Supported Apps Reference + +Use this table to look up the correct names for every step. + +| App | cli-hub name | Windows (winget) | macOS (brew cask) | Linux (apt) | +|---|---|---|---|---| +| GIMP | `gimp` | `GIMP.GIMP` | `gimp` | `gimp` | +| Blender | `blender` | `BlenderFoundation.Blender` | `blender` | `blender` | +| Inkscape | `inkscape` | `Inkscape.Inkscape` | `inkscape` | `inkscape` | +| Audacity | `audacity` | `Audacity.Audacity` | `audacity` | `audacity` | +| OBS Studio | `obs` | `OBSProject.OBSStudio` | `obs` | `obs-studio` | +| Kdenlive | `kdenlive` | `KDE.Kdenlive` | `kdenlive` | `kdenlive` | +| Shotcut | `shotcut` | `Meltytech.Shotcut` | `shotcut` | `shotcut` | +| Krita | `krita` | `KDE.Krita` | `krita` | `krita` | +| LibreOffice | `libreoffice` | `TheDocumentFoundation.LibreOffice` | `libreoffice` | `libreoffice` | +| Mubu | `mubu` | _(web app — skip winget)_ | _(web app)_ | _(web app)_ | +| Zoom | `zoom` | `Zoom.Zoom` | `zoom` | `zoom` | +| Draw.io | `draw-io` | `JGraph.Draw` | `drawio` | _(AppImage)_ | +| Mermaid | `mermaid` | `OpenJS.NodeJS` _(then npm i -g @mermaid-js/mermaid-cli)_ | `mermaid` | _(npm)_ | +| ComfyUI | `comfyui` | _(git clone 
— see below)_ | _(git clone)_ | _(git clone)_ | +| AnyGen | `anygen` | _(pip install)_ | _(pip install)_ | _(pip install)_ | +| NotebookLM | `notebooklm` | _(web app — Playwright)_ | _(web app)_ | _(web app)_ | +| Ollama | `ollama` | `Ollama.Ollama` | `ollama` | _(curl install)_ | +| AdGuard Home | `adguard-home` | `AdGuard.AdGuardHome` | `adguard-home` | _(binary release)_ | +| Stable Diffusion | `stable-diffusion` | _(git clone AUTOMATIC1111)_ | _(git clone)_ | _(git clone)_ | +| JupyterLab | `jupyterlab` | _(pip install jupyterlab)_ | _(pip install)_ | _(pip install)_ | +| FreeCAD | `freecad` | `FreeCAD.FreeCAD` | `freecad` | `freecad` | +| QGIS | `qgis` | `OSGeo.QGIS` | `qgis` | `qgis` | +| Grafana | `grafana` | `GrafanaLabs.Grafana` | `grafana` | `grafana` | +| Gitea | `gitea` | `Gitea.Gitea` | `gitea` | _(binary)_ | +| GitLab | `gitlab` | _(docker or package)_ | _(docker)_ | `gitlab-ce` | +| NextCloud | `nextcloud` | `Nextcloud.NextcloudDesktop` | `nextcloud` | _(snap/docker)_ | +| Jenkins | `jenkins` | `Jenkins.Jenkins` | `jenkins` | `jenkins` | --- -## Generate a New CLI Harness - -Follow the **7-Phase Methodology** below. Work sequentially — each phase depends on the prior. +## Execution Flow (follow every time — use EXACT timeouts listed) -### Phase 1 — Codebase Analysis +**CRITICAL: Always pass the timeout shown below to run_shell. Never use the default (30s). 
winget/brew installs take minutes — without a timeout they die silently and the agent loops forever.** -Before writing code, study the target application: - -``` -- Identify the backend engine (separate from the GUI presentation layer) -- Map each GUI action to its underlying API or Python call -- Understand the data model and native file formats (e.g., .blend, ODF, SVG) -- Locate any existing CLI entry points or scripting interfaces -- Catalog the undo/redo and session management system +### Step 1 — Detect OS +Run with `timeout: 10`: +```bash +python -c "import platform; print(platform.system())" ``` +Result: `Windows`, `Darwin`, or `Linux`. -### Phase 2 — CLI Architecture Design - -Choose one of: -- **Stateful REPL** — for interactive, session-based workflows -- **Subcommand CLI** — for scriptable, one-shot invocations -- **Both** — recommended; REPL wraps the subcommand interface - -Design command groups that mirror the app's logical domains (e.g., `image`, `layer`, `export` for GIMP). Plan dual output: human-readable text and machine-readable `--json`. - -### Phase 3 — Implementation - -Directory layout: -``` -cli_anything/ # Namespace package — NO __init__.py here -└── / # Sub-package — HAS __init__.py - ├── __main__.py - ├── README.md - ├── _cli.py - ├── core/ # Domain modules wrapping the real software - ├── utils/ # Shared utilities + repl_skin.py - └── tests/ - ├── TEST.md - ├── test_core.py - └── test_full_e2e.py +### Step 2 — Check if the app is installed +Run with `timeout: 10`: +```bash +gimp --version # or blender --version, libreoffice --version, etc. ``` +- Exit 0 → already installed → skip to Step 4 +- Exit non-zero → not installed → go to Step 3 -**Critical rule**: The CLI MUST call the actual software for rendering and export — never reimplement the software's functionality in Python. Generate valid native project files and hand them to the real application backend. 
- -Required patterns for every command: -- `--json` flag for machine-readable output -- Fail loudly with unambiguous error messages -- Introspection commands (`info`, `list`, `status`) for state inspection - -Use the unified REPL skin (`repl_skin.py` from `cli-anything-plugin/repl_skin.py`) so all generated CLIs share a consistent interface. - -### Phase 4 — Test Planning (write TEST.md Part 1) - -Before any test code, document in `tests/TEST.md`: -- Test inventory and what each test covers -- Unit test plans (synthetic data, no external deps) -- E2E test plans (real software backend invoked) -- Realistic end-to-end workflow scenarios - -### Phase 5 — Test Implementation - -Four layers, all required: -1. **Unit tests** — synthetic data, deterministic, fast -2. **E2E native tests** — verify project file generation and structure -3. **E2E backend tests** — invoke the real software, check output exists with correct format (magic bytes, ZIP structure, pixel analysis, etc.) -4. **CLI subprocess tests** — install the CLI entry point, run full workflows end-to-end - -**Never assume an export is correct because it ran without errors.** Validate outputs programmatically and print artifact paths for manual inspection. - -### Phase 6 — Test Documentation (write TEST.md Part 2) - -Append full `pytest` output and summary statistics to `TEST.md`. - -### Phase 6.5 — SKILL.md Generation - -Create `cli_anything//skills/SKILL.md` with: -- YAML frontmatter for agent discovery (`name`, `description`, `tags`, `requires`) -- All command groups and subcommands -- Usage examples for common workflows -- Agent-specific guidance for `--json` output and error handling - -The REPL should print the absolute path to `SKILL.md` on startup so agents can find it. - -### Phase 7 — Package & Install +### Step 3 — Install the app (ONE attempt only — never retry install) +**Windows** — run with `timeout: 600`: ```bash -# setup.py uses PEP 420 namespace packaging -cd cli_anything/ -pip install -e . 
+winget install --id --silent --accept-package-agreements --accept-source-agreements +``` -# Verify the CLI is on PATH -which cli-anything- -cli-anything- --help +**macOS** — run with `timeout: 600`: +```bash +brew install --cask ``` -Publish to PyPI when ready: +**Linux** — run with `timeout: 300`: ```bash -python -m build -twine upload dist/* +sudo apt-get install -y ``` ---- +**Special cases:** +- ComfyUI / Stable Diffusion: `git clone` + `pip install -r requirements.txt` — `timeout: 600` +- Mermaid: `npm install -g @mermaid-js/mermaid-cli` — `timeout: 120` +- JupyterLab / AnyGen: `pip install ` — `timeout: 120` +- Web apps (Mubu, NotebookLM): no install needed — use `playwright-mcp` +- Ollama on Linux: `curl -fsSL https://ollama.com/install.sh | sh` — `timeout: 300` -## Using a Generated CLI +After install, re-run Step 2 check once (`timeout: 10`). If still fails → tell the user, stop completely. +### Step 4 — Check if CLI harness is installed +Run with `timeout: 10`: ```bash -# Interactive REPL (default when no subcommand given) -cli-anything- +cli-anything- --version +``` +- Found → skip to Step 6 +- Not found → go to Step 5 -# One-shot subcommand with JSON output for agent consumption -cli-anything- --json [args] +### Step 5 — Install CLI harness (ONE attempt only) -# Help -cli-anything- --help -cli-anything- --help +**Always try CLI-Hub first** — run with `timeout: 120`: +```bash +pip install cli-anything-hub --quiet && cli-hub install ``` ---- +If CLI-Hub fails → generate a minimal harness with `write_file` (a Click CLI wrapping the app's real scripting API), then run with `timeout: 60`: +```bash +pip install -e cli_anything/ --quiet +``` -## Refining an Existing Harness +If harness install also fails → tell the user, stop completely. 
-After initial generation, run a gap analysis: +### Step 6 — Execute the user's task +Run with `timeout: 300` (or `timeout: 600` for renders/exports): ```bash -# Broad refinement -/cli-anything:refine ./ - -# Focused refinement on specific capabilities -/cli-anything:refine ./ "batch processing and filters" +# Image editing +cli-anything-gimp image resize input.jpg output.jpg 1920 1080 +cli-anything-gimp filter blur input.jpg --radius 3 --output out.jpg +cli-anything-gimp export input.xcf output.png + +# 3D / rendering +cli-anything-blender render scene.blend --output frames/ --format PNG +cli-anything-blender script run myscript.py scene.blend + +# Vector +cli-anything-inkscape export logo.svg logo.png --dpi 300 +cli-anything-inkscape convert input.svg output.pdf + +# Audio +cli-anything-audacity trim audio.mp3 output.mp3 --start 0 --end 30 +cli-anything-audacity export-mp3 project.aup3 output.mp3 + +# Video +cli-anything-kdenlive render project.kdenlive output.mp4 +cli-anything-shotcut render project.mlt output.mp4 + +# Office +cli-anything-libreoffice convert doc.docx output.pdf +cli-anything-libreoffice calc run macro.py spreadsheet.xlsx + +# Diagrams +cli-anything-draw-io export diagram.drawio output.png +cli-anything-mermaid render diagram.mmd output.png + +# AI / ML +cli-anything-comfyui run workflow.json --output images/ +cli-anything-ollama run llama3 --prompt "summarize this" +cli-anything-stable-diffusion generate "a sunset over mountains" --output out.png + +# Dev / Infra +cli-anything-jupyterlab execute notebook.ipynb --output result.ipynb +cli-anything-grafana export-dashboard my-dashboard dashboard.json +cli-anything-gitea create-repo myrepo --private ``` -Then re-run tests: `/cli-anything:test ` - ---- +**Always run the task. Never print commands and ask the user to run them.** -## Supported Applications (Pre-built) +If the task command fails → retry once with adjusted args. If it fails again → report the error and stop. 
-CLI-Anything has verified harnesses for 26+ applications: - -| Category | Applications | -|---|---| -| Creative | GIMP, Blender, Inkscape, Krita, MuseScore | -| Office | LibreOffice, Zotero | -| Media | Audacity, OBS Studio, Kdenlive, Shotcut, VideoCaptioner | -| Diagramming | Draw.io, Mermaid | -| AI/ML | ComfyUI, Ollama, NotebookLM | -| Web/Cloud | Zoom, AdGuard Home, Exa | -| Dev Tools | Godot Engine, RenderDoc | +### Step 7 — Report result +One or two sentences only: +> "Done — rendered `output.mp4` from your Kdenlive project." +> "Converted `report.docx` to PDF at `report.pdf`." --- -## Architecture Pitfalls - -**The Rendering Gap** — project files may reference filters/effects that simple file readers ignore. Solution priority: -1. Use the app's native renderer -2. Build a translation layer for effect conversion -3. Generate a render script as fallback +## Hard Stop Rules (prevents infinite loops) -**Testing with missing software** — tests MUST NOT skip or fake results when the target software is missing. They should fail loudly so the absence is visible. +- **Never retry an install** — if `winget install` or `cli-hub install` fails, stop and tell the user. +- **Never loop on a timeout** — if a command times out once, it will time out again. Stop immediately. +- **Max 1 retry on the task command (Step 6) only** — not on installs. +- **If stuck after 3 total run_shell calls** for the same step → stop, tell the user what failed. 
From c49fe5690426c5c4463e95d1b45265b76e806193 Mon Sep 17 00:00:00 2001 From: Korivi Date: Wed, 15 Apr 2026 11:54:08 +0900 Subject: [PATCH 09/30] cli anything help guid added --- skills/cli-anything/SKILL.md | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/skills/cli-anything/SKILL.md b/skills/cli-anything/SKILL.md index e0b3b509..c35e10d0 100644 --- a/skills/cli-anything/SKILL.md +++ b/skills/cli-anything/SKILL.md @@ -10,6 +10,84 @@ action-sets: ["shell", "file_operations"] --- +## Help Response (no tools needed — just reply with text) + +If the user's message matches any of these (case-insensitive, any wording): +- "cli anything help" / "cli-anything help" / "cli help" +- "what apps does cli-anything support" / "what can cli-anything do" +- "show cli apps" / "cli anything guide" / "list cli apps" +- Any variation asking what CLI-Anything can do or which apps are supported + +**Do not run any tools. Reply directly with this message:** + +--- + +**CLI-Anything — What I Can Do** + +Just tell me what you want done in plain English. I'll auto-install the app if it's not on your system and complete the task for you — you never need to run any commands yourself. 
+ +**Creative & Media** +| App | Example prompt | +|---|---| +| GIMP | "Resize photo.jpg to 1920×1080 and save as photo_hd.jpg" | +| Blender | "Render scene.blend to PNG frames in the frames/ folder" | +| Inkscape | "Export logo.svg as a 300 DPI PNG" | +| Krita | "Export painting.kra as PNG" | +| Audacity | "Trim the first 30 seconds from audio.mp3 and save as clip.mp3" | +| OBS Studio | "Record my screen for 60 seconds" | +| Kdenlive | "Render project.kdenlive to MP4" | +| Shotcut | "Render project.mlt to MP4" | + +**Office & Productivity** +| App | Example prompt | +|---|---| +| LibreOffice | "Convert report.docx to PDF" / "Run a macro on spreadsheet.xlsx" | +| Mubu | "Open my outline in Mubu" | + +**Communication** +| App | Example prompt | +|---|---| +| Zoom | "Start a Zoom meeting" | + +**Diagramming** +| App | Example prompt | +|---|---| +| Draw.io | "Export diagram.drawio as PNG" | +| Mermaid | "Render this diagram to PNG: graph TD; A-->B; B-->C" | + +**AI & ML** +| App | Example prompt | +|---|---| +| ComfyUI | "Run workflow.json and save images to output/" | +| AnyGen | "Generate content using AnyGen" | +| NotebookLM | "Summarize this PDF using NotebookLM" | +| Ollama | "Run llama3 and summarize this text: ..." | +| Stable Diffusion | "Generate 'a sunset over mountains' and save as out.png" | + +**Dev & Infrastructure** +| App | Example prompt | +|---|---| +| JupyterLab | "Execute notebook.ipynb and save the output" | +| Grafana | "Export my dashboard as JSON" | +| Gitea | "Create a private repo called myrepo on Gitea" | +| GitLab | "Create a new project on GitLab" | +| NextCloud | "Sync my files to NextCloud" | +| Jenkins | "Trigger my build pipeline" | +| AdGuard Home | "Set up network-wide ad blocking with AdGuard Home" | + +**GIS & Design** +| App | Example prompt | +|---|---| +| FreeCAD | "Export model.fcstd as STL" | +| QGIS | "Export map.qgz as PNG" | + +**Tips:** +- Always give me the full file path (e.g. 
`C:\Users\you\Desktop\photo.jpg`) +- If the app isn't installed, I'll install it automatically — just wait a few minutes +- I never ask you to run commands yourself — I do everything for you + +--- + ## Supported Apps Reference Use this table to look up the correct names for every step. From ec3dc6763468b3b8a583997dacb9143a017bc4ac Mon Sep 17 00:00:00 2001 From: Korivi Date: Thu, 16 Apr 2026 15:18:53 +0900 Subject: [PATCH 10/30] CLI Skill updated --- skills/cli-anything/SKILL.md | 169 ++++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 61 deletions(-) diff --git a/skills/cli-anything/SKILL.md b/skills/cli-anything/SKILL.md index c35e10d0..58309cc0 100644 --- a/skills/cli-anything/SKILL.md +++ b/skills/cli-anything/SKILL.md @@ -10,6 +10,29 @@ action-sets: ["shell", "file_operations"] --- +## FORBIDDEN — Never Do These (causes bugs on all platforms) + +These patterns are strictly banned. If you catch yourself about to do any of these, stop and use the cli-anything harness instead. 
+ +| ❌ FORBIDDEN | ✅ CORRECT | +|---|---| +| `soffice.exe --headless --convert-to pdf ...` | `cli-anything-libreoffice convert doc.docx output.pdf` | +| `cd "C:\Program Files\LibreOffice\program" && soffice.exe ...` | `cli-anything-libreoffice convert doc.docx output.pdf` | +| `gimp --batch-interpreter=script-fu-use-v2 ...` | `cli-anything-gimp image resize input.jpg output.jpg 1920 1080` | +| `blender --background scene.blend --render-output ...` | `cli-anything-blender render scene.blend --output frames/ --format PNG` | +| `inkscape --export-type=png logo.svg` | `cli-anything-inkscape export logo.svg logo.png --dpi 300` | +| Chaining with `&&`: `cmd1 && cmd2` | Two separate `run_shell` calls | +| Any `.exe` extension in a command | No `.exe` — harness is cross-platform | +| Hardcoded paths like `C:\Program Files\...` | Use the harness — it finds the app automatically | + +**Why these are banned:** +- `.exe` only exists on Windows — breaks on macOS and Linux +- `C:\Program Files\...` paths break on macOS and Linux +- `&&` chaining breaks in PowerShell on Windows +- Raw app CLIs require knowing app-specific flags — the harness handles all of that + +--- + ## Help Response (no tools needed — just reply with text) If the user's message matches any of these (case-insensitive, any wording): @@ -27,64 +50,65 @@ If the user's message matches any of these (case-insensitive, any wording): Just tell me what you want done in plain English. I'll auto-install the app if it's not on your system and complete the task for you — you never need to run any commands yourself. 
**Creative & Media** -| App | Example prompt | -|---|---| -| GIMP | "Resize photo.jpg to 1920×1080 and save as photo_hd.jpg" | -| Blender | "Render scene.blend to PNG frames in the frames/ folder" | -| Inkscape | "Export logo.svg as a 300 DPI PNG" | -| Krita | "Export painting.kra as PNG" | -| Audacity | "Trim the first 30 seconds from audio.mp3 and save as clip.mp3" | -| OBS Studio | "Record my screen for 60 seconds" | -| Kdenlive | "Render project.kdenlive to MP4" | -| Shotcut | "Render project.mlt to MP4" | +| App | What I can do | Example prompt | +|---|---|---| +| GIMP | Resize, crop, blur, convert, export images | "Resize photo.jpg to 1920×1080 and save as photo_hd.jpg" | +| Blender | Render 3D scenes, run scripts, export models | "Render scene.blend to PNG frames in the frames/ folder" | +| Inkscape | Export SVG to PNG/PDF, convert vector files | "Export logo.svg as a 300 DPI PNG" | +| Krita | Export paintings, batch convert images | "Export painting.kra as PNG" | +| Audacity | Trim, export, convert audio files | "Trim the first 30 seconds from audio.mp3 and save as clip.mp3" | +| OBS Studio | Record screen, stream | "Record my screen for 60 seconds" | +| Kdenlive | Render video projects to MP4/MKV | "Render project.kdenlive to MP4" | +| Shotcut | Render video projects to MP4 | "Render project.mlt to MP4" | **Office & Productivity** -| App | Example prompt | -|---|---| -| LibreOffice | "Convert report.docx to PDF" / "Run a macro on spreadsheet.xlsx" | -| Mubu | "Open my outline in Mubu" | +| App | What I can do | Example prompt | +|---|---|---| +| LibreOffice | Convert DOCX/XLSX/PPTX to PDF, run macros | "Convert report.docx to PDF" | +| Mubu | Manage knowledge outlines | "Open my outline in Mubu" | **Communication** -| App | Example prompt | -|---|---| -| Zoom | "Start a Zoom meeting" | +| App | What I can do | Example prompt | +|---|---|---| +| Zoom | Start/join meetings | "Start a Zoom meeting" | **Diagramming** -| App | Example prompt | -|---|---| -| 
Draw.io | "Export diagram.drawio as PNG" | -| Mermaid | "Render this diagram to PNG: graph TD; A-->B; B-->C" | +| App | What I can do | Example prompt | +|---|---|---| +| Draw.io | Export diagrams to PNG/SVG/PDF | "Export diagram.drawio as PNG" | +| Mermaid | Render diagram code to PNG | "Render this diagram to PNG: graph TD; A-->B; B-->C" | **AI & ML** -| App | Example prompt | -|---|---| -| ComfyUI | "Run workflow.json and save images to output/" | -| AnyGen | "Generate content using AnyGen" | -| NotebookLM | "Summarize this PDF using NotebookLM" | -| Ollama | "Run llama3 and summarize this text: ..." | -| Stable Diffusion | "Generate 'a sunset over mountains' and save as out.png" | +| App | What I can do | Example prompt | +|---|---|---| +| ComfyUI | Run AI image generation workflows | "Run workflow.json and save images to output/" | +| AnyGen | Generate AI content | "Generate content using AnyGen" | +| NotebookLM | AI research and summarization | "Summarize this PDF using NotebookLM" | +| Ollama | Run local LLM inference | "Run llama3 and summarize this text: ..." 
| +| Stable Diffusion | Generate images from text prompts | "Generate 'a sunset over mountains' and save as out.png" | **Dev & Infrastructure** -| App | Example prompt | -|---|---| -| JupyterLab | "Execute notebook.ipynb and save the output" | -| Grafana | "Export my dashboard as JSON" | -| Gitea | "Create a private repo called myrepo on Gitea" | -| GitLab | "Create a new project on GitLab" | -| NextCloud | "Sync my files to NextCloud" | -| Jenkins | "Trigger my build pipeline" | -| AdGuard Home | "Set up network-wide ad blocking with AdGuard Home" | - -**GIS & Design** -| App | Example prompt | -|---|---| -| FreeCAD | "Export model.fcstd as STL" | -| QGIS | "Export map.qgz as PNG" | +| App | What I can do | Example prompt | +|---|---|---| +| JupyterLab | Execute notebooks, save output | "Execute notebook.ipynb and save the output" | +| Grafana | Export dashboards | "Export my dashboard as JSON" | +| Gitea | Create repos, manage git hosting | "Create a private repo called myrepo on Gitea" | +| GitLab | Create projects, manage CI/CD | "Create a new project on GitLab" | +| NextCloud | Sync files, manage cloud storage | "Sync my files to NextCloud" | +| Jenkins | Trigger build pipelines | "Trigger my build pipeline" | +| AdGuard Home | Set up network-wide ad blocking | "Set up network-wide ad blocking with AdGuard Home" | + +**GIS & 3D Design** +| App | What I can do | Example prompt | +|---|---|---| +| FreeCAD | Export 3D models to STL/STEP | "Export model.fcstd as STL" | +| QGIS | Export maps to PNG/PDF | "Export map.qgz as PNG" | **Tips:** - Always give me the full file path (e.g. `C:\Users\you\Desktop\photo.jpg`) - If the app isn't installed, I'll install it automatically — just wait a few minutes - I never ask you to run commands yourself — I do everything for you +- Works on Windows, macOS, and Linux --- @@ -128,35 +152,39 @@ Use this table to look up the correct names for every step. **CRITICAL: Always pass the timeout shown below to run_shell. 
Never use the default (30s). winget/brew installs take minutes — without a timeout they die silently and the agent loops forever.** +**CRITICAL: Never chain commands with `&&` or `;` in a single run_shell call. Use one separate run_shell call per command.** + ### Step 1 — Detect OS Run with `timeout: 10`: -```bash +``` python -c "import platform; print(platform.system())" ``` Result: `Windows`, `Darwin`, or `Linux`. ### Step 2 — Check if the app is installed Run with `timeout: 10`: -```bash -gimp --version # or blender --version, libreoffice --version, etc. ``` +gimp --version +``` +(replace with the correct app: `blender --version`, `libreoffice --version`, etc.) + - Exit 0 → already installed → skip to Step 4 - Exit non-zero → not installed → go to Step 3 ### Step 3 — Install the app (ONE attempt only — never retry install) **Windows** — run with `timeout: 600`: -```bash +``` winget install --id --silent --accept-package-agreements --accept-source-agreements ``` **macOS** — run with `timeout: 600`: -```bash +``` brew install --cask ``` **Linux** — run with `timeout: 300`: -```bash +``` sudo apt-get install -y ``` @@ -171,7 +199,7 @@ After install, re-run Step 2 check once (`timeout: 10`). 
If still fails → tell ### Step 4 — Check if CLI harness is installed Run with `timeout: 10`: -```bash +``` cli-anything- --version ``` - Found → skip to Step 6 @@ -180,44 +208,56 @@ cli-anything- --version ### Step 5 — Install CLI harness (ONE attempt only) **Always try CLI-Hub first** — run with `timeout: 120`: -```bash -pip install cli-anything-hub --quiet && cli-hub install ``` +pip install cli-anything-hub --quiet +``` +Then run with `timeout: 120`: +``` +cli-hub install +``` +(Two separate run_shell calls — do NOT chain with &&) If CLI-Hub fails → generate a minimal harness with `write_file` (a Click CLI wrapping the app's real scripting API), then run with `timeout: 60`: -```bash +``` pip install -e cli_anything/ --quiet ``` If harness install also fails → tell the user, stop completely. -### Step 6 — Execute the user's task +### Step 6 — Execute the user's task using the CLI harness ONLY + +**MANDATORY: Use ONLY `cli-anything-` commands. Never call soffice, gimp, blender, or any app binary directly.** + Run with `timeout: 300` (or `timeout: 600` for renders/exports): -```bash -# Image editing +``` +# Image editing — GIMP cli-anything-gimp image resize input.jpg output.jpg 1920 1080 cli-anything-gimp filter blur input.jpg --radius 3 --output out.jpg cli-anything-gimp export input.xcf output.png -# 3D / rendering +# 3D / rendering — Blender cli-anything-blender render scene.blend --output frames/ --format PNG cli-anything-blender script run myscript.py scene.blend -# Vector +# Vector — Inkscape cli-anything-inkscape export logo.svg logo.png --dpi 300 cli-anything-inkscape convert input.svg output.pdf -# Audio +# Painting — Krita +cli-anything-krita export painting.kra output.png + +# Audio — Audacity cli-anything-audacity trim audio.mp3 output.mp3 --start 0 --end 30 cli-anything-audacity export-mp3 project.aup3 output.mp3 -# Video +# Video — Kdenlive / Shotcut cli-anything-kdenlive render project.kdenlive output.mp4 cli-anything-shotcut render project.mlt 
output.mp4 -# Office +# Office — LibreOffice (NEVER use soffice.exe directly) cli-anything-libreoffice convert doc.docx output.pdf +cli-anything-libreoffice convert spreadsheet.xlsx output.pdf cli-anything-libreoffice calc run macro.py spreadsheet.xlsx # Diagrams @@ -233,6 +273,10 @@ cli-anything-stable-diffusion generate "a sunset over mountains" --output out.pn cli-anything-jupyterlab execute notebook.ipynb --output result.ipynb cli-anything-grafana export-dashboard my-dashboard dashboard.json cli-anything-gitea create-repo myrepo --private + +# GIS / Design +cli-anything-freecad export model.fcstd output.stl +cli-anything-qgis export map.qgz output.png ``` **Always run the task. Never print commands and ask the user to run them.** @@ -252,3 +296,6 @@ One or two sentences only: - **Never loop on a timeout** — if a command times out once, it will time out again. Stop immediately. - **Max 1 retry on the task command (Step 6) only** — not on installs. - **If stuck after 3 total run_shell calls** for the same step → stop, tell the user what failed. +- **Never use `&&` or `;` to chain commands** — always use separate run_shell calls. +- **Never use `.exe` extensions** — use the cli-anything harness which is cross-platform. +- **Never hardcode app installation paths** — use the harness, it resolves the path automatically. 
\ No newline at end of file From fa0284eb092495e3c171d6ad5cb1e4f40196927e Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 18:05:29 +0530 Subject: [PATCH 11/30] improvement: use Gemini native video API as primary path in understand_video, OpenCV as fallback --- app/data/action/understand_video.py | 43 ++++++++++++++- tests/test_step4_understand_video_action.py | 61 +++++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index d40b4dfb..e4c5c77d 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -2,9 +2,10 @@ @action( name="understand_video", - description="Analyses a video file by sampling keyframes and generating a narrative summary using a Vision Language Model. Use when the user shares a video and wants to know what happens in it, extract visible text, or answer a specific question about video content.", + description="Uses Gemini 1.5 Pro for native video understanding when a Google API key is configured. 
Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", action_sets=["document_processing, image"], + requirement=["google-generativeai"], input_schema={ "video_path": { "type": "string", @@ -79,6 +80,46 @@ def understand_video(input_data: dict) -> dict: if not os.path.isfile(video_path): return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} + from app.config import get_api_key + api_key = get_api_key('gemini') + + if api_key: + try: + import google.generativeai as genai + genai.configure(api_key=api_key) + import time + from datetime import datetime + from app.config import AGENT_WORKSPACE_ROOT + + video_file = genai.upload_file(path=video_path) + + while video_file.state.name == "PROCESSING": + time.sleep(2) + video_file = genai.get_file(video_file.name) + + model = genai.GenerativeModel("gemini-1.5-pro") + prompt = query if query else "Understand and describe the contents of this video." + response = model.generate_content([video_file, prompt]) + + genai.delete_file(video_file.name) + + full_text = response.text + ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"video_summary_{ts}.txt") + with open(out_path, "w", encoding="utf-8") as f: + f.write(full_text) + + return { + 'status': 'success', + 'summary': full_text[:500] + ("..." 
if len(full_text) > 500 else ""), + 'file_path': out_path, + 'file_saved': True, + 'message': '' + } + except Exception as e: + # Fall through to fallback path if Gemini native path fails + pass + try: import app.internal_action_interface as iai result = iai.InternalActionInterface.understand_video(video_path, query=query, max_frames=max_frames) diff --git a/tests/test_step4_understand_video_action.py b/tests/test_step4_understand_video_action.py index 619dacc0..77883701 100644 --- a/tests/test_step4_understand_video_action.py +++ b/tests/test_step4_understand_video_action.py @@ -114,3 +114,64 @@ def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): assert result["status"] == "error" assert "message" in result + + +class TestPrimaryGeminiPath: + + @patch("app.config.get_api_key") + @patch("google.generativeai.upload_file") + @patch("google.generativeai.GenerativeModel") + @patch("google.generativeai.delete_file") + def test_gemini_path_success(self, mock_delete, mock_generative_model, mock_upload, mock_get_api_key, tmp_path): + from unittest.mock import MagicMock + mock_get_api_key.return_value = "fake_google_key" + + mock_file = MagicMock() + mock_file.name = "fake_video_name" + mock_file.state.name = "ACTIVE" + mock_upload.return_value = mock_file + + mock_model_instance = MagicMock() + mock_response = MagicMock() + mock_response.text = "This is a native Gemini summary of the video. 
" * 20 + mock_model_instance.generate_content.return_value = mock_response + mock_generative_model.return_value = mock_model_instance + + fake_video = tmp_path / "gemini_clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + with patch("app.config.AGENT_WORKSPACE_ROOT", str(tmp_path)): + result = load_action(str(fake_video), query="What happens?") + + assert result["status"] == "success" + assert "native Gemini summary" in result["summary"] + assert result["file_saved"] is True + + mock_upload.assert_called_once() + mock_model_instance.generate_content.assert_called_once() + mock_delete.assert_called_once_with(mock_file.name) + + @patch("app.config.get_api_key") + @patch("app.internal_action_interface.InternalActionInterface.understand_video") + def test_fallback_path_triggered(self, mock_bridge, mock_get_api_key, tmp_path): + mock_get_api_key.return_value = None + + mock_return = { + "status": "success", + "summary": "Fallback summary", + "file_path": "/tmp/fallback.txt", + "file_saved": True, + "message": "" + } + mock_bridge.return_value = mock_return + + fake_video = tmp_path / "fallback_clip.mp4" + fake_video.write_bytes(b"fake_video_bytes") + + result = load_action(str(fake_video), query="Fallback query") + + assert result["status"] == "success" + assert result["summary"] == "Fallback summary" + mock_bridge.assert_called_once() + called_args = mock_bridge.call_args[0] + assert called_args[0] == str(fake_video) From 77d4d878eedb1e89dfd5edba98555edc653a1cc2 Mon Sep 17 00:00:00 2001 From: Korivi Date: Thu, 16 Apr 2026 23:52:54 +0900 Subject: [PATCH 12/30] Major Issues are fixed - Install Issues on Mac fixed - Python compatibility & Syntax Issues fixed - CLI skills updated - Local LLM compatibility Issues fixed - Image action error fixed --- agent_core/core/embedding_interface.py | 2 + agent_core/core/impl/action/router.py | 2 + agent_core/core/impl/llm/interface.py | 6 +- agent_core/core/impl/vlm/interface.py | 52 +++--- agent_core/core/registry/action.py | 
2 + agent_core/core/registry/context.py | 2 + agent_core/core/registry/database.py | 2 + agent_core/core/registry/event_stream.py | 2 + agent_core/core/registry/llm.py | 2 + agent_core/core/registry/memory.py | 2 + agent_core/core/registry/state.py | 2 + agent_core/core/registry/task_manager.py | 2 + agent_core/decorators/log_events.py | 2 + agent_core/decorators/profiler.py | 2 + app/config/skills_config.json | 2 +- app/gui/gui_module.py | 2 + app/internal_action_interface.py | 2 + app/security/prompt_sanitizer.py | 2 + app/ui_layer/local_llm_setup.py | 2 + install.py | 151 +++++++++++++++++- skills/cli-anything/SKILL.md | 130 ++++++++++----- skills/docx/scripts/comment.py | 2 + skills/docx/scripts/office/pack.py | 2 + .../nano-banana-pro/scripts/generate_image.py | 2 + skills/ontology/scripts/ontology.py | 2 + skills/pptx/scripts/office/pack.py | 2 + skills/tesla-api/scripts/tesla.py | 2 + skills/xlsx/scripts/office/pack.py | 2 + 28 files changed, 316 insertions(+), 71 deletions(-) diff --git a/agent_core/core/embedding_interface.py b/agent_core/core/embedding_interface.py index b9894cbd..17acfa99 100644 --- a/agent_core/core/embedding_interface.py +++ b/agent_core/core/embedding_interface.py @@ -12,6 +12,8 @@ - GOOGLE_API_KEY (for provider="gemini") """ +from __future__ import annotations + import os from typing import List, Optional diff --git a/agent_core/core/impl/action/router.py b/agent_core/core/impl/action/router.py index 12f1fef9..210c2458 100644 --- a/agent_core/core/impl/action/router.py +++ b/agent_core/core/impl/action/router.py @@ -6,6 +6,8 @@ based on user queries using LLM reasoning. 
""" +from __future__ import annotations + import json import ast from typing import Optional, List, Dict, Any, Tuple diff --git a/agent_core/core/impl/llm/interface.py b/agent_core/core/impl/llm/interface.py index 5114cfae..3f201d9e 100644 --- a/agent_core/core/impl/llm/interface.py +++ b/agent_core/core/impl/llm/interface.py @@ -1239,22 +1239,24 @@ def _generate_ollama(self, system_prompt: str | None, user_prompt: str) -> Dict[ try: payload = { "model": self.model, - "system": system_prompt, "prompt": user_prompt, "stream": False, + "format": "json", "options": { "temperature": self.temperature, } } + if system_prompt: + payload["system"] = system_prompt url: str = f"{self.remote_url.rstrip('/')}/api/generate" response = requests.post(url, json=payload, timeout=600) response.raise_for_status() result = response.json() content = result.get("response", "").strip() - total_tokens = result.get("usage", {}).get("total_tokens", 0) token_count_input = result.get("prompt_eval_count", 0) token_count_output = result.get("eval_count", 0) + total_tokens = token_count_input + token_count_output status = "success" except Exception as exc: exc_obj = exc diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index dce58675..927bd8e6 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -233,7 +233,9 @@ def describe_image_bytes( if log_response: logger.info(f"[LLM SEND] system={system_prompt} | user={user_prompt}") - if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): + if self.provider == "deepseek": + raise RuntimeError("DeepSeek does not support vision/VLM. 
Use a different provider for image description.") + elif self.provider in ("openai", "minimax", "moonshot", "grok"): response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) elif self.provider == "remote": response = self._ollama_describe_bytes(image_bytes, system_prompt, user_prompt) @@ -288,6 +290,17 @@ async def generate_response_async( # ───────────────────── Provider Helpers ───────────────────── + @staticmethod + def _detect_mime_type(image_bytes: bytes) -> str: + """Detect image MIME type from the first few bytes of image data.""" + if image_bytes[:8] == b'\x89PNG\r\n\x1a\n': + return "image/png" + if image_bytes[:4] == b'GIF8': + return "image/gif" + if image_bytes[:4] == b'RIFF' and image_bytes[8:12] == b'WEBP': + return "image/webp" + return "image/jpeg" + def _report_usage_async( self, service_type: str, @@ -318,8 +331,9 @@ def _report_usage_async( logger.warning(f"[VLM] Failed to report usage: {e}") def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: - """OpenAI vision request with automatic prompt caching metrics.""" + """OpenAI/Grok vision request with automatic prompt caching metrics.""" img_b64 = base64.b64encode(image_bytes).decode() + mime_type = self._detect_mime_type(image_bytes) messages: list[Dict[str, Any]] = [] if sys: messages.append({"role": "system", "content": sys}) @@ -328,7 +342,7 @@ def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) "role": "user", "content": [ {"type": "text", "text": usr}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_b64}"}}, ], } ) @@ -337,7 +351,6 @@ def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) messages=messages, temperature=self.temperature, max_tokens=2048, - response_format={"type": "json_object"}, ) content = response.choices[0].message.content.strip() 
token_count_input = response.usage.prompt_tokens @@ -359,9 +372,9 @@ def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) elif sys and len(sys) >= config.min_cache_tokens: metrics.record_miss("openai", "automatic_vlm", total_tokens=token_count_input) - # Report usage via hook + # Report usage via hook (use actual provider name, e.g. "grok", "minimax") self._report_usage_async( - "vlm_openai", "openai", self.model, + f"vlm_{self.provider}", self.provider, self.model, token_count_input, token_count_output, cached_tokens ) @@ -377,16 +390,20 @@ def _ollama_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) payload = { "model": self.model, "prompt": usr, - "system": sys, "images": [img_b64], "stream": False, - "temperature": self.temperature, + "options": {"temperature": self.temperature}, } + if sys: + payload["system"] = sys url: str = f"{self.remote_url.rstrip('/')}/api/generate" r = requests.post(url, json=payload, timeout=600) r.raise_for_status() - content = r.json().get("response", "").strip() - total_tokens = r.json().get("usage", {}).get("total_tokens", 0) + result = r.json() + content = result.get("response", "").strip() + token_count_input = result.get("prompt_eval_count", 0) + token_count_output = result.get("eval_count", 0) + total_tokens = token_count_input + token_count_output return { "tokens_used": total_tokens or 0, @@ -404,7 +421,7 @@ def _gemini_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) image_bytes=image_bytes, system_prompt=sys, temperature=self.temperature, - json_mode=True, + json_mode=False, ) # Record cache metrics @@ -431,6 +448,7 @@ def _gemini_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: """BytePlus vision request.""" img_b64 = base64.b64encode(image_bytes).decode() + mime_type = self._detect_mime_type(image_bytes) messages: list[Dict[str, Any]] = [] 
if sys: messages.append({"role": "system", "content": sys}) @@ -440,7 +458,7 @@ def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str "role": "user", "content": [ {"type": "text", "text": usr}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_b64}"}}, ], } ) @@ -451,7 +469,6 @@ def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str "messages": messages, "temperature": self.temperature, "max_tokens": 2048, - "response_format": {"type": "json_object"}, } headers = { "Content-Type": "application/json", @@ -486,14 +503,7 @@ def _anthropic_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: st img_b64 = base64.b64encode(image_bytes).decode() config = get_cache_config() - # Detect media type from image bytes - media_type = "image/jpeg" - if image_bytes[:8] == b'\x89PNG\r\n\x1a\n': - media_type = "image/png" - elif image_bytes[:4] == b'GIF8': - media_type = "image/gif" - elif image_bytes[:4] == b'RIFF' and image_bytes[8:12] == b'WEBP': - media_type = "image/webp" + media_type = self._detect_mime_type(image_bytes) message_content = [ { diff --git a/agent_core/core/registry/action.py b/agent_core/core/registry/action.py index 956c9dba..46478333 100644 --- a/agent_core/core/registry/action.py +++ b/agent_core/core/registry/action.py @@ -19,6 +19,8 @@ result = await executor.execute_action(action, input_data) """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/context.py b/agent_core/core/registry/context.py index fe3aef47..4ba203d5 100644 --- a/agent_core/core/registry/context.py +++ b/agent_core/core/registry/context.py @@ -16,6 +16,8 @@ system_prompt, user_prompt = engine.make_prompt(query="...") """ +from __future__ import annotations + from typing import TYPE_CHECKING from 
agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/database.py b/agent_core/core/registry/database.py index ab04e20d..cb5a3827 100644 --- a/agent_core/core/registry/database.py +++ b/agent_core/core/registry/database.py @@ -18,6 +18,8 @@ db.list_actions() """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/event_stream.py b/agent_core/core/registry/event_stream.py index 041ff55d..fec9e3e3 100644 --- a/agent_core/core/registry/event_stream.py +++ b/agent_core/core/registry/event_stream.py @@ -16,6 +16,8 @@ manager.log("INFO", "Something happened") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/llm.py b/agent_core/core/registry/llm.py index 4e82fb67..be8d40ab 100644 --- a/agent_core/core/registry/llm.py +++ b/agent_core/core/registry/llm.py @@ -18,6 +18,8 @@ response = await llm.generate_response_async(prompt) """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/memory.py b/agent_core/core/registry/memory.py index c1586d69..cf774336 100644 --- a/agent_core/core/registry/memory.py +++ b/agent_core/core/registry/memory.py @@ -21,6 +21,8 @@ pointers = memory.retrieve("user preferences") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/state.py b/agent_core/core/registry/state.py index 3b869851..45571b50 100644 --- a/agent_core/core/registry/state.py +++ b/agent_core/core/registry/state.py @@ -19,6 +19,8 @@ await manager.start_session() """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base 
import ComponentRegistry diff --git a/agent_core/core/registry/task_manager.py b/agent_core/core/registry/task_manager.py index ce87f4e8..da57db77 100644 --- a/agent_core/core/registry/task_manager.py +++ b/agent_core/core/registry/task_manager.py @@ -16,6 +16,8 @@ task_id = manager.create_task("My Task", "Do something") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/decorators/log_events.py b/agent_core/decorators/log_events.py index ab9a7cfe..41a84547 100644 --- a/agent_core/decorators/log_events.py +++ b/agent_core/decorators/log_events.py @@ -8,6 +8,8 @@ {id}, {name}, {args}, {kwargs}, {result}, {exception}, {duration_ms} """ +from __future__ import annotations + import logging import time import uuid diff --git a/agent_core/decorators/profiler.py b/agent_core/decorators/profiler.py index 38e5e77c..ca35a343 100644 --- a/agent_core/decorators/profiler.py +++ b/agent_core/decorators/profiler.py @@ -28,6 +28,8 @@ Set "auto_save_interval" to N to save after every N loops (0 = only at exit). 
""" +from __future__ import annotations + import atexit import asyncio import functools diff --git a/app/config/skills_config.json b/app/config/skills_config.json index 0975a5d4..8fde8d49 100644 --- a/app/config/skills_config.json +++ b/app/config/skills_config.json @@ -1,7 +1,6 @@ { "auto_load": true, "enabled_skills": [ - "cli-anything", "docx", "pdf", "playwright-mcp", @@ -9,6 +8,7 @@ "xlsx" ], "disabled_skills": [ + "cli-anything", "agentmail", "ai-news-collector", "ai-ppt-generator", diff --git a/app/gui/gui_module.py b/app/gui/gui_module.py index 6d53c583..124bb967 100644 --- a/app/gui/gui_module.py +++ b/app/gui/gui_module.py @@ -1,3 +1,5 @@ + +from __future__ import annotations import json import ast import tempfile diff --git a/app/internal_action_interface.py b/app/internal_action_interface.py index a1486f1b..9fffeadb 100644 --- a/app/internal_action_interface.py +++ b/app/internal_action_interface.py @@ -5,6 +5,8 @@ framework internal functions. """ +from __future__ import annotations + from typing import Dict, Any, Optional, List, TYPE_CHECKING from app.llm import LLMInterface, LLMCallType from app.vlm_interface import VLMInterface diff --git a/app/security/prompt_sanitizer.py b/app/security/prompt_sanitizer.py index 3dba8ced..71ae1ce0 100644 --- a/app/security/prompt_sanitizer.py +++ b/app/security/prompt_sanitizer.py @@ -9,6 +9,8 @@ - Format manipulation attacks """ +from __future__ import annotations + import re from typing import Any diff --git a/app/ui_layer/local_llm_setup.py b/app/ui_layer/local_llm_setup.py index 67437eab..e998c510 100644 --- a/app/ui_layer/local_llm_setup.py +++ b/app/ui_layer/local_llm_setup.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- """Local LLM setup utilities for Ollama.""" +from __future__ import annotations + import asyncio import json import logging diff --git a/install.py b/install.py index bfbfc982..2c346460 100644 --- a/install.py +++ b/install.py @@ -553,19 +553,43 @@ def verify_conda_env(env_name: str) -> bool: def 
install_nodejs_linux(): """ - Automatically install Node.js on Linux systems (including Kali). - Detects the package manager (apt, pacman, yum) and installs accordingly. + Automatically install Node.js on Linux/macOS systems (including Kali). + Detects the package manager (brew, apt, pacman, yum) and installs accordingly. """ if sys.platform == "win32": return True # Windows users should install Node.js manually from nodejs.org - + # Check if node is already installed if shutil.which("node") and shutil.which("npm"): print("✓ Node.js and npm are already installed") return True - + print("\n🔧 Installing Node.js...") - + + # macOS: try Homebrew first, then nvm + if sys.platform == "darwin": + if shutil.which("brew"): + print(" Found Homebrew, installing Node.js...") + try: + result = run_command(["brew", "install", "node"], check=False, capture=True, quiet=True, show_error=False) + if result and hasattr(result, 'returncode') and result.returncode == 0: + print("✓ Node.js installed via Homebrew") + time.sleep(1) + if shutil.which("node") and shutil.which("npm"): + return True + print("⚠ Node.js installed but not yet in PATH. Restart your terminal.") + return False + except Exception as e: + print(f" ⚠ brew install node failed: {str(e)[:100]}") + print("\n⚠ Could not automatically install Node.js on macOS") + print("\nOptions:") + print(" 1. Install Homebrew (https://brew.sh), then run: brew install node") + print(" 2. Download Node.js from: https://nodejs.org/ (LTS version)") + print(" 3. 
Use nvm: curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash") + print(" then: nvm install --lts") + print("\n After installation, restart your terminal and run: python3 install.py") + return False + # Detect package manager and prepare install commands # Format: (package_manager, update_cmd, install_cmd) package_managers = [ @@ -1148,10 +1172,127 @@ def show_api_setup_instructions(): print("="*50 + "\n") +# ========================================== +# LINUX PYTHON COMPATIBILITY CHECK +# ========================================== +def _check_linux_python() -> None: + """ + Warn Linux users who are running an old or system-managed Python. + + Common problem scenarios: + - Python < 3.9 (Ubuntu 20.04 default is 3.8) + - System Python used directly without a venv, which triggers PEP 668 + "externally-managed-environment" errors on newer distros + """ + ver = sys.version_info + + # Already gated to >= 3.9 above, but warn hard about 3.9 since + # it's the bare minimum — 3.11+ is much more reliable. + if ver < (3, 10): + print("\n" + "=" * 62) + print(f" ⚠ Python {ver.major}.{ver.minor} detected — upgrade recommended") + print("=" * 62) + print(f"\n You are running Python {ver.major}.{ver.minor}.{ver.micro}.") + print(" CraftBot works on 3.9+ but runs best on Python 3.11 or newer.") + print("\n To install Python 3.11 on Ubuntu/Debian/Kali:") + print(" sudo apt update") + print(" sudo apt install -y software-properties-common") + print(" sudo add-apt-repository ppa:deadsnakes/ppa") + print(" sudo apt install -y python3.11 python3.11-venv python3.11-pip") + print(" python3.11 install.py") + print() + print(" Or use pyenv (works on any distro):") + print(" curl https://pyenv.run | bash") + print(" pyenv install 3.11.9") + print(" pyenv local 3.11.9") + print(" python install.py") + print("=" * 62) + choice = input("\n Continue with Python 3.9 anyway? (y/n): ").strip().lower() + if choice != "y": + print("\n Installation cancelled. 
Please upgrade Python and try again.\n") + sys.exit(1) + print() + + +# ========================================== +# MAC PYTHON COMPATIBILITY CHECK +# ========================================== +def _check_mac_python() -> None: + """ + Warn Mac users who are running a problematic Python interpreter. + + Common bad interpreters on macOS: + - Xcode bundled Python (/Applications/Xcode.app/...) + - macOS system Python (/usr/bin/python3) + + Both are difficult to install packages into and are intended as OS + tooling, not for running user applications. Homebrew or python.org + Python is recommended instead. + """ + exe = sys.executable or "" + is_xcode = "Xcode.app" in exe or "Python3.framework" in exe + is_system = exe.startswith("/usr/bin/python") + + if not (is_xcode or is_system): + return # Running a proper Python — nothing to warn about + + ver = sys.version_info + label = "Xcode's built-in Python" if is_xcode else "macOS system Python" + + print("\n" + "=" * 62) + print(" ⚠ WARNING: Wrong Python interpreter detected") + print("=" * 62) + print(f"\n You are using {label}:") + print(f" {exe}") + print(f"\n This Python ({ver.major}.{ver.minor}.{ver.micro}) is reserved for macOS") + print(" system tools. Installing packages into it can be unreliable") + print(" and may break system components.") + print("\n Recommended fix — install Python via Homebrew:") + print() + print(" # 1. Install Homebrew (if not already installed):") + print(' /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"') + print() + print(" # 2. Install Python 3.11 (or newer):") + print(" brew install python@3.11") + print() + print(" # 3. 
Re-run the installer with Homebrew Python:") + print(" /opt/homebrew/bin/python3.11 install.py # Apple Silicon") + print(" /usr/local/bin/python3.11 install.py # Intel Mac") + print() + print(" Alternative: download Python from https://www.python.org/downloads/") + print("=" * 62) + + choice = input("\n Continue with the current interpreter anyway? (y/n): ").strip().lower() + if choice != "y": + print("\n Installation cancelled. Please use a Homebrew or python.org Python.\n") + sys.exit(1) + print() + + # ========================================== # MAIN # ========================================== if __name__ == "__main__": + # ── Python version gate ──────────────────────────────────────────────── + _ver = sys.version_info + if _ver < (3, 9): + print(f"\n❌ Python {_ver.major}.{_ver.minor} is not supported.") + print(" CraftBot requires Python 3.9 or newer.") + if sys.platform == "darwin": + print("\n Recommended fix:") + print(" 1. Install Homebrew: https://brew.sh") + print(" 2. Run: brew install python@3.11") + print(" 3. Re-run: /opt/homebrew/bin/python3.11 install.py") + else: + print("\n Please install Python 3.9+ from https://www.python.org/downloads/") + sys.exit(1) + + # ── platform-specific interpreter checks ────────────────────────────── + if sys.platform == "darwin": + _check_mac_python() + elif sys.platform == "linux": + _check_linux_python() + args = set(sys.argv[1:]) # Parse flags diff --git a/skills/cli-anything/SKILL.md b/skills/cli-anything/SKILL.md index 58309cc0..5dbff223 100644 --- a/skills/cli-anything/SKILL.md +++ b/skills/cli-anything/SKILL.md @@ -1,6 +1,6 @@ --- name: cli-anything -description: "Use any supported GUI application (GIMP, Blender, LibreOffice, Audacity, OBS, etc.) on behalf of the user. Auto-installs the app and CLI harness, then executes the task directly." 
+description: "Automatically handles image editing, document conversion, audio/video editing, 3D rendering, diagrams, AI image generation, and more — using GIMP, LibreOffice, Blender, Audacity, Inkscape, Krita, Kdenlive, Shotcut, OBS, Draw.io, Mermaid, Ollama, Stable Diffusion, ComfyUI, JupyterLab, FreeCAD, QGIS, Grafana, Gitea, GitLab, NextCloud, Jenkins, AdGuard Home, Zoom, Mubu. User does NOT need to mention CLI-Anything — agent auto-selects the right app for the task. Auto-installs if not present." action-sets: ["shell", "file_operations"] --- @@ -8,6 +8,56 @@ action-sets: ["shell", "file_operations"] **Core rule: Do everything yourself. Never give the user a command to run. Never explain steps. Just execute the task and report the result.** +**Activation rule: The user does NOT need to say "CLI-Anything". If their task matches a supported app below, use it automatically — no prompting needed.** + +--- + +## Task Routing — Auto-select the right app (check this before every task) + +| If the user asks about... | Use this app | Command prefix | +|---|---|---| +| Resize / crop / filter / edit an image | **GIMP** | `cli-anything-gimp` | +| Convert image format (JPG→PNG, PNG→WEBP, etc.) 
| **GIMP** | `cli-anything-gimp` | +| SVG, vector graphics, logos | **Inkscape** | `cli-anything-inkscape` | +| Digital painting, .kra files | **Krita** | `cli-anything-krita` | +| Convert DOCX / XLSX / PPTX → PDF | **LibreOffice** | `cli-anything-libreoffice` | +| Writer / Calc / Impress / spreadsheet macros | **LibreOffice** | `cli-anything-libreoffice` | +| Trim / convert / export audio (MP3, WAV, FLAC) | **Audacity** | `cli-anything-audacity` | +| Render / edit video | **Kdenlive** or **Shotcut** | `cli-anything-kdenlive` | +| Record screen or live stream | **OBS Studio** | `cli-anything-obs` | +| 3D modeling / rendering / .blend files | **Blender** | `cli-anything-blender` | +| Create or export diagrams (.drawio) | **Draw.io** | `cli-anything-draw-io` | +| Render Mermaid diagram code | **Mermaid** | `cli-anything-mermaid` | +| Generate image from text prompt (AI) | **Stable Diffusion** or **ComfyUI** | `cli-anything-stable-diffusion` | +| Run a local LLM | **Ollama** | `cli-anything-ollama` | +| AI content generation | **AnyGen** | `cli-anything-anygen` | +| AI research / summarize PDF | **NotebookLM** | `cli-anything-notebooklm` | +| Execute a Jupyter notebook | **JupyterLab** | `cli-anything-jupyterlab` | +| CAD / 3D design, .fcstd files | **FreeCAD** | `cli-anything-freecad` | +| GIS / maps, .qgz files | **QGIS** | `cli-anything-qgis` | +| Monitoring dashboards | **Grafana** | `cli-anything-grafana` | +| Git hosting, create repos | **Gitea** or **GitLab** | `cli-anything-gitea` | +| CI/CD pipelines | **Jenkins** | `cli-anything-jenkins` | +| Cloud file sync | **NextCloud** | `cli-anything-nextcloud` | +| Network-wide ad blocking | **AdGuard Home** | `cli-anything-adguard-home` | +| Video conferencing | **Zoom** | `cli-anything-zoom` | +| Knowledge outlines | **Mubu** | `cli-anything-mubu` | + +--- + +## Smart Fallback — When CLI-Anything fails + +CLI-Anything is the first choice, but if it fails the agent must still complete the task: + +1. 
**Try CLI-Anything first** — always attempt the harness (`cli-anything-`) +2. **If harness fails after 1 retry** — fall back to Python (PIL, python-docx, pydub, moviepy, etc.) and complete the task anyway +3. **Always tell the user** what was actually used and suggest installing the app for better results + +Example: +> "Done — resized using Python PIL as a fallback (GIMP harness failed). Install GIMP for higher quality results next time." + +Never leave the user with no result. Always complete the task one way or another. + --- ## FORBIDDEN — Never Do These (causes bugs on all platforms) @@ -47,67 +97,63 @@ If the user's message matches any of these (case-insensitive, any wording): **CLI-Anything — What I Can Do** -Just tell me what you want done in plain English. I'll auto-install the app if it's not on your system and complete the task for you — you never need to run any commands yourself. +Just describe your task in plain English — you don't need to mention CLI-Anything. I'll pick the right app, install it if needed, and complete the task. Works on Windows, macOS, and Linux. 
**Creative & Media** -| App | What I can do | Example prompt | +| App | What I do | Example | |---|---|---| -| GIMP | Resize, crop, blur, convert, export images | "Resize photo.jpg to 1920×1080 and save as photo_hd.jpg" | -| Blender | Render 3D scenes, run scripts, export models | "Render scene.blend to PNG frames in the frames/ folder" | -| Inkscape | Export SVG to PNG/PDF, convert vector files | "Export logo.svg as a 300 DPI PNG" | -| Krita | Export paintings, batch convert images | "Export painting.kra as PNG" | -| Audacity | Trim, export, convert audio files | "Trim the first 30 seconds from audio.mp3 and save as clip.mp3" | -| OBS Studio | Record screen, stream | "Record my screen for 60 seconds" | -| Kdenlive | Render video projects to MP4/MKV | "Render project.kdenlive to MP4" | -| Shotcut | Render video projects to MP4 | "Render project.mlt to MP4" | +| GIMP _(image editing)_ | Resize, crop, filter, convert, export images | "Resize photo.jpg to 1920×1080" | +| Blender _(3D modeling & rendering)_ | Render scenes, export models, run scripts | "Render scene.blend to PNG frames" | +| Inkscape _(vector graphics)_ | Export SVG to PNG/PDF, convert vectors | "Export logo.svg as 300 DPI PNG" | +| Audacity _(audio production)_ | Trim, convert, export audio | "Trim first 30s from audio.mp3" | +| OBS Studio _(live streaming & recording)_ | Record screen, capture video, stream | "Record my screen for 60 seconds" | +| Kdenlive _(video editing)_ | Render video projects to MP4/MKV | "Render project.kdenlive to MP4" | +| Shotcut _(video editing)_ | Render video projects to MP4 | "Render project.mlt to MP4" | +| Krita _(digital painting)_ | Export paintings, batch convert .kra files | "Export painting.kra as PNG" | **Office & Productivity** -| App | What I can do | Example prompt | +| App | What I do | Example | |---|---|---| -| LibreOffice | Convert DOCX/XLSX/PPTX to PDF, run macros | "Convert report.docx to PDF" | -| Mubu | Manage knowledge outlines | "Open my outline in 
Mubu" | +| LibreOffice _(Writer, Calc, Impress)_ | Convert DOCX/XLSX/PPTX to PDF, run macros | "Convert report.docx to PDF" | +| Mubu _(knowledge management & outlining)_ | Manage outlines and knowledge bases | "Open my outline in Mubu" | **Communication** -| App | What I can do | Example prompt | +| App | What I do | Example | |---|---|---| -| Zoom | Start/join meetings | "Start a Zoom meeting" | +| Zoom _(video conferencing)_ | Start or join meetings | "Start a Zoom meeting" | **Diagramming** -| App | What I can do | Example prompt | +| App | What I do | Example | |---|---|---| -| Draw.io | Export diagrams to PNG/SVG/PDF | "Export diagram.drawio as PNG" | -| Mermaid | Render diagram code to PNG | "Render this diagram to PNG: graph TD; A-->B; B-->C" | +| Draw.io _(diagrams)_ | Export diagrams to PNG/SVG/PDF | "Export diagram.drawio as PNG" | +| Mermaid Live Editor _(diagrams)_ | Render diagram code to image | "Render: graph TD; A-->B; B-->C" | **AI & ML** -| App | What I can do | Example prompt | -|---|---|---| -| ComfyUI | Run AI image generation workflows | "Run workflow.json and save images to output/" | -| AnyGen | Generate AI content | "Generate content using AnyGen" | -| NotebookLM | AI research and summarization | "Summarize this PDF using NotebookLM" | -| Ollama | Run local LLM inference | "Run llama3 and summarize this text: ..." 
| -| Stable Diffusion | Generate images from text prompts | "Generate 'a sunset over mountains' and save as out.png" | - -**Dev & Infrastructure** -| App | What I can do | Example prompt | +| App | What I do | Example | |---|---|---| -| JupyterLab | Execute notebooks, save output | "Execute notebook.ipynb and save the output" | -| Grafana | Export dashboards | "Export my dashboard as JSON" | -| Gitea | Create repos, manage git hosting | "Create a private repo called myrepo on Gitea" | -| GitLab | Create projects, manage CI/CD | "Create a new project on GitLab" | -| NextCloud | Sync files, manage cloud storage | "Sync my files to NextCloud" | -| Jenkins | Trigger build pipelines | "Trigger my build pipeline" | -| AdGuard Home | Set up network-wide ad blocking | "Set up network-wide ad blocking with AdGuard Home" | - -**GIS & 3D Design** -| App | What I can do | Example prompt | +| ComfyUI _(AI image generation)_ | Run AI image workflows | "Run workflow.json, save to output/" | +| AnyGen _(AI content generation)_ | Generate AI content | "Generate content using AnyGen" | +| NotebookLM _(AI research assistant)_ | Research, summarize documents | "Summarize this PDF in NotebookLM" | +| Ollama _(local LLM inference)_ | Run local AI models | "Run llama3: summarize this text" | +| Stable Diffusion WebUI | Generate images from text prompts | "Generate 'sunset over mountains'" | + +**Network & Infrastructure** +| App | What I do | Example | |---|---|---| +| AdGuard Home _(network-wide ad blocking)_ | Set up DNS-level ad blocking | "Set up AdGuard Home ad blocking" | +| JupyterLab | Execute notebooks, save output | "Run notebook.ipynb and save output" | +| Jenkins | Trigger CI/CD pipelines | "Trigger my build pipeline" | +| Gitea | Git hosting, create/manage repos | "Create private repo called myrepo" | +| NextCloud | Cloud file sync | "Sync my folder to NextCloud" | +| GitLab | Projects, CI/CD pipelines | "Create a new GitLab project" | +| Grafana | Export monitoring 
dashboards | "Export my dashboard as JSON" | | FreeCAD | Export 3D models to STL/STEP | "Export model.fcstd as STL" | | QGIS | Export maps to PNG/PDF | "Export map.qgz as PNG" | **Tips:** -- Always give me the full file path (e.g. `C:\Users\you\Desktop\photo.jpg`) -- If the app isn't installed, I'll install it automatically — just wait a few minutes -- I never ask you to run commands yourself — I do everything for you +- Give me the full file path (e.g. `C:\Users\you\Desktop\photo.jpg` or `/home/user/photo.jpg`) +- If the app isn't installed, I install it automatically — no action needed from you +- If the app fails, I fall back to a Python alternative and tell you - Works on Windows, macOS, and Linux --- diff --git a/skills/docx/scripts/comment.py b/skills/docx/scripts/comment.py index 36e1c935..35600710 100644 --- a/skills/docx/scripts/comment.py +++ b/skills/docx/scripts/comment.py @@ -13,6 +13,8 @@ """ +from __future__ import annotations + import argparse import random import shutil diff --git a/skills/docx/scripts/office/pack.py b/skills/docx/scripts/office/pack.py index db29ed8b..55b53343 100644 --- a/skills/docx/scripts/office/pack.py +++ b/skills/docx/scripts/office/pack.py @@ -10,6 +10,8 @@ python pack.py unpacked/ output.pptx --validate false """ +from __future__ import annotations + import argparse import sys import shutil diff --git a/skills/nano-banana-pro/scripts/generate_image.py b/skills/nano-banana-pro/scripts/generate_image.py index 0ceed2c2..0672c22e 100644 --- a/skills/nano-banana-pro/scripts/generate_image.py +++ b/skills/nano-banana-pro/scripts/generate_image.py @@ -1,3 +1,5 @@ + +from __future__ import annotations #!/usr/bin/env python3 # /// script # requires-python = ">=3.10" diff --git a/skills/ontology/scripts/ontology.py b/skills/ontology/scripts/ontology.py index 040b4354..2c8f8e07 100644 --- a/skills/ontology/scripts/ontology.py +++ b/skills/ontology/scripts/ontology.py @@ -1,3 +1,5 @@ + +from __future__ import annotations 
#!/usr/bin/env python3 """ Ontology graph operations: create, query, relate, validate. diff --git a/skills/pptx/scripts/office/pack.py b/skills/pptx/scripts/office/pack.py index db29ed8b..55b53343 100644 --- a/skills/pptx/scripts/office/pack.py +++ b/skills/pptx/scripts/office/pack.py @@ -10,6 +10,8 @@ python pack.py unpacked/ output.pptx --validate false """ +from __future__ import annotations + import argparse import sys import shutil diff --git a/skills/tesla-api/scripts/tesla.py b/skills/tesla-api/scripts/tesla.py index 3577107b..b5c10fd5 100644 --- a/skills/tesla-api/scripts/tesla.py +++ b/skills/tesla-api/scripts/tesla.py @@ -1,3 +1,5 @@ + +from __future__ import annotations #!/usr/bin/env python3 # /// script # requires-python = ">=3.10" diff --git a/skills/xlsx/scripts/office/pack.py b/skills/xlsx/scripts/office/pack.py index db29ed8b..55b53343 100644 --- a/skills/xlsx/scripts/office/pack.py +++ b/skills/xlsx/scripts/office/pack.py @@ -10,6 +10,8 @@ python pack.py unpacked/ output.pptx --validate false """ +from __future__ import annotations + import argparse import sys import shutil From 6915894d4160020c5f093deb6f19f368d5a9bb4c Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 22:25:36 +0530 Subject: [PATCH 13/30] fix(vlm): remove response_format json_object from byteplus, re-raise exceptions in describe_image_bytes --- agent_core/core/impl/vlm/interface.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index 455de4af..b4c7aed4 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -259,7 +259,7 @@ def describe_image_bytes( return cleaned except Exception as e: logger.error(f"[ERROR] {e}") - return "" + raise async def generate_response_async( self, @@ -624,7 +624,6 @@ def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str "messages": messages, "temperature": self.temperature, 
"max_tokens": 2048, - "response_format": {"type": "json_object"}, } headers = { "Content-Type": "application/json", From 125cff4368260ed302c3faa6783b85f6f3a59f7e Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 22:25:57 +0530 Subject: [PATCH 14/30] fix(actions): split action_sets string into proper list in perform_ocr and understand_video --- app/data/action/perform_ocr.py | 2 +- app/data/action/understand_video.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py index 3c1d01d9..85c2a5d6 100644 --- a/app/data/action/perform_ocr.py +++ b/app/data/action/perform_ocr.py @@ -4,7 +4,7 @@ name="perform_ocr", description="Extracts all text from an image using OCR via a Vision Language Model. Use this when the user wants to read text from a screenshot, scanned document, photo of a receipt, whiteboard, sign, or any image containing text. Returns extracted text saved to a file in workspace.", mode="CLI", - action_sets=["document_processing, image"], + action_sets=["document_processing", "image"], input_schema={ "image_path": { "type": "string", diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index e4c5c77d..8c280419 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -4,7 +4,7 @@ name="understand_video", description="Uses Gemini 1.5 Pro for native video understanding when a Google API key is configured. 
Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", - action_sets=["document_processing, image"], + action_sets=["document_processing", "image"], requirement=["google-generativeai"], input_schema={ "video_path": { From 247ee92824ffc58a96b679e32c251764bdf99486 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Thu, 16 Apr 2026 22:56:50 +0530 Subject: [PATCH 15/30] fix: wire independent VLM provider/model/key resolution and add availability guard --- app/agent_base.py | 19 +++++++++++++---- app/data/action/describe_image.py | 34 ++++++++++++++++++++++++++++--- app/data/action/generate_image.py | 2 +- app/main.py | 17 ++++++++++------ 4 files changed, 58 insertions(+), 14 deletions(-) diff --git a/app/agent_base.py b/app/agent_base.py index 8ee53288..8158dfcd 100644 --- a/app/agent_base.py +++ b/app/agent_base.py @@ -45,6 +45,8 @@ AGENT_FILE_SYSTEM_TEMPLATE_PATH, AGENT_MEMORY_CHROMA_PATH, PROCESS_MEMORY_AT_STARTUP, + get_api_key, + get_base_url, ) from app.internal_action_interface import InternalActionInterface @@ -124,6 +126,8 @@ def __init__( llm_api_key: str | None = None, llm_base_url: str | None = None, llm_model: str | None = None, + vlm_provider: str | None = None, + vlm_model: str | None = None, deferred_init: bool = False, ) -> None: """ @@ -139,6 +143,8 @@ def __init__( llm_api_key: API key for the LLM provider. llm_base_url: Base URL for the LLM provider (optional). llm_model: Model name override (None = use registry default). + vlm_provider: Provider name for VLM (defaults to llm_provider if None). + vlm_model: VLM model name override (None = use registry default). deferred_init: If True, allow LLM/VLM initialization to be deferred until API key is configured (useful for first-time setup). 
""" @@ -156,11 +162,16 @@ def __init__( base_url=llm_base_url, deferred=deferred_init, ) + # VLM uses its own provider/model settings, falling back to LLM values + _vlm_provider = vlm_provider or llm_provider + _vlm_api_key = get_api_key(_vlm_provider) if vlm_provider else llm_api_key + _vlm_base_url = get_base_url(_vlm_provider) if vlm_provider else llm_base_url + self.vlm = VLMInterface( - provider=llm_provider, - model=llm_model, - api_key=llm_api_key, - base_url=llm_base_url, + provider=_vlm_provider, + model=vlm_model, + api_key=_vlm_api_key, + base_url=_vlm_base_url, deferred=deferred_init, ) diff --git a/app/data/action/describe_image.py b/app/data/action/describe_image.py index abccca24..8e66ae7a 100644 --- a/app/data/action/describe_image.py +++ b/app/data/action/describe_image.py @@ -4,7 +4,7 @@ name="describe_image", description="Uses a Visual Language Model to analyse an image and return a detailed, markdown-ready description. IMPORTANT: Always provide a prompt describing what to look for or describe in the image.", mode="CLI", - action_sets=["document_processing, image"], + action_sets=["document_processing", "image"], input_schema={ "image_path": { "type": "string", @@ -48,9 +48,36 @@ def view_image(input_data: dict) -> dict: prompt = str(input_data.get('prompt', '')).strip() or "Describe the content of this image in detail." 
if simulated_mode: - # Return mock result for testing return {'status': 'success', 'description': 'A simulated image description showing various objects and colors.', 'message': ''} + # ── VLM availability guard ────────────────────────────────────────── + import app.internal_action_interface as iai + from agent_core.core.models.model_registry import MODEL_REGISTRY + from agent_core.core.models.types import InterfaceType + from app.config import get_vlm_provider + + vlm = iai.InternalActionInterface.vlm_interface + current_provider = get_vlm_provider() + registry_vlm = MODEL_REGISTRY.get(current_provider, {}).get(InterfaceType.VLM) + + if vlm is None or not registry_vlm: + return { + 'status': 'error', + 'description': '', + 'message': ( + f"The current VLM provider '{current_provider}' does not support vision/image analysis. " + "Please inform the user and suggest switching to a provider that supports VLM.\n\n" + "Providers with VLM support: openai, anthropic, gemini, byteplus.\n\n" + "To switch provider, edit 'app/config/settings.json' and update:\n" + ' "vlm_provider": "" (e.g. "anthropic")\n' + ' "vlm_model": "" (e.g. "claude-sonnet-4-6" for anthropic)\n\n' + "Make sure the corresponding API key is configured under 'api_keys' in the same file. " + "If no API key is set, ask the user to provide one. " + "The system will automatically detect the config change and reload." 
+ ), + } + # ─────────────────────────────────────────────────────────────────── + if not image_path: return {'status': 'error', 'description': '', 'message': 'image_path is required.'} @@ -58,8 +85,9 @@ def view_image(input_data: dict) -> dict: return {'status': 'error', 'description': '', 'message': 'File not found.'} try: - import app.internal_action_interface as iai description = iai.InternalActionInterface.describe_image(image_path, prompt) + if not description: + return {'status': 'error', 'description': '', 'message': 'VLM returned an empty description.'} return {'status': 'success', 'description': description, 'message': ''} except Exception as e: return {'status': 'error', 'description': '', 'message': str(e)} \ No newline at end of file diff --git a/app/data/action/generate_image.py b/app/data/action/generate_image.py index fde5dfae..751a2d5e 100644 --- a/app/data/action/generate_image.py +++ b/app/data/action/generate_image.py @@ -10,7 +10,7 @@ - TIP: When generating multiple images for the same project or related work, use 'reference_images' parameter with previously generated images to maintain consistent style across all outputs""", default=True, mode="CLI", - action_sets=["content_creation, image, document_processing"], + action_sets=["content_creation", "image", "document_processing"], input_schema={ "prompt": { "type": "string", diff --git a/app/main.py b/app/main.py index ce4e5dd4..418f3dd1 100644 --- a/app/main.py +++ b/app/main.py @@ -67,7 +67,7 @@ def _suppress_console_logging_early() -> None: ConfigRegistry.register_workspace_root(".") # Import settings reader (reads directly from settings.json) -from app.config import get_llm_provider, get_api_key, get_base_url, get_llm_model +from app.config import get_llm_provider, get_vlm_provider, get_api_key, get_base_url, get_llm_model, get_vlm_model from app.agent_base import AgentBase @@ -110,12 +110,12 @@ def _parse_cli_args() -> dict: return vars(args) -def _initial_settings() -> tuple[str, str, 
str, bool]: +def _initial_settings() -> tuple: """Determine initial provider, API key, and base URL from settings.json. Returns: - Tuple of (provider, api_key, base_url, has_valid_key) where has_valid_key - indicates if a working API key was found. + Tuple of (provider, api_key, base_url, model, vlm_provider, vlm_model, has_valid_key) + where has_valid_key indicates if a working API key was found. """ # Read directly from settings.json provider = get_llm_provider() @@ -126,7 +126,10 @@ def _initial_settings() -> tuple[str, str, str, bool]: # Remote (Ollama) doesn't require API key has_key = bool(api_key) or provider == "remote" - return provider, api_key, base_url, model, has_key + vlm_prov = get_vlm_provider() # falls back to llm_provider if not set + vlm_mod = get_vlm_model() # falls back to registry default if None + + return provider, api_key, base_url, model, vlm_prov, vlm_mod, has_key async def main_async() -> None: @@ -136,7 +139,7 @@ async def main_async() -> None: browser_mode = cli_args.get("browser", False) # Get settings from settings.json - provider, api_key, base_url, model, has_valid_key = _initial_settings() + provider, api_key, base_url, model, vlm_prov, vlm_mod, has_valid_key = _initial_settings() # CLI args override settings.json if provided if cli_args.get("provider"): @@ -159,6 +162,8 @@ async def main_async() -> None: llm_api_key=api_key, llm_base_url=base_url, llm_model=model, + vlm_provider=vlm_prov, + vlm_model=vlm_mod, deferred_init=not has_valid_key, ) From f00ae32d15b3c8c17210cfb2c99f24bcf5fa8850 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Fri, 17 Apr 2026 16:40:04 +0530 Subject: [PATCH 16/30] refactor(vlm): unify multimodal, deduplicate OCR path, dynamic video model - Merge generate_multimodal_multi_image into generate_multimodal (image_bytes_list param) - Add json_mode param to describe_image_bytes; describe_image_ocr now a thin wrapper - understand_video pulls model from get_vlm_model() with gemini-1.5-pro fallback - Add test suites: 
gemini_client_multimodal, vlm_json_mode, ocr_wrapper, understand_video_model --- agent_core/core/impl/vlm/interface.py | 33 +++---- agent_core/core/llm/google_gemini_client.py | 99 +++++---------------- app/data/action/understand_video.py | 7 +- tests/test_gemini_client_multimodal.py | 49 ++++++++++ tests/test_step1_vlm_interface.py | 21 ++--- tests/test_understand_video_model.py | 49 ++++++++++ tests/test_vlm_interface_json_mode.py | 53 +++++++++++ tests/test_vlm_ocr_wrapper.py | 52 +++++++++++ 8 files changed, 255 insertions(+), 108 deletions(-) create mode 100644 tests/test_gemini_client_multimodal.py create mode 100644 tests/test_understand_video_model.py create mode 100644 tests/test_vlm_interface_json_mode.py create mode 100644 tests/test_vlm_ocr_wrapper.py diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index b4c7aed4..1ddd401b 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -217,6 +217,7 @@ def describe_image_bytes( system_prompt: str | None = None, user_prompt: str | None = "Describe this image in detail.", log_response: bool = True, + json_mode: bool = True, ) -> str: """Describe an image from raw bytes using the VLM. 
@@ -234,7 +235,10 @@ def describe_image_bytes( logger.info(f"[LLM SEND] system={system_prompt} | user={user_prompt}") if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): - response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) + if json_mode: + response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) + else: + response = self._openai_describe_bytes_plain(image_bytes, system_prompt, user_prompt) elif self.provider == "remote": response = self._ollama_describe_bytes(image_bytes, system_prompt, user_prompt) elif self.provider == "gemini": @@ -311,24 +315,13 @@ def describe_image_ocr( logger.info(f"[LLM SEND] OCR request | path={image_path}") - if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): - response = self._openai_describe_bytes_plain(image_bytes, system_prompt, effective_user) - elif self.provider == "remote": - response = self._ollama_describe_bytes(image_bytes, system_prompt, effective_user) - elif self.provider == "gemini": - response = self._gemini_describe_bytes(image_bytes, system_prompt, effective_user) - elif self.provider == "byteplus": - response = self._byteplus_describe_bytes(image_bytes, system_prompt, effective_user) - elif self.provider == "anthropic": - response = self._anthropic_describe_bytes(image_bytes, system_prompt, effective_user) - else: - raise RuntimeError(f"Unknown provider {self.provider!r}") - - cleaned = re.sub(self._CODE_BLOCK_RE, "", response.get("content", "").strip()) - - tokens_used = response.get("tokens_used", 0) - if tokens_used: - self._set_token_count(self._get_token_count() + tokens_used) + cleaned = self.describe_image_bytes( + image_bytes, + system_prompt=system_prompt, + user_prompt=effective_user, + log_response=False, # Logged below + json_mode=False, + ) logger.info(f"[LLM RECV OCR] {cleaned[:120]}...") return cleaned @@ -451,7 +444,7 @@ def _gemini_describe_video_frames( self, frame_bytes_list: list[bytes], sys: str | 
None, usr: str ) -> str: """Gemini-specific multi-image frame analysis in a single API call.""" - result = self._gemini_client.generate_multimodal_multi_image( + result = self._gemini_client.generate_multimodal( self.model, text=usr, image_bytes_list=frame_bytes_list, diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index 3cbffe44..36ae2f21 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -168,12 +168,16 @@ def generate_multimodal( model: str, *, text: str, - image_bytes: bytes, + image_bytes: Optional[bytes] = None, + image_bytes_list: Optional[List[bytes]] = None, system_prompt: Optional[str] = None, temperature: Optional[float] = None, json_mode: bool = False, ) -> Dict[str, Any]: - """Generate text from a prompt that also contains an inline image. + """Generate text from a prompt that contains one or more inline images. + + Normalises both single-image and multi-image inputs into a consistent + request format for the Gemini API. 
Returns a dict containing: - tokens_used: Total tokens consumed @@ -185,7 +189,8 @@ def generate_multimodal( Args: model: Model identifier text: The text prompt - image_bytes: PNG image data + image_bytes: Single PNG image data (for backward compatibility) + image_bytes_list: List of image data (PNG/JPEG) system_prompt: Optional system instruction temperature: Sampling temperature json_mode: If True, enforce JSON output format @@ -193,80 +198,22 @@ def generate_multimodal( Returns: Dict with generation results and token counts """ - inline_data = { - "mimeType": "image/png", - "data": base64.b64encode(image_bytes).decode("utf-8"), - } - - parts: List[Dict[str, Any]] = [{"text": text}, {"inlineData": inline_data}] - contents = [{"role": "user", "parts": parts}] - - payload: Dict[str, Any] = {"contents": contents} - if system_prompt: - payload["systemInstruction"] = { - "parts": [{"text": system_prompt}], - } - - generation_config: Dict[str, Any] = {} - if temperature is not None: - generation_config["temperature"] = temperature - if json_mode: - generation_config["responseMimeType"] = "application/json" - if generation_config: - payload["generationConfig"] = generation_config - - response = self._post_json( - f"{_normalise_model_name(model)}:generateContent", payload - ) - - # Extract token usage from usageMetadata - usage_metadata = response.get("usageMetadata", {}) - total_tokens = usage_metadata.get("totalTokenCount", 0) - prompt_tokens = usage_metadata.get("promptTokenCount", 0) - completion_tokens = usage_metadata.get("candidatesTokenCount", 0) - cached_tokens = usage_metadata.get("cachedContentTokenCount", 0) - - content = self._extract_text(response) + # Normalise: single image wraps into list; list takes priority if both provided + images = image_bytes_list if image_bytes_list is not None else ([image_bytes] if image_bytes else []) + if not images: + raise ValueError("At least one of `image_bytes` or `image_bytes_list` must be provided.") - return { - 
"tokens_used": total_tokens, - "content": content, - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "cached_tokens": cached_tokens, - } - - def generate_multimodal_multi_image( - self, - model: str, - *, - text: str, - image_bytes_list: List[bytes], - system_prompt: Optional[str] = None, - temperature: Optional[float] = None, - json_mode: bool = False, - ) -> Dict[str, Any]: - """Generate text from a prompt that contains multiple inline images. - - Args: - model: Model identifier - text: The text prompt - image_bytes_list: List of PNG/JPEG image data - system_prompt: Optional system instruction - temperature: Sampling temperature - json_mode: If True, enforce JSON output format - - Returns: - Dict with generation results and token counts - """ parts: List[Dict[str, Any]] = [{"text": text}] - - for image_bytes in image_bytes_list: - inline_data = { - "mimeType": "image/jpeg", - "data": base64.b64encode(image_bytes).decode("utf-8"), - } - parts.append({"inlineData": inline_data}) + for img in images: + # Preserve existing mime-type logic: single-image callers stay PNG index, + # multi-image callers (video frames) use JPEG. + mime = "image/jpeg" if image_bytes_list is not None else "image/png" + parts.append({ + "inlineData": { + "mimeType": mime, + "data": base64.b64encode(img).decode("utf-8"), + } + }) contents = [{"role": "user", "parts": parts}] @@ -305,6 +252,8 @@ def generate_multimodal_multi_image( "cached_tokens": cached_tokens, } + + def embed_text(self, model: str, *, text: str) -> List[float]: """Fetch an embedding vector for the supplied text. diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index 8c280419..12a19804 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -2,7 +2,7 @@ @action( name="understand_video", - description="Uses Gemini 1.5 Pro for native video understanding when a Google API key is configured. 
Falls back to keyframe extraction via OpenCV if no Google API key is available.", + description="Uses the configured VLM model (default: Gemini 1.5 Pro) for native video understanding when a Google API key is configured. Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", action_sets=["document_processing", "image"], requirement=["google-generativeai"], @@ -80,7 +80,7 @@ def understand_video(input_data: dict) -> dict: if not os.path.isfile(video_path): return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': 'File not found.'} - from app.config import get_api_key + from app.config import get_api_key, get_vlm_model api_key = get_api_key('gemini') if api_key: @@ -97,7 +97,8 @@ def understand_video(input_data: dict) -> dict: time.sleep(2) video_file = genai.get_file(video_file.name) - model = genai.GenerativeModel("gemini-1.5-pro") + vlm_model = get_vlm_model() or "gemini-1.5-pro" + model = genai.GenerativeModel(vlm_model) prompt = query if query else "Understand and describe the contents of this video." 
response = model.generate_content([video_file, prompt]) diff --git a/tests/test_gemini_client_multimodal.py b/tests/test_gemini_client_multimodal.py new file mode 100644 index 00000000..16c7b5fb --- /dev/null +++ b/tests/test_gemini_client_multimodal.py @@ -0,0 +1,49 @@ +import base64 +import pytest +from unittest.mock import patch, MagicMock +from agent_core.core.llm.google_gemini_client import GeminiClient + +FAKE_RESPONSE = { + "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], + "usageMetadata": {"totalTokenCount": 10, "promptTokenCount": 8, "candidatesTokenCount": 2} +} + +@pytest.fixture +def client(): + return GeminiClient(api_key="fake-key") + +def test_single_image_produces_one_inlinedata_part(client): + """Passing image_bytes alone → exactly 1 inlineData in parts.""" + with patch.object(client, "_post_json", return_value=FAKE_RESPONSE) as mock_post: + client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes=b"img1") + # mock_post.call_args.args[1] is the payload + payload = mock_post.call_args.args[1] + parts = payload["contents"][0]["parts"] + inline_parts = [p for p in parts if "inlineData" in p] + assert len(inline_parts) == 1 + +def test_multi_image_produces_correct_count(client): + """Passing image_bytes_list of N images → exactly N inlineData parts.""" + with patch.object(client, "_post_json", return_value=FAKE_RESPONSE) as mock_post: + client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes_list=[b"a", b"b", b"c"]) + payload = mock_post.call_args.args[1] + parts = payload["contents"][0]["parts"] + inline_parts = [p for p in parts if "inlineData" in p] + assert len(inline_parts) == 3 + +def test_neither_image_raises_valueerror(client): + """Passing neither image_bytes nor image_bytes_list → ValueError.""" + with pytest.raises(ValueError): + client.generate_multimodal("gemini-2.0-flash", text="hi") + +def test_single_image_backwards_compat_response(client): + """Single-image call returns same 
response structure as before the refactor.""" + with patch.object(client, "_post_json", return_value=FAKE_RESPONSE): + result = client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes=b"img") + assert result["content"] == "ok" + assert result["tokens_used"] == 10 + +def test_generate_multimodal_multi_image_no_longer_exists(client): + """The old method must be gone.""" + assert not hasattr(client, "generate_multimodal_multi_image"), \ + "generate_multimodal_multi_image was not removed" diff --git a/tests/test_step1_vlm_interface.py b/tests/test_step1_vlm_interface.py index c1bf516f..88937c8c 100644 --- a/tests/test_step1_vlm_interface.py +++ b/tests/test_step1_vlm_interface.py @@ -42,14 +42,13 @@ def _make_client(self): client._timeout = 30 return client - def test_method_exists(self): - """generate_multimodal_multi_image must exist on GeminiClient.""" + def test_method_accepts_list(self): + """generate_multimodal must accept image_bytes_list.""" from agent_core.core.llm.google_gemini_client import GeminiClient - self.assertTrue( - hasattr(GeminiClient, "generate_multimodal_multi_image"), - "FAIL: GeminiClient.generate_multimodal_multi_image not found. 
" - "Add it to agent_core/core/llm/google_gemini_client.py" - ) + import inspect + sig = inspect.signature(GeminiClient.generate_multimodal) + self.assertIn("image_bytes_list", sig.parameters, + "FAIL: GeminiClient.generate_multimodal does not accept image_bytes_list.") def test_payload_contains_multiple_inline_data_parts(self): """The API payload must contain one inlineData entry per frame passed in.""" @@ -68,7 +67,7 @@ def fake_post(path, payload): client._post_json = fake_post frame_bytes = [b"frame1_bytes", b"frame2_bytes", b"frame3_bytes"] - result = client.generate_multimodal_multi_image( + result = client.generate_multimodal( "gemini-2.5-flash", text="What is happening?", image_bytes_list=frame_bytes, @@ -109,7 +108,7 @@ def test_system_prompt_is_included(self): captured = {} client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] - client.generate_multimodal_multi_image( + client.generate_multimodal( "gemini-2.5-flash", text="Describe", image_bytes_list=[b"img"], @@ -128,7 +127,7 @@ def test_no_system_prompt_omits_key(self): captured = {} client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] - client.generate_multimodal_multi_image( + client.generate_multimodal( "gemini-2.5-flash", text="Describe", image_bytes_list=[b"img"], @@ -527,6 +526,7 @@ def test_describe_image_bytes_returns_string(self): mock_response.choices = [mock_choice] mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 + mock_response.usage.prompt_tokens_details = None # Prevent MagicMock leak vlm.client = MagicMock() vlm.client.chat.completions.create.return_value = mock_response @@ -547,6 +547,7 @@ def test_describe_image_bytes_uses_json_response_format(self): mock_response.choices = [mock_choice] mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 5 + mock_response.usage.prompt_tokens_details = None # Prevent MagicMock leak vlm.client = MagicMock() 
vlm.client.chat.completions.create.return_value = mock_response diff --git a/tests/test_understand_video_model.py b/tests/test_understand_video_model.py new file mode 100644 index 00000000..cd69dc3b --- /dev/null +++ b/tests/test_understand_video_model.py @@ -0,0 +1,49 @@ +import pytest +from unittest.mock import patch, MagicMock +import os + +def test_understand_video_uses_configured_model(): + """understand_video must use get_vlm_model(), not hardcode gemini-1.5-pro.""" + mock_file = MagicMock() + mock_file.state.name = "ACTIVE" + mock_model_instance = MagicMock() + mock_model_instance.generate_content.return_value = MagicMock(text="video summary") + + with patch("os.path.isfile", return_value=True), \ + patch("app.config.get_api_key", return_value="fake-key"), \ + patch("app.config.get_vlm_model", return_value="gemini-2.0-flash") as mock_get_model, \ + patch("google.generativeai.configure"), \ + patch("google.generativeai.upload_file", return_value=mock_file), \ + patch("google.generativeai.get_file", return_value=mock_file), \ + patch("google.generativeai.GenerativeModel", return_value=mock_model_instance) as mock_gm, \ + patch("google.generativeai.delete_file"), \ + patch("builtins.open", MagicMock()), \ + patch("app.config.AGENT_WORKSPACE_ROOT", "/tmp"): + from app.data.action.understand_video import understand_video + understand_video({"video_path": "/fake/video.mp4"}) + called_model_name = mock_gm.call_args[0][0] + assert called_model_name == "gemini-2.0-flash", \ + f"Expected gemini-2.0-flash from config, got {called_model_name}" + +def test_understand_video_falls_back_when_config_missing(): + """If get_vlm_model() returns None, fall back to gemini-1.5-pro.""" + mock_file = MagicMock() + mock_file.state.name = "ACTIVE" + mock_model_instance = MagicMock() + mock_model_instance.generate_content.return_value = MagicMock(text="summary") + + with patch("os.path.isfile", return_value=True), \ + patch("app.config.get_api_key", return_value="fake-key"), \ + 
patch("app.config.get_vlm_model", return_value=None), \ + patch("google.generativeai.configure"), \ + patch("google.generativeai.upload_file", return_value=mock_file), \ + patch("google.generativeai.get_file", return_value=mock_file), \ + patch("google.generativeai.GenerativeModel", return_value=mock_model_instance) as mock_gm, \ + patch("google.generativeai.delete_file"), \ + patch("builtins.open", MagicMock()), \ + patch("app.config.AGENT_WORKSPACE_ROOT", "/tmp"): + from app.data.action.understand_video import understand_video + understand_video({"video_path": "/fake/video.mp4"}) + called_model_name = mock_gm.call_args[0][0] + assert called_model_name == "gemini-1.5-pro", \ + f"Expected fallback gemini-1.5-pro, got {called_model_name}" diff --git a/tests/test_vlm_interface_json_mode.py b/tests/test_vlm_interface_json_mode.py new file mode 100644 index 00000000..3d38495c --- /dev/null +++ b/tests/test_vlm_interface_json_mode.py @@ -0,0 +1,53 @@ +import pytest +from unittest.mock import MagicMock, patch +from agent_core.core.impl.vlm.interface import VLMInterface + +PLAIN_RESPONSE = {"content": "raw text output", "tokens_used": 5} + +def _make_vlm(provider="openai"): + """Create a VLMInterface with mocked internals.""" + with patch("agent_core.core.impl.vlm.interface.VLMInterface.__init__", return_value=None): + vlm = VLMInterface.__new__(VLMInterface) + vlm.provider = provider + vlm.model = "gpt-4o" + vlm.temperature = 0.5 + vlm._get_token_count = lambda: 0 + vlm._set_token_count = lambda x: None + vlm._report_usage = None + vlm._CODE_BLOCK_RE = VLMInterface._CODE_BLOCK_RE + return vlm + +def test_openai_json_mode_true_uses_json_method(): + """describe_image_bytes with json_mode=True (default) → _openai_describe_bytes.""" + vlm = _make_vlm("openai") + vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img", json_mode=True) + 
vlm._openai_describe_bytes.assert_called_once() + vlm._openai_describe_bytes_plain.assert_not_called() + +def test_openai_json_mode_false_uses_plain_method(): + """describe_image_bytes with json_mode=False → _openai_describe_bytes_plain.""" + vlm = _make_vlm("openai") + vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img", json_mode=False) + vlm._openai_describe_bytes_plain.assert_called_once() + vlm._openai_describe_bytes.assert_not_called() + +def test_default_json_mode_is_true(): + """Calling describe_image_bytes without json_mode defaults to True (no regression).""" + vlm = _make_vlm("openai") + vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img") # no json_mode arg + vlm._openai_describe_bytes.assert_called_once() + +def test_gemini_unaffected_by_json_mode(): + """Gemini always uses _gemini_describe_bytes regardless of json_mode flag.""" + vlm = _make_vlm("gemini") + vlm._gemini_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) + vlm.describe_image_bytes(b"img", json_mode=False) + vlm._gemini_describe_bytes.assert_called_once() + vlm.describe_image_bytes(b"img", json_mode=True) + assert vlm._gemini_describe_bytes.call_count == 2 diff --git a/tests/test_vlm_ocr_wrapper.py b/tests/test_vlm_ocr_wrapper.py new file mode 100644 index 00000000..8e12846d --- /dev/null +++ b/tests/test_vlm_ocr_wrapper.py @@ -0,0 +1,52 @@ +import os +import pytest +import tempfile +from unittest.mock import MagicMock, patch +from agent_core.core.impl.vlm.interface import VLMInterface + +def _make_vlm(): + with patch("agent_core.core.impl.vlm.interface.VLMInterface.__init__", return_value=None): + vlm = VLMInterface.__new__(VLMInterface) + vlm.provider = "openai" + vlm.model = "gpt-4o" + vlm.temperature = 0.5 + vlm._get_token_count = 
lambda: 0 + vlm._set_token_count = lambda x: None + vlm._report_usage = None + vlm._CODE_BLOCK_RE = VLMInterface._CODE_BLOCK_RE + return vlm + +def test_ocr_calls_describe_image_bytes_with_json_mode_false(tmp_path): + """describe_image_ocr must delegate to describe_image_bytes with json_mode=False.""" + img_file = tmp_path / "test.png" + img_file.write_bytes(b"fakeimgdata") + vlm = _make_vlm() + vlm.describe_image_bytes = MagicMock(return_value="extracted text") + vlm.describe_image_ocr(str(img_file)) + call_kwargs = vlm.describe_image_bytes.call_args.kwargs + assert call_kwargs.get("json_mode") == False, \ + "describe_image_ocr must pass json_mode=False" + +def test_ocr_system_prompt_is_ocr_focused(tmp_path): + """The system prompt passed by OCR must mention OCR/extraction, not description.""" + img_file = tmp_path / "test.png" + img_file.write_bytes(b"fakeimgdata") + vlm = _make_vlm() + vlm.describe_image_bytes = MagicMock(return_value="text") + vlm.describe_image_ocr(str(img_file)) + sys_prompt = vlm.describe_image_bytes.call_args.kwargs.get("system_prompt", "") + assert "OCR" in sys_prompt or "extract" in sys_prompt.lower() + +def test_ocr_no_provider_routing_in_method(): + """describe_image_ocr source must not contain a provider routing switch.""" + import inspect + src = inspect.getsource(VLMInterface.describe_image_ocr) + assert "self.provider" not in src, \ + "describe_image_ocr still contains provider routing — refactor incomplete" + assert "elif self.provider ==" not in src, \ + "describe_image_ocr still contains provider routing switch" + +def test_ocr_raises_on_missing_file(): + vlm = _make_vlm() + with pytest.raises(FileNotFoundError): + vlm.describe_image_ocr("/nonexistent/path/image.png") From 3fbe092745532bc88b361f972bf566218fac788d Mon Sep 17 00:00:00 2001 From: ahmad-ajmal Date: Fri, 17 Apr 2026 15:09:20 +0100 Subject: [PATCH 17/30] name limit on craftbot --- app/onboarding/interfaces/steps.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/app/onboarding/interfaces/steps.py b/app/onboarding/interfaces/steps.py index baf94ec4..d87c02aa 100644 --- a/app/onboarding/interfaces/steps.py +++ b/app/onboarding/interfaces/steps.py @@ -242,8 +242,13 @@ def get_options(self) -> List[StepOption]: def validate(self, value: Any) -> tuple[bool, Optional[str]]: # Accept legacy string submissions (plain text name) for backward compat. if isinstance(value, str): + if len(value) > 20: + return False, "Agent name must be 20 characters or fewer" return True, None if isinstance(value, dict): + agent_name = value.get("agent_name") + if agent_name and len(str(agent_name)) > 20: + return False, "Agent name must be 20 characters or fewer" picture = value.get("agent_profile_picture") if picture not in (None, ""): if not isinstance(picture, str) or picture.lower() not in self.ALLOWED_PICTURE_EXTS: From a17980b0e60b976498443e779552c2cba979f07c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=82=A4=E3=83=84=E3=83=9F=E3=83=8D?= Date: Fri, 17 Apr 2026 23:39:45 +0900 Subject: [PATCH 18/30] Update settings.json Update version from 1.2.2 to 1.2.3 in settings config --- app/config/settings.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/config/settings.json b/app/config/settings.json index 669d7ebd..4d5efca0 100644 --- a/app/config/settings.json +++ b/app/config/settings.json @@ -1,5 +1,5 @@ { - "version": "1.2.2", + "version": "1.2.3", "general": { "agent_name": "CraftBot", "os_language": "en" @@ -76,4 +76,4 @@ "google": true, "byteplus": true } -} \ No newline at end of file +} From 6c0b2c2574515d5766ef2d875a64ece8456f40a9 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Tue, 21 Apr 2026 16:04:55 +0530 Subject: [PATCH 19/30] chore: remove test files from PR --- tests/test_gemini_client_multimodal.py | 49 -- tests/test_step1_vlm_interface.py | 564 ----------------- tests/test_step2_iai_methods.py | 76 --- tests/test_step2_internal_action_interface.py | 599 ------------------ 
tests/test_step3_perform_ocr_action.py | 129 ---- tests/test_step4_understand_video_action.py | 177 ------ tests/test_understand_video_model.py | 49 -- tests/test_vlm_interface_json_mode.py | 53 -- tests/test_vlm_ocr_wrapper.py | 52 -- 9 files changed, 1748 deletions(-) delete mode 100644 tests/test_gemini_client_multimodal.py delete mode 100644 tests/test_step1_vlm_interface.py delete mode 100644 tests/test_step2_iai_methods.py delete mode 100644 tests/test_step2_internal_action_interface.py delete mode 100644 tests/test_step3_perform_ocr_action.py delete mode 100644 tests/test_step4_understand_video_action.py delete mode 100644 tests/test_understand_video_model.py delete mode 100644 tests/test_vlm_interface_json_mode.py delete mode 100644 tests/test_vlm_ocr_wrapper.py diff --git a/tests/test_gemini_client_multimodal.py b/tests/test_gemini_client_multimodal.py deleted file mode 100644 index 16c7b5fb..00000000 --- a/tests/test_gemini_client_multimodal.py +++ /dev/null @@ -1,49 +0,0 @@ -import base64 -import pytest -from unittest.mock import patch, MagicMock -from agent_core.core.llm.google_gemini_client import GeminiClient - -FAKE_RESPONSE = { - "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], - "usageMetadata": {"totalTokenCount": 10, "promptTokenCount": 8, "candidatesTokenCount": 2} -} - -@pytest.fixture -def client(): - return GeminiClient(api_key="fake-key") - -def test_single_image_produces_one_inlinedata_part(client): - """Passing image_bytes alone → exactly 1 inlineData in parts.""" - with patch.object(client, "_post_json", return_value=FAKE_RESPONSE) as mock_post: - client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes=b"img1") - # mock_post.call_args.args[1] is the payload - payload = mock_post.call_args.args[1] - parts = payload["contents"][0]["parts"] - inline_parts = [p for p in parts if "inlineData" in p] - assert len(inline_parts) == 1 - -def test_multi_image_produces_correct_count(client): - """Passing 
image_bytes_list of N images → exactly N inlineData parts.""" - with patch.object(client, "_post_json", return_value=FAKE_RESPONSE) as mock_post: - client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes_list=[b"a", b"b", b"c"]) - payload = mock_post.call_args.args[1] - parts = payload["contents"][0]["parts"] - inline_parts = [p for p in parts if "inlineData" in p] - assert len(inline_parts) == 3 - -def test_neither_image_raises_valueerror(client): - """Passing neither image_bytes nor image_bytes_list → ValueError.""" - with pytest.raises(ValueError): - client.generate_multimodal("gemini-2.0-flash", text="hi") - -def test_single_image_backwards_compat_response(client): - """Single-image call returns same response structure as before the refactor.""" - with patch.object(client, "_post_json", return_value=FAKE_RESPONSE): - result = client.generate_multimodal("gemini-2.0-flash", text="hi", image_bytes=b"img") - assert result["content"] == "ok" - assert result["tokens_used"] == 10 - -def test_generate_multimodal_multi_image_no_longer_exists(client): - """The old method must be gone.""" - assert not hasattr(client, "generate_multimodal_multi_image"), \ - "generate_multimodal_multi_image was not removed" diff --git a/tests/test_step1_vlm_interface.py b/tests/test_step1_vlm_interface.py deleted file mode 100644 index 88937c8c..00000000 --- a/tests/test_step1_vlm_interface.py +++ /dev/null @@ -1,564 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Step 1 Verification Suite — VLM Interface Extensions -Tests for: describe_image_ocr, describe_video_frames, _openai_describe_bytes_plain, - _gemini_describe_video_frames, _multi_frame_describe_fallback, - GeminiClient.generate_multimodal_multi_image - -Run with: - python -m pytest tests/test_step1_vlm_interface.py -v - -ALL tests must pass. Zero real API calls are made. -Zero imports of app.* are required — only agent_core. 
-""" - -from __future__ import annotations - -import base64 -import io -import os -import sys -import tempfile -import unittest -from pathlib import Path -from unittest.mock import MagicMock, patch, call - -# ───────────────────────────────────────────────────────────────── -# SECTION A: GeminiClient.generate_multimodal_multi_image -# ───────────────────────────────────────────────────────────────── - -class TestGeminiClientMultiImage(unittest.TestCase): - """ - VERIFY: GeminiClient.generate_multimodal_multi_image exists and - constructs the correct payload (one inlineData part per frame). - """ - - def _make_client(self): - from agent_core.core.llm.google_gemini_client import GeminiClient - client = GeminiClient.__new__(GeminiClient) - client._api_key = "fake-key" - client._api_base = "https://generativelanguage.googleapis.com" - client._api_version = "v1beta" - client._timeout = 30 - return client - - def test_method_accepts_list(self): - """generate_multimodal must accept image_bytes_list.""" - from agent_core.core.llm.google_gemini_client import GeminiClient - import inspect - sig = inspect.signature(GeminiClient.generate_multimodal) - self.assertIn("image_bytes_list", sig.parameters, - "FAIL: GeminiClient.generate_multimodal does not accept image_bytes_list.") - - def test_payload_contains_multiple_inline_data_parts(self): - """The API payload must contain one inlineData entry per frame passed in.""" - client = self._make_client() - fake_response = { - "candidates": [{"content": {"parts": [{"text": "video summary"}]}, "finishReason": "STOP"}], - "usageMetadata": {"totalTokenCount": 100, "promptTokenCount": 80, "candidatesTokenCount": 20}, - } - - captured_payload = {} - - def fake_post(path, payload): - captured_payload.update(payload) - return fake_response - - client._post_json = fake_post - - frame_bytes = [b"frame1_bytes", b"frame2_bytes", b"frame3_bytes"] - result = client.generate_multimodal( - "gemini-2.5-flash", - text="What is happening?", - 
image_bytes_list=frame_bytes, - system_prompt="Analyse these frames.", - temperature=0.5, - json_mode=False, - ) - - # Assert return shape - self.assertIn("content", result) - self.assertIn("tokens_used", result) - self.assertEqual(result["content"], "video summary") - - # Assert payload structure: must have text part + 3 inlineData parts - parts = captured_payload["contents"][0]["parts"] - inline_parts = [p for p in parts if "inlineData" in p] - text_parts = [p for p in parts if "text" in p] - - self.assertEqual(len(inline_parts), 3, - f"Expected 3 inlineData parts, got {len(inline_parts)}") - self.assertEqual(len(text_parts), 1, - f"Expected 1 text part, got {len(text_parts)}") - - # Assert each frame is correctly base64-encoded in the payload - for i, (part, raw) in enumerate(zip(inline_parts, frame_bytes)): - expected_b64 = base64.b64encode(raw).decode() - actual_b64 = part["inlineData"]["data"] - self.assertEqual(actual_b64, expected_b64, - f"Frame {i+1}: base64 mismatch in payload") - - def test_system_prompt_is_included(self): - """systemInstruction must be present in payload when system_prompt is given.""" - client = self._make_client() - fake_response = { - "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], - "usageMetadata": {"totalTokenCount": 10, "promptTokenCount": 8, "candidatesTokenCount": 2}, - } - captured = {} - client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] - - client.generate_multimodal( - "gemini-2.5-flash", - text="Describe", - image_bytes_list=[b"img"], - system_prompt="You are an expert.", - ) - self.assertIn("systemInstruction", captured, - "FAIL: systemInstruction missing from payload when system_prompt is provided") - - def test_no_system_prompt_omits_key(self): - """systemInstruction must be absent when system_prompt is None.""" - client = self._make_client() - fake_response = { - "candidates": [{"content": {"parts": [{"text": "ok"}]}, "finishReason": "STOP"}], - 
"usageMetadata": {"totalTokenCount": 5}, - } - captured = {} - client._post_json = lambda path, payload: (captured.update(payload), fake_response)[1] - - client.generate_multimodal( - "gemini-2.5-flash", - text="Describe", - image_bytes_list=[b"img"], - system_prompt=None, - ) - self.assertNotIn("systemInstruction", captured, - "FAIL: systemInstruction should be absent when no system_prompt is given") - - -# ───────────────────────────────────────────────────────────────── -# SECTION B: VLMInterface._openai_describe_bytes_plain -# ───────────────────────────────────────────────────────────────── - -class TestOpenAIDescribeBytesPlain(unittest.TestCase): - """ - VERIFY: _openai_describe_bytes_plain exists and does NOT set - response_format=json_object (that would break raw OCR text output). - """ - - def _make_vlm(self): - """Instantiate VLMInterface in deferred mode so no real API calls are made.""" - with patch("app.models.factory.ModelFactory.create") as mock_create: - mock_create.return_value = { - "model": "gpt-4o", - "client": MagicMock(), - "gemini_client": None, - "remote_url": None, - "anthropic_client": None, - "initialized": True, - "byteplus": None, - "provider": "openai", - } - from agent_core.core.impl.vlm.interface import VLMInterface - vlm = VLMInterface(provider="openai", deferred=True) - vlm.provider = "openai" - return vlm - - def test_method_exists(self): - """_openai_describe_bytes_plain must exist on VLMInterface.""" - from agent_core.core.impl.vlm.interface import VLMInterface - self.assertTrue( - hasattr(VLMInterface, "_openai_describe_bytes_plain"), - "FAIL: _openai_describe_bytes_plain not found on VLMInterface. " - "Add it to agent_core/core/impl/vlm/interface.py" - ) - - def test_no_response_format_json_object(self): - """ - CRITICAL: _openai_describe_bytes_plain must NOT pass - response_format={'type': 'json_object'} to the OpenAI client. - OCR returns raw text — json_object enforces a JSON wrapper and breaks it. 
- """ - vlm = self._make_vlm() - - mock_choice = MagicMock() - mock_choice.message.content = "Hello World\nLine 2" - mock_response = MagicMock() - mock_response.choices = [mock_choice] - mock_response.usage.prompt_tokens = 50 - mock_response.usage.completion_tokens = 20 - - vlm.client = MagicMock() - vlm.client.chat.completions.create.return_value = mock_response - - vlm._openai_describe_bytes_plain(b"fake_image_bytes", "sys prompt", "Extract text") - - call_kwargs = vlm.client.chat.completions.create.call_args[1] - self.assertNotIn("response_format", call_kwargs, - "FAIL: response_format is present in _openai_describe_bytes_plain. " - "Remove it — OCR must return raw text, not JSON.") - - def test_returns_dict_with_content_and_tokens(self): - """Must return dict with 'content' and 'tokens_used' keys.""" - vlm = self._make_vlm() - - mock_choice = MagicMock() - mock_choice.message.content = "Extracted: Invoice #1234" - mock_response = MagicMock() - mock_response.choices = [mock_choice] - mock_response.usage.prompt_tokens = 40 - mock_response.usage.completion_tokens = 15 - vlm.client = MagicMock() - vlm.client.chat.completions.create.return_value = mock_response - - result = vlm._openai_describe_bytes_plain(b"img", None, "Extract text") - - self.assertIsInstance(result, dict) - self.assertIn("content", result) - self.assertIn("tokens_used", result) - self.assertEqual(result["content"], "Extracted: Invoice #1234") - self.assertEqual(result["tokens_used"], 55) - - def test_max_tokens_is_at_least_4096(self): - """ - OCR may produce large amounts of text. max_tokens must be >= 4096. 
- """ - vlm = self._make_vlm() - mock_choice = MagicMock() - mock_choice.message.content = "text" - mock_response = MagicMock() - mock_response.choices = [mock_choice] - mock_response.usage.prompt_tokens = 10 - mock_response.usage.completion_tokens = 5 - vlm.client = MagicMock() - vlm.client.chat.completions.create.return_value = mock_response - - vlm._openai_describe_bytes_plain(b"img", None, "Extract text") - - call_kwargs = vlm.client.chat.completions.create.call_args[1] - max_tokens = call_kwargs.get("max_tokens", call_kwargs.get("max_completion_tokens", 0)) - self.assertGreaterEqual(max_tokens, 4096, - f"FAIL: max_tokens={max_tokens}. OCR needs at least 4096 to handle large text blocks.") - - -# ───────────────────────────────────────────────────────────────── -# SECTION C: VLMInterface.describe_image_ocr -# ───────────────────────────────────────────────────────────────── - -class TestDescribeImageOcr(unittest.TestCase): - """ - VERIFY: describe_image_ocr exists, routes to the correct provider branch, - uses an OCR-specific system prompt, and handles FileNotFoundError. - """ - - def _make_vlm_patched(self, provider="openai"): - with patch("app.models.factory.ModelFactory.create") as mock_create: - mock_create.return_value = { - "model": "gpt-4o", - "client": MagicMock(), - "gemini_client": None, - "remote_url": None, - "anthropic_client": None, - "initialized": True, - "byteplus": None, - "provider": provider, - } - from agent_core.core.impl.vlm.interface import VLMInterface - vlm = VLMInterface(provider=provider, deferred=True) - vlm.provider = provider - return vlm - - def test_method_exists(self): - from agent_core.core.impl.vlm.interface import VLMInterface - self.assertTrue( - hasattr(VLMInterface, "describe_image_ocr"), - "FAIL: describe_image_ocr not found on VLMInterface. 
" - "Add it to agent_core/core/impl/vlm/interface.py" - ) - - def test_raises_file_not_found_for_missing_path(self): - """Must raise FileNotFoundError when the image path does not exist.""" - vlm = self._make_vlm_patched() - with self.assertRaises(FileNotFoundError): - vlm.describe_image_ocr("/nonexistent/path/image.png") - - def test_routes_to_plain_method_for_openai(self): - """ - For provider='openai', describe_image_ocr must call - _openai_describe_bytes_plain (not _openai_describe_bytes). - This ensures json_object response format is not applied. - """ - vlm = self._make_vlm_patched(provider="openai") - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - f.write(b"fake_png_data") - tmp_path = f.name - - try: - vlm._openai_describe_bytes_plain = MagicMock( - return_value={"content": "INVOICE\nTotal: $100", "tokens_used": 30} - ) - vlm._openai_describe_bytes = MagicMock() - - result = vlm.describe_image_ocr(tmp_path) - - vlm._openai_describe_bytes_plain.assert_called_once() - vlm._openai_describe_bytes.assert_not_called() - self.assertEqual(result, "INVOICE\nTotal: $100") - finally: - os.unlink(tmp_path) - - def test_system_prompt_contains_ocr_keywords(self): - """ - The system prompt passed to the provider must contain OCR-specific - language ('OCR', 'extract', 'text') — not a generic description prompt. 
- """ - vlm = self._make_vlm_patched(provider="openai") - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - f.write(b"fake_png_data") - tmp_path = f.name - - try: - captured_sys_prompt = {} - - def capture_plain(image_bytes, sys_prompt, user_prompt): - captured_sys_prompt["sys"] = sys_prompt or "" - return {"content": "Hello", "tokens_used": 10} - - vlm._openai_describe_bytes_plain = capture_plain - vlm.describe_image_ocr(tmp_path) - - sys_lower = captured_sys_prompt.get("sys", "").lower() - self.assertTrue( - "ocr" in sys_lower or "extract" in sys_lower or "text" in sys_lower, - f"FAIL: OCR system prompt does not mention OCR/extraction. Got: '{captured_sys_prompt.get('sys')}'" - ) - finally: - os.unlink(tmp_path) - - def test_returns_string(self): - """describe_image_ocr must return a string, not a dict.""" - vlm = self._make_vlm_patched(provider="openai") - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - f.write(b"fake_png_data") - tmp_path = f.name - - try: - vlm._openai_describe_bytes_plain = MagicMock( - return_value={"content": "TEXT FROM IMAGE", "tokens_used": 20} - ) - result = vlm.describe_image_ocr(tmp_path) - self.assertIsInstance(result, str) - finally: - os.unlink(tmp_path) - - -# ───────────────────────────────────────────────────────────────── -# SECTION D: VLMInterface.describe_video_frames -# ───────────────────────────────────────────────────────────────── - -class TestDescribeVideoFrames(unittest.TestCase): - """ - VERIFY: describe_video_frames exists, handles missing file, - handles missing opencv gracefully, and calls the correct - provider path (Gemini native vs. fallback). 
- """ - - def _make_vlm_patched(self, provider="openai"): - with patch("app.models.factory.ModelFactory.create") as mock_create: - mock_create.return_value = { - "model": "gpt-4o", - "client": MagicMock(), - "gemini_client": None, - "remote_url": None, - "anthropic_client": None, - "initialized": True, - "byteplus": None, - "provider": provider, - } - from agent_core.core.impl.vlm.interface import VLMInterface - vlm = VLMInterface(provider=provider, deferred=True) - vlm.provider = provider - return vlm - - def test_method_exists(self): - from agent_core.core.impl.vlm.interface import VLMInterface - self.assertTrue( - hasattr(VLMInterface, "describe_video_frames"), - "FAIL: describe_video_frames not found on VLMInterface." - ) - - def test_raises_file_not_found_for_missing_video(self): - """Must raise FileNotFoundError when the video path does not exist.""" - vlm = self._make_vlm_patched() - with self.assertRaises(FileNotFoundError): - vlm.describe_video_frames("/nonexistent/video.mp4") - - def test_raises_runtime_error_when_opencv_missing(self): - """ - When opencv is not installed, describe_video_frames must raise - a RuntimeError with an actionable install message — not an ImportError. - This ensures a clean error surface for the user. - """ - vlm = self._make_vlm_patched() - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4_data") - tmp_path = f.name - - try: - with patch.dict(sys.modules, {"cv2": None}): - with self.assertRaises(RuntimeError) as ctx: - vlm.describe_video_frames(tmp_path) - self.assertIn("opencv", str(ctx.exception).lower(), - "FAIL: RuntimeError message must mention 'opencv' to guide the user.") - finally: - os.unlink(tmp_path) - - def test_gemini_uses_native_multi_image_method(self): - """ - For provider='gemini', describe_video_frames must call - _gemini_describe_video_frames (native multi-image path). - It must NOT fall back to the sequential per-frame fallback. 
- """ - vlm = self._make_vlm_patched(provider="gemini") - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4_data") - tmp_path = f.name - - try: - mock_cv2 = MagicMock() - mock_cap = MagicMock() - mock_cap.get.return_value = 30.0 - mock_cap.read.return_value = (True, MagicMock()) - mock_cv2.VideoCapture.return_value = mock_cap - mock_cv2.imencode.return_value = (True, MagicMock(tobytes=lambda: b"frame")) - - vlm._gemini_describe_video_frames = MagicMock(return_value="Gemini video summary") - vlm._multi_frame_describe_fallback = MagicMock(return_value="fallback summary") - - with patch.dict(sys.modules, {"cv2": mock_cv2}): - result = vlm.describe_video_frames(tmp_path, max_frames=2) - - vlm._gemini_describe_video_frames.assert_called_once() - vlm._multi_frame_describe_fallback.assert_not_called() - self.assertEqual(result, "Gemini video summary") - finally: - os.unlink(tmp_path) - - def test_non_gemini_uses_fallback(self): - """ - For provider='openai', describe_video_frames must call - _multi_frame_describe_fallback (sequential frame path). 
- """ - vlm = self._make_vlm_patched(provider="openai") - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4_data") - tmp_path = f.name - - try: - mock_cv2 = MagicMock() - mock_cap = MagicMock() - mock_cap.get.return_value = 30.0 - mock_cap.read.return_value = (True, MagicMock()) - mock_cv2.VideoCapture.return_value = mock_cap - mock_cv2.imencode.return_value = (True, MagicMock(tobytes=lambda: b"frame")) - - vlm._gemini_describe_video_frames = MagicMock(return_value="should not be called") - vlm._multi_frame_describe_fallback = MagicMock(return_value="OpenAI fallback summary") - - with patch.dict(sys.modules, {"cv2": mock_cv2}): - result = vlm.describe_video_frames(tmp_path, max_frames=2) - - vlm._multi_frame_describe_fallback.assert_called_once() - vlm._gemini_describe_video_frames.assert_not_called() - self.assertEqual(result, "OpenAI fallback summary") - finally: - os.unlink(tmp_path) - - -# ───────────────────────────────────────────────────────────────── -# SECTION E: Regression — existing describe_image still works -# ───────────────────────────────────────────────────────────────── - -class TestRegressionDescribeImage(unittest.TestCase): - """ - REGRESSION GUARD: Ensure existing describe_image and describe_image_bytes - are untouched and still produce the same output contract. - This confirms Step 1 did not break any existing functionality. 
- """ - - def _make_vlm_patched(self): - with patch("app.models.factory.ModelFactory.create") as mock_create: - mock_create.return_value = { - "model": "gpt-4o", - "client": MagicMock(), - "gemini_client": None, - "remote_url": None, - "anthropic_client": None, - "initialized": True, - "byteplus": None, - "provider": "openai", - } - from agent_core.core.impl.vlm.interface import VLMInterface - vlm = VLMInterface(provider="openai", deferred=True) - vlm.provider = "openai" - return vlm - - def test_describe_image_still_raises_on_missing_file(self): - """describe_image must still raise FileNotFoundError (unchanged).""" - vlm = self._make_vlm_patched() - with self.assertRaises(FileNotFoundError): - vlm.describe_image("/does/not/exist.png") - - def test_describe_image_bytes_returns_string(self): - """describe_image_bytes must still return a plain string.""" - vlm = self._make_vlm_patched() - - mock_choice = MagicMock() - mock_choice.message.content = '{"content": "A cat"}' - mock_response = MagicMock() - mock_response.choices = [mock_choice] - mock_response.usage.prompt_tokens = 10 - mock_response.usage.completion_tokens = 5 - mock_response.usage.prompt_tokens_details = None # Prevent MagicMock leak - vlm.client = MagicMock() - vlm.client.chat.completions.create.return_value = mock_response - - result = vlm.describe_image_bytes(b"fake_image", user_prompt="Describe this image.") - self.assertIsInstance(result, str) - - def test_describe_image_bytes_uses_json_response_format(self): - """ - REGRESSION: The ORIGINAL describe_image_bytes must still use - response_format=json_object (this is the existing contract). - It should NOT be affected by the plain-text OCR variant. 
- """ - vlm = self._make_vlm_patched() - - mock_choice = MagicMock() - mock_choice.message.content = '{"content": "A dog"}' - mock_response = MagicMock() - mock_response.choices = [mock_choice] - mock_response.usage.prompt_tokens = 10 - mock_response.usage.completion_tokens = 5 - mock_response.usage.prompt_tokens_details = None # Prevent MagicMock leak - vlm.client = MagicMock() - vlm.client.chat.completions.create.return_value = mock_response - - vlm.describe_image_bytes(b"fake_image", user_prompt="Describe this.") - - call_kwargs = vlm.client.chat.completions.create.call_args[1] - # Original describe_image_bytes should still request json_object - self.assertIn("response_format", call_kwargs, - "REGRESSION: describe_image_bytes lost response_format=json_object. " - "Only the new _openai_describe_bytes_plain should omit it.") - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/tests/test_step2_iai_methods.py b/tests/test_step2_iai_methods.py deleted file mode 100644 index 415689eb..00000000 --- a/tests/test_step2_iai_methods.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -import unittest -from unittest.mock import MagicMock, patch, mock_open -import os -from datetime import datetime -import asyncio - -# Mocking the constants before import if necessary, but app.config should be fine -import sys -from unittest.mock import PropertyMock - -class TestStep2InternalInterface(unittest.TestCase): - def setUp(self): - # We need to mock InternalActionInterface dependencies - self.iai_patcher = patch('app.internal_action_interface.InternalActionInterface', autospec=True) - # However, we want to test the ACTUAL methods on InternalActionInterface - # So we import it and patch its class attributes - - from app.internal_action_interface import InternalActionInterface - self.iai = InternalActionInterface - self.iai.vlm_interface = MagicMock() - self.iai.state_manager = MagicMock() - self.iai.ui_adapter = MagicMock() - - @patch('os.path.join', 
side_effect=lambda *args: "/".join(args)) - @patch('builtins.open', new_callable=mock_open) - @patch('app.internal_action_interface.AGENT_WORKSPACE_ROOT', "/mock/workspace") - def test_perform_ocr_saves_file_and_returns_dict(self, mock_file, mock_join): - # Setup - self.iai.vlm_interface.describe_image_ocr.return_value = "Extracted Text Content" - - # Execute - result = self.iai.perform_ocr("some_image.jpg", user_prompt="Test Prompt") - - # Verify call to VLM - self.iai.vlm_interface.describe_image_ocr.assert_called_once_with("some_image.jpg", user_prompt="Test Prompt") - - # Verify file saving - mock_file.assert_called_once() - handle = mock_file() - handle.write.assert_called_once_with("Extracted Text Content") - - # Verify return dict - self.assertEqual(result['status'], 'success') - self.assertTrue(result['file_saved']) - self.assertIn('ocr_result_', result['file_path']) - self.assertIn('OCR complete', result['summary']) - - @patch('os.path.join', side_effect=lambda *args: "/".join(args)) - @patch('builtins.open', new_callable=mock_open) - @patch('app.internal_action_interface.AGENT_WORKSPACE_ROOT', "/mock/workspace") - def test_understand_video_saves_file_and_returns_dict(self, mock_file, mock_join): - # Setup - self.iai.vlm_interface.describe_video_frames.return_value = "Video Summary Content" - - # Execute - result = self.iai.understand_video("some_video.mp4", query="What happens?") - - # Verify call to VLM - self.iai.vlm_interface.describe_video_frames.assert_called_once_with( - "some_video.mp4", query="What happens?", max_frames=8 - ) - - # Verify file saving - mock_file.assert_called_once() - handle = mock_file() - handle.write.assert_called_once_with("Video Summary Content") - - # Verify return dict - self.assertEqual(result['status'], 'success') - self.assertTrue(result['file_saved']) - self.assertIn('video_summary_', result['file_path']) - self.assertEqual(result['summary'], "Video Summary Content") - -if __name__ == '__main__': - unittest.main() diff 
--git a/tests/test_step2_internal_action_interface.py b/tests/test_step2_internal_action_interface.py deleted file mode 100644 index 8e8e8d0c..00000000 --- a/tests/test_step2_internal_action_interface.py +++ /dev/null @@ -1,599 +0,0 @@ -# tests/test_step2_internal_action_interface.py -# -*- coding: utf-8 -*- -""" -Step 2 Verification Suite — InternalActionInterface Extensions -Tests for: perform_ocr() and understand_video() classmethods - -Run with: - python -m pytest tests/test_step2_internal_action_interface.py -v - -ALL tests must pass. Zero real API calls. Zero real file system dependency -outside of tempfile — all workspace writes use a patched AGENT_WORKSPACE_ROOT. - -PREREQUISITE: Step 1 tests must already be passing before running these. -""" - -from __future__ import annotations - -import os -import sys -import tempfile -import unittest -from pathlib import Path -from unittest.mock import MagicMock, patch, PropertyMock - - -# ───────────────────────────────────────────────────────────────── -# HELPERS -# ───────────────────────────────────────────────────────────────── - -def _reset_iai(): - """Reset InternalActionInterface class-level state between tests.""" - from app.internal_action_interface import InternalActionInterface - InternalActionInterface.vlm_interface = None - InternalActionInterface.llm_interface = None - InternalActionInterface.task_manager = None - InternalActionInterface.state_manager = None - - -def _inject_mock_vlm(mock_vlm=None): - """Inject a mock VLMInterface into InternalActionInterface.""" - from app.internal_action_interface import InternalActionInterface - if mock_vlm is None: - mock_vlm = MagicMock() - InternalActionInterface.vlm_interface = mock_vlm - return mock_vlm - - -# ───────────────────────────────────────────────────────────────── -# SECTION A: Method Existence & Signatures -# ───────────────────────────────────────────────────────────────── - -class TestMethodExistence(unittest.TestCase): - """ - VERIFY: Both new 
classmethods exist and are classmethods (not staticmethods - or instance methods), matching the pattern of describe_image(). - """ - - def test_perform_ocr_exists(self): - from app.internal_action_interface import InternalActionInterface - self.assertTrue( - hasattr(InternalActionInterface, "perform_ocr"), - "FAIL: InternalActionInterface.perform_ocr not found. " - "Add it to app/internal_action_interface.py" - ) - - def test_understand_video_exists(self): - from app.internal_action_interface import InternalActionInterface - self.assertTrue( - hasattr(InternalActionInterface, "understand_video"), - "FAIL: InternalActionInterface.understand_video not found. " - "Add it to app/internal_action_interface.py" - ) - - def test_perform_ocr_is_classmethod(self): - """perform_ocr must be a classmethod, not a staticmethod or instance method.""" - from app.internal_action_interface import InternalActionInterface - method = InternalActionInterface.__dict__.get("perform_ocr") - self.assertIsInstance( - method, classmethod, - "FAIL: perform_ocr must be a @classmethod (matching describe_image pattern)." - ) - - def test_understand_video_is_classmethod(self): - """understand_video must be a classmethod.""" - from app.internal_action_interface import InternalActionInterface - method = InternalActionInterface.__dict__.get("understand_video") - self.assertIsInstance( - method, classmethod, - "FAIL: understand_video must be a @classmethod." - ) - - def test_perform_ocr_accepts_image_path(self): - """perform_ocr must accept image_path as its first positional argument.""" - import inspect - from app.internal_action_interface import InternalActionInterface - sig = inspect.signature(InternalActionInterface.perform_ocr) - params = list(sig.parameters.keys()) - self.assertIn("image_path", params, - f"FAIL: perform_ocr must accept 'image_path'. 
Got params: {params}") - - def test_understand_video_accepts_video_path_and_query(self): - """understand_video must accept video_path and query parameters.""" - import inspect - from app.internal_action_interface import InternalActionInterface - sig = inspect.signature(InternalActionInterface.understand_video) - params = list(sig.parameters.keys()) - self.assertIn("video_path", params, - f"FAIL: understand_video must accept 'video_path'. Got: {params}") - self.assertIn("query", params, - f"FAIL: understand_video must accept 'query'. Got: {params}") - - def tearDown(self): - _reset_iai() - - -# ───────────────────────────────────────────────────────────────── -# SECTION B: VLM Guard — RuntimeError when not initialized -# ───────────────────────────────────────────────────────────────── - -class TestVLMGuard(unittest.TestCase): - """ - VERIFY: Both methods raise RuntimeError when vlm_interface is None, - matching the guard pattern of describe_image() and describe_screen(). - """ - - def setUp(self): - _reset_iai() - - def test_perform_ocr_raises_when_vlm_not_initialized(self): - from app.internal_action_interface import InternalActionInterface - # vlm_interface is None (default state) - with self.assertRaises(RuntimeError) as ctx: - InternalActionInterface.perform_ocr("/some/image.png") - self.assertIn( - "VLMInterface", str(ctx.exception), - "FAIL: RuntimeError message must mention 'VLMInterface' to match " - "existing error message pattern in describe_image/describe_screen." - ) - - def test_understand_video_raises_when_vlm_not_initialized(self): - from app.internal_action_interface import InternalActionInterface - with self.assertRaises(RuntimeError) as ctx: - InternalActionInterface.understand_video("/some/video.mp4") - self.assertIn( - "VLMInterface", str(ctx.exception), - "FAIL: RuntimeError message must mention 'VLMInterface'." 
- ) - - def tearDown(self): - _reset_iai() - - -# ───────────────────────────────────────────────────────────────── -# SECTION C: perform_ocr — Return Contract -# ───────────────────────────────────────────────────────────────── - -class TestPerformOcrReturnContract(unittest.TestCase): - """ - VERIFY: perform_ocr returns a dict with the correct keys, - correct types, and saves extracted text to AGENT_WORKSPACE_ROOT. - """ - - def setUp(self): - _reset_iai() - self.tmp_workspace = tempfile.mkdtemp() - - def _run_perform_ocr(self, ocr_text="Hello World\nLine 2\nLine 3"): - """Helper: run perform_ocr with a temp image and mocked VLM.""" - mock_vlm = MagicMock() - mock_vlm.describe_image_ocr.return_value = ocr_text - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - f.write(b"fake_png") - image_path = f.name - - try: - with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - result = InternalActionInterface.perform_ocr(image_path) - finally: - os.unlink(image_path) - - return result, mock_vlm - - def test_returns_dict(self): - result, _ = self._run_perform_ocr() - self.assertIsInstance(result, dict, - "FAIL: perform_ocr must return a dict, not a plain string.") - - def test_return_dict_has_required_keys(self): - """Must have: status, summary, file_path, file_saved.""" - result, _ = self._run_perform_ocr() - for key in ("status", "summary", "file_path", "file_saved"): - self.assertIn(key, result, - f"FAIL: perform_ocr return dict is missing key '{key}'.") - - def test_status_is_success_on_happy_path(self): - result, _ = self._run_perform_ocr() - self.assertEqual(result["status"], "success", - "FAIL: status must be 'success' on happy path.") - - def test_file_saved_is_true(self): - result, _ = self._run_perform_ocr() - self.assertTrue(result["file_saved"], - "FAIL: file_saved must be True after successful OCR.") - - def 
test_file_path_exists_on_disk(self): - """The file_path in the result must be a real file that was written.""" - result, _ = self._run_perform_ocr("Invoice #1234\nTotal: $99.99") - self.assertTrue( - os.path.isfile(result["file_path"]), - f"FAIL: file_path '{result['file_path']}' does not exist on disk. " - "perform_ocr must write the extracted text to workspace." - ) - - def test_file_content_matches_ocr_output(self): - """The saved file must contain the raw OCR text exactly as returned by VLM.""" - ocr_text = "CONFIDENTIAL\nProject Alpha\nBudget: $1,000,000" - result, _ = self._run_perform_ocr(ocr_text) - - with open(result["file_path"], "r", encoding="utf-8") as f: - saved_content = f.read() - - self.assertEqual(saved_content, ocr_text, - "FAIL: Saved file content does not match OCR output. " - "The raw text must be written verbatim — no modification.") - - def test_file_saved_to_agent_workspace_root(self): - """The saved file must be inside AGENT_WORKSPACE_ROOT, not a temp dir.""" - result, _ = self._run_perform_ocr() - self.assertTrue( - result["file_path"].startswith(self.tmp_workspace), - f"FAIL: File saved to '{result['file_path']}' but expected " - f"it to be inside AGENT_WORKSPACE_ROOT='{self.tmp_workspace}'. " - "Do not hardcode paths — use AGENT_WORKSPACE_ROOT from app.config." - ) - - def test_file_has_txt_extension(self): - """Output file must be a .txt file (readable by do_chat_with_attachments).""" - result, _ = self._run_perform_ocr() - self.assertTrue( - result["file_path"].endswith(".txt"), - f"FAIL: Output file must have .txt extension. Got: '{result['file_path']}'" - ) - - def test_summary_does_not_contain_full_text(self): - """ - Summary must be a SHORT description, not the full OCR text. - The whole point of saving to file is to keep the agent context lean. - If summary == full text, the TUI flooding problem is not solved. 
- """ - long_text = "Line\n" * 200 # 200 lines, definitely not a summary - result, _ = self._run_perform_ocr(long_text) - self.assertLess( - len(result["summary"]), len(long_text), - "FAIL: summary contains the full OCR text. It must be a short " - "description (e.g. 'OCR complete: 200 lines, 1000 characters') " - "to prevent context window flooding." - ) - - def test_summary_mentions_line_or_char_count(self): - """Summary must be informative — mention lines or characters extracted.""" - result, _ = self._run_perform_ocr("Hello\nWorld") - summary_lower = result["summary"].lower() - has_count_info = ( - "line" in summary_lower or - "char" in summary_lower or - "word" in summary_lower or - "extracted" in summary_lower - ) - self.assertTrue(has_count_info, - f"FAIL: summary '{result['summary']}' is not informative. " - "It must mention lines/characters extracted so the agent knows what happened.") - - def test_calls_describe_image_ocr_not_describe_image(self): - """ - CRITICAL: Must call vlm_interface.describe_image_ocr(), NOT - vlm_interface.describe_image(). Using describe_image is exactly - the existing bug that Issue #155 was filed for. 
- """ - mock_vlm = MagicMock() - mock_vlm.describe_image_ocr.return_value = "Some text" - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - f.write(b"fake_png") - image_path = f.name - - try: - with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - InternalActionInterface.perform_ocr(image_path) - finally: - os.unlink(image_path) - - mock_vlm.describe_image_ocr.assert_called_once() - mock_vlm.describe_image.assert_not_called() - - def test_user_prompt_forwarded_to_vlm(self): - """Optional user_prompt must be passed through to vlm.describe_image_ocr.""" - mock_vlm = MagicMock() - mock_vlm.describe_image_ocr.return_value = "text" - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - f.write(b"fake_png") - image_path = f.name - - try: - with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - InternalActionInterface.perform_ocr(image_path, user_prompt="Focus on prices only.") - finally: - os.unlink(image_path) - - call_kwargs = mock_vlm.describe_image_ocr.call_args - # Check the user_prompt was forwarded (positional or keyword) - all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) - self.assertIn("Focus on prices only.", all_args, - "FAIL: user_prompt was not forwarded to vlm_interface.describe_image_ocr(). 
" - "The OCR method must pass user_prompt through.") - - def tearDown(self): - _reset_iai() - import shutil - shutil.rmtree(self.tmp_workspace, ignore_errors=True) - - -# ───────────────────────────────────────────────────────────────── -# SECTION D: understand_video — Return Contract -# ───────────────────────────────────────────────────────────────── - -class TestUnderstandVideoReturnContract(unittest.TestCase): - """ - VERIFY: understand_video returns a correct dict, saves summary to - workspace, truncates summary to prevent TUI flooding, and - forwards all parameters correctly to vlm_interface. - """ - - def setUp(self): - _reset_iai() - self.tmp_workspace = tempfile.mkdtemp() - - def _run_understand_video(self, summary_text="The video shows a presentation.", query=None, max_frames=8): - mock_vlm = MagicMock() - mock_vlm.describe_video_frames.return_value = summary_text - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4") - video_path = f.name - - try: - with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - result = InternalActionInterface.understand_video( - video_path, query=query, max_frames=max_frames - ) - finally: - os.unlink(video_path) - - return result, mock_vlm - - def test_returns_dict(self): - result, _ = self._run_understand_video() - self.assertIsInstance(result, dict, - "FAIL: understand_video must return a dict.") - - def test_return_dict_has_required_keys(self): - result, _ = self._run_understand_video() - for key in ("status", "summary", "file_path", "file_saved"): - self.assertIn(key, result, - f"FAIL: understand_video return dict is missing key '{key}'.") - - def test_status_is_success_on_happy_path(self): - result, _ = self._run_understand_video() - self.assertEqual(result["status"], "success") - - def test_file_saved_is_true(self): - result, _ = 
self._run_understand_video() - self.assertTrue(result["file_saved"]) - - def test_file_path_exists_on_disk(self): - result, _ = self._run_understand_video("A meeting recording with 3 participants.") - self.assertTrue( - os.path.isfile(result["file_path"]), - f"FAIL: file_path '{result['file_path']}' does not exist. " - "understand_video must write the full summary to workspace." - ) - - def test_full_summary_saved_to_file(self): - """The full, untruncated summary must be in the saved file.""" - long_summary = "Frame description. " * 100 # deliberately long - result, _ = self._run_understand_video(long_summary) - - with open(result["file_path"], "r", encoding="utf-8") as f: - saved = f.read() - - self.assertEqual(saved, long_summary, - "FAIL: The saved file must contain the FULL summary. " - "Truncation only applies to the return dict's 'summary' key.") - - def test_summary_in_return_dict_is_truncated_for_long_content(self): - """ - For long video summaries, the 'summary' key in the returned dict - must be truncated (<=500 chars + ellipsis) to prevent context flooding. - The full content is in the file — the dict summary is just a preview. - """ - long_summary = "X" * 2000 - result, _ = self._run_understand_video(long_summary) - self.assertLessEqual( - len(result["summary"]), 510, # 500 + len("...") - f"FAIL: summary in return dict is {len(result['summary'])} chars. " - "Must be truncated to ~500 chars to prevent agent context flooding." - ) - - def test_short_summary_not_truncated(self): - """Short summaries (<=500 chars) must be returned as-is without ellipsis.""" - short_summary = "A quick 30-second tutorial on Python loops." 
- result, _ = self._run_understand_video(short_summary) - self.assertEqual(result["summary"], short_summary, - "FAIL: Short summary was unexpectedly truncated or modified.") - - def test_file_saved_to_agent_workspace_root(self): - result, _ = self._run_understand_video() - self.assertTrue( - result["file_path"].startswith(self.tmp_workspace), - f"FAIL: File saved to wrong location. Expected inside " - f"AGENT_WORKSPACE_ROOT='{self.tmp_workspace}'." - ) - - def test_file_has_txt_extension(self): - result, _ = self._run_understand_video() - self.assertTrue(result["file_path"].endswith(".txt"), - "FAIL: Output file must be .txt") - - def test_video_filename_distinct_from_ocr_filename(self): - """ - Video summary files must have a distinct filename prefix from OCR files - to avoid confusion in workspace (e.g. 'video_summary_' vs 'ocr_result_'). - """ - result, _ = self._run_understand_video() - filename = os.path.basename(result["file_path"]) - self.assertFalse( - filename.startswith("ocr_"), - f"FAIL: Video summary file '{filename}' starts with 'ocr_'. " - "Video and OCR output files must have distinct prefixes." 
- ) - - def test_calls_describe_video_frames_not_describe_image(self): - """Must delegate to vlm_interface.describe_video_frames(), not describe_image().""" - mock_vlm = MagicMock() - mock_vlm.describe_video_frames.return_value = "summary" - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4") - video_path = f.name - - try: - with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - InternalActionInterface.understand_video(video_path) - finally: - os.unlink(video_path) - - mock_vlm.describe_video_frames.assert_called_once() - mock_vlm.describe_image.assert_not_called() - - def test_query_forwarded_to_vlm(self): - """The query parameter must be forwarded to describe_video_frames.""" - mock_vlm = MagicMock() - mock_vlm.describe_video_frames.return_value = "answer" - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4") - video_path = f.name - - try: - with patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - InternalActionInterface.understand_video(video_path, query="What is on slide 3?") - finally: - os.unlink(video_path) - - call_kwargs = mock_vlm.describe_video_frames.call_args - all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) - self.assertIn("What is on slide 3?", all_args, - "FAIL: query not forwarded to describe_video_frames.") - - def test_max_frames_forwarded_to_vlm(self): - """max_frames must be forwarded to describe_video_frames.""" - mock_vlm = MagicMock() - mock_vlm.describe_video_frames.return_value = "summary" - _inject_mock_vlm(mock_vlm) - - with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: - f.write(b"fake_mp4") - video_path = f.name - - try: - with 
patch("app.internal_action_interface.AGENT_WORKSPACE_ROOT", self.tmp_workspace): - from app.internal_action_interface import InternalActionInterface - InternalActionInterface.understand_video(video_path, max_frames=12) - finally: - os.unlink(video_path) - - call_kwargs = mock_vlm.describe_video_frames.call_args - all_args = list(call_kwargs.args) + list(call_kwargs.kwargs.values()) - self.assertIn(12, all_args, - "FAIL: max_frames=12 was not forwarded to describe_video_frames.") - - def tearDown(self): - _reset_iai() - import shutil - shutil.rmtree(self.tmp_workspace, ignore_errors=True) - - -# ───────────────────────────────────────────────────────────────── -# SECTION E: Regression — existing methods untouched -# ───────────────────────────────────────────────────────────────── - -class TestRegressionExistingMethods(unittest.TestCase): - """ - REGRESSION GUARD: Ensure describe_image(), describe_screen(), - and initialize() still work exactly as before Step 2. - """ - - def setUp(self): - _reset_iai() - self.tmp_workspace = tempfile.mkdtemp() - - def test_describe_image_still_raises_when_vlm_none(self): - from app.internal_action_interface import InternalActionInterface - with self.assertRaises(RuntimeError): - InternalActionInterface.describe_image("/any/path.png") - - def test_describe_image_still_returns_string(self): - """describe_image must still return str (not dict) — contract unchanged.""" - mock_vlm = MagicMock() - mock_vlm.describe_image.return_value = "A photo of a cat." 
- _inject_mock_vlm(mock_vlm) - - from app.internal_action_interface import InternalActionInterface - result = InternalActionInterface.describe_image("/fake/path.png") - self.assertIsInstance(result, str, - "REGRESSION: describe_image must still return str, not dict.") - self.assertEqual(result, "A photo of a cat.") - - def test_initialize_still_sets_vlm_interface(self): - """initialize() must still correctly set vlm_interface class attribute.""" - from app.internal_action_interface import InternalActionInterface - - mock_vlm = MagicMock() - mock_llm = MagicMock() - mock_task = MagicMock() - mock_state = MagicMock() - - InternalActionInterface.initialize( - llm_interface=mock_llm, - task_manager=mock_task, - state_manager=mock_state, - vlm_interface=mock_vlm, - ) - - self.assertIs(InternalActionInterface.vlm_interface, mock_vlm, - "REGRESSION: initialize() no longer sets vlm_interface correctly.") - - def test_new_methods_do_not_shadow_describe_image(self): - """ - perform_ocr and understand_video must not accidentally override - or shadow describe_image on the class. 
- """ - from app.internal_action_interface import InternalActionInterface - # All three must coexist independently - self.assertTrue(hasattr(InternalActionInterface, "describe_image")) - self.assertTrue(hasattr(InternalActionInterface, "perform_ocr")) - self.assertTrue(hasattr(InternalActionInterface, "understand_video")) - - # describe_image must still delegate to vlm.describe_image - mock_vlm = MagicMock() - mock_vlm.describe_image.return_value = "original image description" - _inject_mock_vlm(mock_vlm) - - result = InternalActionInterface.describe_image("/fake.png") - mock_vlm.describe_image.assert_called_once() - # describe_image_ocr must NOT have been called - mock_vlm.describe_image_ocr.assert_not_called() - - def tearDown(self): - _reset_iai() - import shutil - shutil.rmtree(self.tmp_workspace, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/tests/test_step3_perform_ocr_action.py b/tests/test_step3_perform_ocr_action.py deleted file mode 100644 index 31a55f44..00000000 --- a/tests/test_step3_perform_ocr_action.py +++ /dev/null @@ -1,129 +0,0 @@ -# tests/test_step3_perform_ocr_action.py -""" -Step 3 Verification: perform_ocr action layer tests. -Tests input validation, simulated mode, schema contract, -and bridge delegation — without making real VLM calls. -""" -import os -import pytest -from unittest.mock import patch, MagicMock - - -# ── Helpers ──────────────────────────────────────────────────────────────── - -def load_action(image_path: str, simulated: bool = False) -> dict: - """Import and invoke the action directly.""" - from app.data.action.perform_ocr import execute - return execute({"image_path": image_path, "simulated_mode": simulated}) - - -# ── 1. 
Input Validation ──────────────────────────────────────────────────── - -class TestInputValidation: - - def test_missing_image_path_key(self): - from app.data.action.perform_ocr import execute - result = execute({}) - assert result["status"] == "error" - assert "image_path" in result["message"].lower() - - def test_empty_image_path_string(self): - result = load_action("") - assert result["status"] == "error" - - def test_nonexistent_file_path(self): - result = load_action("/tmp/does_not_exist_12345.png") - assert result["status"] == "error" - assert "not found" in result["message"].lower() or \ - "does not exist" in result["message"].lower() or \ - result["status"] == "error" - - def test_path_is_directory_not_file(self, tmp_path): - result = load_action(str(tmp_path)) # directory, not a file - assert result["status"] == "error" - - -# ── 2. Simulated Mode ────────────────────────────────────────────────────── - -class TestSimulatedMode: - - def test_simulated_mode_returns_success(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - result = load_action(str(fake_image), simulated=True) - assert result["status"] == "success" - - def test_simulated_mode_makes_no_vlm_call(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - with patch("app.internal_action_interface.InternalActionInterface.perform_ocr") as mock_ocr: - load_action(str(fake_image), simulated=True) - mock_ocr.assert_not_called() - - def test_simulated_mode_result_is_string(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - result = load_action(str(fake_image), simulated=True) - # In simulated mode, summary or message might be the string - assert isinstance(result.get("summary") or result.get("message"), str) - - -# ── 3. 
Schema Contract ───────────────────────────────────────────────────── - -class TestSchemaContract: - - def test_success_response_has_required_keys(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - mock_return = {"status": "success", "text": "Invoice #1234", "file_path": "/tmp/ocr.txt"} - with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", - return_value=mock_return): - result = load_action(str(fake_image)) - assert "status" in result - assert result["status"] in ("success", "error") - - def test_error_response_has_message(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", - side_effect=RuntimeError("VLM unavailable")): - result = load_action(str(fake_image)) - assert result["status"] == "error" - assert "message" in result - assert len(result["message"]) > 0 - - def test_success_exposes_extracted_text(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - mock_return = {"status": "success", "text": "Hello World", "file_path": "/tmp/ocr.txt"} - with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", - return_value=mock_return): - result = load_action(str(fake_image)) - # The action must surface the text somewhere — either in result["text"], - # result["result"], or result["message"] - combined = str(result) - assert "Hello World" in combined - - -# ── 4. 
Bridge Delegation ─────────────────────────────────────────────────── - -class TestBridgeDelegation: - - def test_delegates_correct_image_path_to_bridge(self, tmp_path): - fake_image = tmp_path / "receipt.png" - fake_image.write_bytes(b"fake_png_bytes") - mock_return = {"status": "success", "text": "some text", "file_path": "/tmp/x.txt"} - with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", - return_value=mock_return) as mock_bridge: - load_action(str(fake_image)) - called_path = mock_bridge.call_args[0][0] - assert called_path == str(fake_image) - - def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): - fake_image = tmp_path / "test.png" - fake_image.write_bytes(b"fake_png_bytes") - with patch("app.internal_action_interface.InternalActionInterface.perform_ocr", - side_effect=RuntimeError("InternalActionInterface not initialized with VLMInterface.")): - result = load_action(str(fake_image)) - assert result["status"] == "error" - assert "message" in result diff --git a/tests/test_step4_understand_video_action.py b/tests/test_step4_understand_video_action.py deleted file mode 100644 index 77883701..00000000 --- a/tests/test_step4_understand_video_action.py +++ /dev/null @@ -1,177 +0,0 @@ -# tests/test_step4_understand_video_action.py - -import pytest -from unittest.mock import patch - -def load_action(video_path: str, query: str = "", simulated: bool = False) -> dict: - from app.data.action.understand_video import execute - return execute({ - "video_path": video_path, - "query": query, - "simulated_mode": simulated, - }) - - -class TestInputValidation: - - def test_missing_video_path_key(self): - from app.data.action.understand_video import execute - result = execute({}) - assert result["status"] == "error" - assert "video_path" in result["message"].lower() - - def test_empty_video_path_string(self): - result = load_action("") - assert result["status"] == "error" - - def test_nonexistent_file_path(self): - result = 
load_action("/tmp/does_not_exist_98765.mp4") - assert result["status"] == "error" - - def test_path_is_directory_not_file(self, tmp_path): - result = load_action(str(tmp_path)) - assert result["status"] == "error" - - -class TestSimulatedMode: - - def test_simulated_mode_returns_success(self, tmp_path): - fake_video = tmp_path / "test.mp4" - fake_video.write_bytes(b"fake_video_bytes") - result = load_action(str(fake_video), simulated=True) - assert result["status"] == "success" - - def test_simulated_mode_makes_no_vlm_call(self, tmp_path): - fake_video = tmp_path / "test.mp4" - fake_video.write_bytes(b"fake_video_bytes") - with patch("app.internal_action_interface.InternalActionInterface.understand_video") as mock_bridge: - load_action(str(fake_video), simulated=True) - mock_bridge.assert_not_called() - - -class TestSchemaContract: - - def test_success_response_has_required_keys(self, tmp_path): - fake_video = tmp_path / "clip.mp4" - fake_video.write_bytes(b"fake_video_bytes") - - mock_return = { - "status": "success", - "summary": "A person walks into a room.", - "preview": "A person walks...", - "file_path": "/tmp/video_summary.txt", - } - with patch("app.internal_action_interface.InternalActionInterface.understand_video", - return_value=mock_return): - result = load_action(str(fake_video), query="What happens?") - - assert result["status"] == "success" - for key in ("summary", "file_path"): - assert key in result - - def test_error_response_has_message(self, tmp_path): - fake_video = tmp_path / "clip.mp4" - fake_video.write_bytes(b"fake_video_bytes") - - with patch("app.internal_action_interface.InternalActionInterface.understand_video", - side_effect=RuntimeError("VLM unavailable")): - result = load_action(str(fake_video)) - - assert result["status"] == "error" - assert "message" in result - assert len(result["message"]) > 0 - - -class TestBridgeDelegation: - - def test_delegates_correct_video_path_and_query(self, tmp_path): - fake_video = tmp_path / 
"scene.mp4" - fake_video.write_bytes(b"fake_video_bytes") - - mock_return = { - "status": "success", - "summary": "Some summary", - "preview": "Some...", - "file_path": "/tmp/video_summary.txt", - } - with patch("app.internal_action_interface.InternalActionInterface.understand_video", - return_value=mock_return) as mock_bridge: - load_action(str(fake_video), query="Who is present?") - - # Verify bridge call arguments - # In some versions of mock, call_args[0] is positional args - called_args = mock_bridge.call_args[0] - assert called_args[0] == str(fake_video) - assert mock_bridge.call_args[1].get('query') == "Who is present?" or called_args[1] == "Who is present?" - - def test_bridge_vlm_not_initialized_returns_error(self, tmp_path): - fake_video = tmp_path / "scene.mp4" - fake_video.write_bytes(b"fake_video_bytes") - - with patch("app.internal_action_interface.InternalActionInterface.understand_video", - side_effect=RuntimeError("InternalActionInterface not initialized with VLMInterface.")): - result = load_action(str(fake_video)) - - assert result["status"] == "error" - assert "message" in result - - -class TestPrimaryGeminiPath: - - @patch("app.config.get_api_key") - @patch("google.generativeai.upload_file") - @patch("google.generativeai.GenerativeModel") - @patch("google.generativeai.delete_file") - def test_gemini_path_success(self, mock_delete, mock_generative_model, mock_upload, mock_get_api_key, tmp_path): - from unittest.mock import MagicMock - mock_get_api_key.return_value = "fake_google_key" - - mock_file = MagicMock() - mock_file.name = "fake_video_name" - mock_file.state.name = "ACTIVE" - mock_upload.return_value = mock_file - - mock_model_instance = MagicMock() - mock_response = MagicMock() - mock_response.text = "This is a native Gemini summary of the video. 
" * 20 - mock_model_instance.generate_content.return_value = mock_response - mock_generative_model.return_value = mock_model_instance - - fake_video = tmp_path / "gemini_clip.mp4" - fake_video.write_bytes(b"fake_video_bytes") - - with patch("app.config.AGENT_WORKSPACE_ROOT", str(tmp_path)): - result = load_action(str(fake_video), query="What happens?") - - assert result["status"] == "success" - assert "native Gemini summary" in result["summary"] - assert result["file_saved"] is True - - mock_upload.assert_called_once() - mock_model_instance.generate_content.assert_called_once() - mock_delete.assert_called_once_with(mock_file.name) - - @patch("app.config.get_api_key") - @patch("app.internal_action_interface.InternalActionInterface.understand_video") - def test_fallback_path_triggered(self, mock_bridge, mock_get_api_key, tmp_path): - mock_get_api_key.return_value = None - - mock_return = { - "status": "success", - "summary": "Fallback summary", - "file_path": "/tmp/fallback.txt", - "file_saved": True, - "message": "" - } - mock_bridge.return_value = mock_return - - fake_video = tmp_path / "fallback_clip.mp4" - fake_video.write_bytes(b"fake_video_bytes") - - result = load_action(str(fake_video), query="Fallback query") - - assert result["status"] == "success" - assert result["summary"] == "Fallback summary" - mock_bridge.assert_called_once() - called_args = mock_bridge.call_args[0] - assert called_args[0] == str(fake_video) diff --git a/tests/test_understand_video_model.py b/tests/test_understand_video_model.py deleted file mode 100644 index cd69dc3b..00000000 --- a/tests/test_understand_video_model.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest -from unittest.mock import patch, MagicMock -import os - -def test_understand_video_uses_configured_model(): - """understand_video must use get_vlm_model(), not hardcode gemini-1.5-pro.""" - mock_file = MagicMock() - mock_file.state.name = "ACTIVE" - mock_model_instance = MagicMock() - 
mock_model_instance.generate_content.return_value = MagicMock(text="video summary") - - with patch("os.path.isfile", return_value=True), \ - patch("app.config.get_api_key", return_value="fake-key"), \ - patch("app.config.get_vlm_model", return_value="gemini-2.0-flash") as mock_get_model, \ - patch("google.generativeai.configure"), \ - patch("google.generativeai.upload_file", return_value=mock_file), \ - patch("google.generativeai.get_file", return_value=mock_file), \ - patch("google.generativeai.GenerativeModel", return_value=mock_model_instance) as mock_gm, \ - patch("google.generativeai.delete_file"), \ - patch("builtins.open", MagicMock()), \ - patch("app.config.AGENT_WORKSPACE_ROOT", "/tmp"): - from app.data.action.understand_video import understand_video - understand_video({"video_path": "/fake/video.mp4"}) - called_model_name = mock_gm.call_args[0][0] - assert called_model_name == "gemini-2.0-flash", \ - f"Expected gemini-2.0-flash from config, got {called_model_name}" - -def test_understand_video_falls_back_when_config_missing(): - """If get_vlm_model() returns None, fall back to gemini-1.5-pro.""" - mock_file = MagicMock() - mock_file.state.name = "ACTIVE" - mock_model_instance = MagicMock() - mock_model_instance.generate_content.return_value = MagicMock(text="summary") - - with patch("os.path.isfile", return_value=True), \ - patch("app.config.get_api_key", return_value="fake-key"), \ - patch("app.config.get_vlm_model", return_value=None), \ - patch("google.generativeai.configure"), \ - patch("google.generativeai.upload_file", return_value=mock_file), \ - patch("google.generativeai.get_file", return_value=mock_file), \ - patch("google.generativeai.GenerativeModel", return_value=mock_model_instance) as mock_gm, \ - patch("google.generativeai.delete_file"), \ - patch("builtins.open", MagicMock()), \ - patch("app.config.AGENT_WORKSPACE_ROOT", "/tmp"): - from app.data.action.understand_video import understand_video - understand_video({"video_path": 
"/fake/video.mp4"}) - called_model_name = mock_gm.call_args[0][0] - assert called_model_name == "gemini-1.5-pro", \ - f"Expected fallback gemini-1.5-pro, got {called_model_name}" diff --git a/tests/test_vlm_interface_json_mode.py b/tests/test_vlm_interface_json_mode.py deleted file mode 100644 index 3d38495c..00000000 --- a/tests/test_vlm_interface_json_mode.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest -from unittest.mock import MagicMock, patch -from agent_core.core.impl.vlm.interface import VLMInterface - -PLAIN_RESPONSE = {"content": "raw text output", "tokens_used": 5} - -def _make_vlm(provider="openai"): - """Create a VLMInterface with mocked internals.""" - with patch("agent_core.core.impl.vlm.interface.VLMInterface.__init__", return_value=None): - vlm = VLMInterface.__new__(VLMInterface) - vlm.provider = provider - vlm.model = "gpt-4o" - vlm.temperature = 0.5 - vlm._get_token_count = lambda: 0 - vlm._set_token_count = lambda x: None - vlm._report_usage = None - vlm._CODE_BLOCK_RE = VLMInterface._CODE_BLOCK_RE - return vlm - -def test_openai_json_mode_true_uses_json_method(): - """describe_image_bytes with json_mode=True (default) → _openai_describe_bytes.""" - vlm = _make_vlm("openai") - vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) - vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) - vlm.describe_image_bytes(b"img", json_mode=True) - vlm._openai_describe_bytes.assert_called_once() - vlm._openai_describe_bytes_plain.assert_not_called() - -def test_openai_json_mode_false_uses_plain_method(): - """describe_image_bytes with json_mode=False → _openai_describe_bytes_plain.""" - vlm = _make_vlm("openai") - vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) - vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) - vlm.describe_image_bytes(b"img", json_mode=False) - vlm._openai_describe_bytes_plain.assert_called_once() - vlm._openai_describe_bytes.assert_not_called() - -def 
test_default_json_mode_is_true(): - """Calling describe_image_bytes without json_mode defaults to True (no regression).""" - vlm = _make_vlm("openai") - vlm._openai_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) - vlm._openai_describe_bytes_plain = MagicMock(return_value=PLAIN_RESPONSE) - vlm.describe_image_bytes(b"img") # no json_mode arg - vlm._openai_describe_bytes.assert_called_once() - -def test_gemini_unaffected_by_json_mode(): - """Gemini always uses _gemini_describe_bytes regardless of json_mode flag.""" - vlm = _make_vlm("gemini") - vlm._gemini_describe_bytes = MagicMock(return_value=PLAIN_RESPONSE) - vlm.describe_image_bytes(b"img", json_mode=False) - vlm._gemini_describe_bytes.assert_called_once() - vlm.describe_image_bytes(b"img", json_mode=True) - assert vlm._gemini_describe_bytes.call_count == 2 diff --git a/tests/test_vlm_ocr_wrapper.py b/tests/test_vlm_ocr_wrapper.py deleted file mode 100644 index 8e12846d..00000000 --- a/tests/test_vlm_ocr_wrapper.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -import pytest -import tempfile -from unittest.mock import MagicMock, patch -from agent_core.core.impl.vlm.interface import VLMInterface - -def _make_vlm(): - with patch("agent_core.core.impl.vlm.interface.VLMInterface.__init__", return_value=None): - vlm = VLMInterface.__new__(VLMInterface) - vlm.provider = "openai" - vlm.model = "gpt-4o" - vlm.temperature = 0.5 - vlm._get_token_count = lambda: 0 - vlm._set_token_count = lambda x: None - vlm._report_usage = None - vlm._CODE_BLOCK_RE = VLMInterface._CODE_BLOCK_RE - return vlm - -def test_ocr_calls_describe_image_bytes_with_json_mode_false(tmp_path): - """describe_image_ocr must delegate to describe_image_bytes with json_mode=False.""" - img_file = tmp_path / "test.png" - img_file.write_bytes(b"fakeimgdata") - vlm = _make_vlm() - vlm.describe_image_bytes = MagicMock(return_value="extracted text") - vlm.describe_image_ocr(str(img_file)) - call_kwargs = vlm.describe_image_bytes.call_args.kwargs - assert 
call_kwargs.get("json_mode") == False, \ - "describe_image_ocr must pass json_mode=False" - -def test_ocr_system_prompt_is_ocr_focused(tmp_path): - """The system prompt passed by OCR must mention OCR/extraction, not description.""" - img_file = tmp_path / "test.png" - img_file.write_bytes(b"fakeimgdata") - vlm = _make_vlm() - vlm.describe_image_bytes = MagicMock(return_value="text") - vlm.describe_image_ocr(str(img_file)) - sys_prompt = vlm.describe_image_bytes.call_args.kwargs.get("system_prompt", "") - assert "OCR" in sys_prompt or "extract" in sys_prompt.lower() - -def test_ocr_no_provider_routing_in_method(): - """describe_image_ocr source must not contain a provider routing switch.""" - import inspect - src = inspect.getsource(VLMInterface.describe_image_ocr) - assert "self.provider" not in src, \ - "describe_image_ocr still contains provider routing — refactor incomplete" - assert "elif self.provider ==" not in src, \ - "describe_image_ocr still contains provider routing switch" - -def test_ocr_raises_on_missing_file(): - vlm = _make_vlm() - with pytest.raises(FileNotFoundError): - vlm.describe_image_ocr("/nonexistent/path/image.png") From 670e5f0414afc6f8980ab35155173ade0497b416 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Tue, 21 Apr 2026 16:18:11 +0530 Subject: [PATCH 20/30] update test deletion and future annotations for tests, prepared for final merge --- agent_core/core/impl/llm/cache/config.py | 2 +- agent_core/core/impl/llm/cache/metrics.py | 2 +- agent_core/core/impl/llm/errors.py | 2 +- agent_core/core/impl/llm/types.py | 2 +- agent_core/core/impl/memory/manager.py | 1 - agent_core/core/llm/google_gemini_client.py | 2 +- agent_core/core/protocols/trigger.py | 1 - agent_core/core/task/todo.py | 1 - agent_core/core/trigger.py | 1 - agent_core/decorators/log_events.py | 2 +- agent_core/decorators/profiler.py | 2 +- app/browser/interface.py | 1 - app/cli/interface.py | 1 - app/credentials/handlers.py | 2 +- app/external_comms/base.py | 1 - 
app/external_comms/config.py | 1 - app/external_comms/integration_settings.py | 1 - app/external_comms/manager.py | 1 - app/external_comms/platforms/google_workspace.py | 1 - app/external_comms/platforms/linkedin.py | 1 - app/external_comms/platforms/notion.py | 1 - app/external_comms/platforms/outlook.py | 1 - app/external_comms/platforms/slack.py | 1 - app/external_comms/platforms/telegram_bot.py | 1 - app/external_comms/platforms/telegram_user.py | 1 - app/external_comms/platforms/twitter.py | 1 - app/external_comms/platforms/whatsapp_bridge/client.py | 1 - app/external_comms/platforms/whatsapp_business.py | 1 - app/external_comms/platforms/whatsapp_web.py | 1 - app/external_comms/registry.py | 1 - app/ui_layer/commands/base.py | 1 - app/ui_layer/commands/builtin/agent_command.py | 1 - app/ui_layer/commands/builtin/clear.py | 1 - app/ui_layer/commands/builtin/cred.py | 1 - app/ui_layer/commands/builtin/exit.py | 1 - app/ui_layer/commands/builtin/help.py | 1 - app/ui_layer/commands/builtin/integrations.py | 1 - app/ui_layer/commands/builtin/mcp.py | 1 - app/ui_layer/commands/builtin/menu.py | 1 - app/ui_layer/commands/builtin/provider.py | 1 - app/ui_layer/commands/builtin/reset.py | 1 - app/ui_layer/commands/builtin/skill.py | 1 - app/ui_layer/commands/builtin/skill_invoke.py | 1 - app/ui_layer/commands/builtin/update.py | 1 - app/ui_layer/commands/executor.py | 1 - app/ui_layer/events/event_bus.py | 1 - app/ui_layer/events/event_types.py | 1 - app/ui_layer/events/transformer.py | 1 - app/ui_layer/onboarding/controller.py | 1 - app/ui_layer/themes/base.py | 1 - app/usage/action_storage.py | 1 - app/usage/chat_storage.py | 1 - app/usage/reporter.py | 1 - app/usage/session_storage.py | 1 - app/usage/storage.py | 1 - app/usage/task_storage.py | 1 - diagnostic/action_diagnose.py | 1 - diagnostic/environments/create_and_run_python_script.py | 1 - diagnostic/environments/create_pdf_file.py | 1 - diagnostic/environments/find_file_by_name.py | 1 - 
diagnostic/environments/find_in_file_content.py | 1 - diagnostic/environments/ignore.py | 1 - diagnostic/environments/keyboard_input.py | 1 - diagnostic/environments/keyboard_typing.py | 1 - diagnostic/environments/list_folder.py | 1 - diagnostic/environments/mouse_drag.py | 1 - diagnostic/environments/mouse_move.py | 1 - diagnostic/environments/open_application.py | 1 - diagnostic/environments/read_pdf_file.py | 1 - diagnostic/environments/scroll.py | 1 - diagnostic/environments/send_http_requests.py | 1 - diagnostic/environments/send_message.py | 1 - diagnostic/environments/shell_exec_windows.py | 1 - diagnostic/environments/switch_to_cli_mode.py | 1 - diagnostic/environments/trace_mouse.py | 1 - diagnostic/environments/view_image.py | 1 - diagnostic/environments/window_close.py | 1 - diagnostic/framework.py | 1 - mkdocs/scripts/gen_ref_pages.py | 1 - skills/model-usage/scripts/model_usage.py | 1 - skills/stock-market-pro/scripts/options_links.py | 1 - 81 files changed, 8 insertions(+), 81 deletions(-) diff --git a/agent_core/core/impl/llm/cache/config.py b/agent_core/core/impl/llm/cache/config.py index f958738c..57efc4f8 100644 --- a/agent_core/core/impl/llm/cache/config.py +++ b/agent_core/core/impl/llm/cache/config.py @@ -5,7 +5,7 @@ Shared cache configuration for all LLM providers. """ -from __future__ import annotations + import os from dataclasses import dataclass diff --git a/agent_core/core/impl/llm/cache/metrics.py b/agent_core/core/impl/llm/cache/metrics.py index 8f390825..d0af1f85 100644 --- a/agent_core/core/impl/llm/cache/metrics.py +++ b/agent_core/core/impl/llm/cache/metrics.py @@ -5,7 +5,7 @@ Cache effectiveness metrics tracking for all LLM providers. 
""" -from __future__ import annotations + import logging from dataclasses import dataclass diff --git a/agent_core/core/impl/llm/errors.py b/agent_core/core/impl/llm/errors.py index e310f686..f0cd6655 100644 --- a/agent_core/core/impl/llm/errors.py +++ b/agent_core/core/impl/llm/errors.py @@ -6,7 +6,7 @@ Uses proper exception types and HTTP status codes - no string pattern matching. """ -from __future__ import annotations + from typing import Optional diff --git a/agent_core/core/impl/llm/types.py b/agent_core/core/impl/llm/types.py index 1b942525..a4925ccb 100644 --- a/agent_core/core/impl/llm/types.py +++ b/agent_core/core/impl/llm/types.py @@ -3,7 +3,7 @@ Shared types and enums for the LLM interface module. """ -from __future__ import annotations + from enum import Enum diff --git a/agent_core/core/impl/memory/manager.py b/agent_core/core/impl/memory/manager.py index 5b491864..745bd2d4 100644 --- a/agent_core/core/impl/memory/manager.py +++ b/agent_core/core/impl/memory/manager.py @@ -13,7 +13,6 @@ the full content directly. This keeps retrieval lightweight. """ -from __future__ import annotations import hashlib import re diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index 36ae2f21..d8ce43d7 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -7,7 +7,7 @@ SDK emits during import/initialisation (e.g. the ``ALTS creds ignored`` message that was polluting the CLI output). """ -from __future__ import annotations + import base64 import logging diff --git a/agent_core/core/protocols/trigger.py b/agent_core/core/protocols/trigger.py index e6afc8aa..d563b369 100644 --- a/agent_core/core/protocols/trigger.py +++ b/agent_core/core/protocols/trigger.py @@ -2,7 +2,6 @@ """ Protocol definition for TriggerQueue. 
""" -from __future__ import annotations from typing import List, Protocol, Optional, runtime_checkable diff --git a/agent_core/core/task/todo.py b/agent_core/core/task/todo.py index d51afa92..89da9891 100644 --- a/agent_core/core/task/todo.py +++ b/agent_core/core/task/todo.py @@ -6,7 +6,6 @@ todo list mechanism similar to Claude Code's TodoWrite tool. """ -from __future__ import annotations import uuid from dataclasses import dataclass, field from typing import Dict, Any, Optional, Literal diff --git a/agent_core/core/trigger.py b/agent_core/core/trigger.py index c4970ec8..23a0666f 100644 --- a/agent_core/core/trigger.py +++ b/agent_core/core/trigger.py @@ -4,7 +4,6 @@ Trigger dataclass - the entry point for all agent reactions. """ -from __future__ import annotations from dataclasses import dataclass, field from typing import Dict, Any, Optional diff --git a/agent_core/decorators/log_events.py b/agent_core/decorators/log_events.py index a544091d..305b96a1 100644 --- a/agent_core/decorators/log_events.py +++ b/agent_core/decorators/log_events.py @@ -9,7 +9,7 @@ {id}, {name}, {args}, {kwargs}, {result}, {exception}, {duration_ms} """ -from __future__ import annotations + import logging import time diff --git a/agent_core/decorators/profiler.py b/agent_core/decorators/profiler.py index 73dc344f..0c083605 100644 --- a/agent_core/decorators/profiler.py +++ b/agent_core/decorators/profiler.py @@ -29,7 +29,7 @@ Set "auto_save_interval" to N to save after every N loops (0 = only at exit). """ -from __future__ import annotations + import atexit import asyncio diff --git a/app/browser/interface.py b/app/browser/interface.py index e61d596c..02551fc4 100644 --- a/app/browser/interface.py +++ b/app/browser/interface.py @@ -5,7 +5,6 @@ the centralized UI layer components. 
""" -from __future__ import annotations from typing import TYPE_CHECKING diff --git a/app/cli/interface.py b/app/cli/interface.py index 8af4ce66..2f783a69 100644 --- a/app/cli/interface.py +++ b/app/cli/interface.py @@ -6,7 +6,6 @@ the centralized UI layer components. """ -from __future__ import annotations from typing import TYPE_CHECKING diff --git a/app/credentials/handlers.py b/app/credentials/handlers.py index e4c2c40e..2f488a19 100644 --- a/app/credentials/handlers.py +++ b/app/credentials/handlers.py @@ -1,5 +1,5 @@ """All integration credential handlers + registry.""" -from __future__ import annotations + import base64 import hashlib diff --git a/app/external_comms/base.py b/app/external_comms/base.py index cab67f78..e7006662 100644 --- a/app/external_comms/base.py +++ b/app/external_comms/base.py @@ -5,7 +5,6 @@ Base classes for platform clients. """ -from __future__ import annotations import logging from abc import ABC, abstractmethod diff --git a/app/external_comms/config.py b/app/external_comms/config.py index bec98efc..54d70a63 100644 --- a/app/external_comms/config.py +++ b/app/external_comms/config.py @@ -5,7 +5,6 @@ Configuration for external communication channels (WhatsApp, Telegram). """ -from __future__ import annotations import json import logging diff --git a/app/external_comms/integration_settings.py b/app/external_comms/integration_settings.py index 25dfd390..35eb6d0b 100644 --- a/app/external_comms/integration_settings.py +++ b/app/external_comms/integration_settings.py @@ -1,5 +1,4 @@ """Integration settings management — shared by browser and TUI frontends.""" -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/manager.py b/app/external_comms/manager.py index f54fc5a9..4c9bcda3 100644 --- a/app/external_comms/manager.py +++ b/app/external_comms/manager.py @@ -6,7 +6,6 @@ Uses the platform registry to discover and start all platforms that support listening. 
""" -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/google_workspace.py b/app/external_comms/platforms/google_workspace.py index bb1e9976..fddfd564 100644 --- a/app/external_comms/platforms/google_workspace.py +++ b/app/external_comms/platforms/google_workspace.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Google Workspace client — Gmail + Calendar + Drive via httpx.""" -from __future__ import annotations import asyncio import base64 diff --git a/app/external_comms/platforms/linkedin.py b/app/external_comms/platforms/linkedin.py index 865f2517..53d0f5d9 100644 --- a/app/external_comms/platforms/linkedin.py +++ b/app/external_comms/platforms/linkedin.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """LinkedIn REST API v2 client — direct HTTP via httpx.""" -from __future__ import annotations import time from dataclasses import dataclass diff --git a/app/external_comms/platforms/notion.py b/app/external_comms/platforms/notion.py index d05974d5..b873d073 100644 --- a/app/external_comms/platforms/notion.py +++ b/app/external_comms/platforms/notion.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Notion API client — direct HTTP via httpx.""" -from __future__ import annotations from dataclasses import dataclass from typing import Any, Dict, List, Optional diff --git a/app/external_comms/platforms/outlook.py b/app/external_comms/platforms/outlook.py index 139421df..933ce8c0 100644 --- a/app/external_comms/platforms/outlook.py +++ b/app/external_comms/platforms/outlook.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Outlook email client — Microsoft Graph API via httpx.""" -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/slack.py b/app/external_comms/platforms/slack.py index efb848ee..5634a86c 100644 --- a/app/external_comms/platforms/slack.py +++ b/app/external_comms/platforms/slack.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Slack API client — direct HTTP via 
httpx.""" -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/telegram_bot.py b/app/external_comms/platforms/telegram_bot.py index bc895b4b..9928b803 100644 --- a/app/external_comms/platforms/telegram_bot.py +++ b/app/external_comms/platforms/telegram_bot.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Telegram Bot API client — direct HTTP via httpx.""" -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/telegram_user.py b/app/external_comms/platforms/telegram_user.py index d80e068b..3aa4fdc8 100644 --- a/app/external_comms/platforms/telegram_user.py +++ b/app/external_comms/platforms/telegram_user.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Telegram MTProto (user account) client — uses Telethon with StringSession.""" -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/twitter.py b/app/external_comms/platforms/twitter.py index 9fc76143..f05cba9b 100644 --- a/app/external_comms/platforms/twitter.py +++ b/app/external_comms/platforms/twitter.py @@ -6,7 +6,6 @@ mention triggers to those containing a specific keyword. """ -from __future__ import annotations import asyncio import hashlib diff --git a/app/external_comms/platforms/whatsapp_bridge/client.py b/app/external_comms/platforms/whatsapp_bridge/client.py index 905d7f40..62c1a525 100644 --- a/app/external_comms/platforms/whatsapp_bridge/client.py +++ b/app/external_comms/platforms/whatsapp_bridge/client.py @@ -6,7 +6,6 @@ sending commands and receiving events via stdin/stdout JSON lines. 
""" -from __future__ import annotations import asyncio import json diff --git a/app/external_comms/platforms/whatsapp_business.py b/app/external_comms/platforms/whatsapp_business.py index 52057bee..384e7a27 100644 --- a/app/external_comms/platforms/whatsapp_business.py +++ b/app/external_comms/platforms/whatsapp_business.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """WhatsApp Business Cloud API client — direct HTTP via httpx.""" -from __future__ import annotations from dataclasses import dataclass from typing import Any, Dict, List, Optional diff --git a/app/external_comms/platforms/whatsapp_web.py b/app/external_comms/platforms/whatsapp_web.py index 3ce612c4..75a1b628 100644 --- a/app/external_comms/platforms/whatsapp_web.py +++ b/app/external_comms/platforms/whatsapp_web.py @@ -9,7 +9,6 @@ ``app.external_comms.platforms.whatsapp_bridge.client``. """ -from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/registry.py b/app/external_comms/registry.py index c55ed745..433559c9 100644 --- a/app/external_comms/registry.py +++ b/app/external_comms/registry.py @@ -5,7 +5,6 @@ Simple registry of platform clients. 
""" -from __future__ import annotations import logging from typing import Dict, Optional, Type diff --git a/app/ui_layer/commands/base.py b/app/ui_layer/commands/base.py index c63c7253..6c7aa063 100644 --- a/app/ui_layer/commands/base.py +++ b/app/ui_layer/commands/base.py @@ -1,6 +1,5 @@ """Base command class and result type.""" -from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass, field diff --git a/app/ui_layer/commands/builtin/agent_command.py b/app/ui_layer/commands/builtin/agent_command.py index 0d3bf7e0..eaba2939 100644 --- a/app/ui_layer/commands/builtin/agent_command.py +++ b/app/ui_layer/commands/builtin/agent_command.py @@ -1,6 +1,5 @@ """Wrapper for agent-provided commands.""" -from __future__ import annotations from typing import TYPE_CHECKING, List diff --git a/app/ui_layer/commands/builtin/clear.py b/app/ui_layer/commands/builtin/clear.py index bf6d4796..550d736e 100644 --- a/app/ui_layer/commands/builtin/clear.py +++ b/app/ui_layer/commands/builtin/clear.py @@ -1,6 +1,5 @@ """Clear command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/cred.py b/app/ui_layer/commands/builtin/cred.py index d9809136..4a1428b5 100644 --- a/app/ui_layer/commands/builtin/cred.py +++ b/app/ui_layer/commands/builtin/cred.py @@ -1,6 +1,5 @@ """Credential management command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/exit.py b/app/ui_layer/commands/builtin/exit.py index 5cad002c..2c47ec2c 100644 --- a/app/ui_layer/commands/builtin/exit.py +++ b/app/ui_layer/commands/builtin/exit.py @@ -1,6 +1,5 @@ """Exit command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/help.py b/app/ui_layer/commands/builtin/help.py index 99030c72..77c6a68d 100644 --- a/app/ui_layer/commands/builtin/help.py +++ 
b/app/ui_layer/commands/builtin/help.py @@ -1,6 +1,5 @@ """Help command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/integrations.py b/app/ui_layer/commands/builtin/integrations.py index b924da95..c50a2490 100644 --- a/app/ui_layer/commands/builtin/integrations.py +++ b/app/ui_layer/commands/builtin/integrations.py @@ -5,7 +5,6 @@ share the same logic and side-effects (e.g. platform-listener startup). """ -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/mcp.py b/app/ui_layer/commands/builtin/mcp.py index da8cc203..cf8de2bd 100644 --- a/app/ui_layer/commands/builtin/mcp.py +++ b/app/ui_layer/commands/builtin/mcp.py @@ -1,6 +1,5 @@ """MCP (Model Context Protocol) command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/menu.py b/app/ui_layer/commands/builtin/menu.py index 27c5ad5d..f6f8f28a 100644 --- a/app/ui_layer/commands/builtin/menu.py +++ b/app/ui_layer/commands/builtin/menu.py @@ -1,6 +1,5 @@ """Menu command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/provider.py b/app/ui_layer/commands/builtin/provider.py index e9c1d9b7..9be63d13 100644 --- a/app/ui_layer/commands/builtin/provider.py +++ b/app/ui_layer/commands/builtin/provider.py @@ -1,6 +1,5 @@ """Provider command implementation.""" -from __future__ import annotations import os from typing import List diff --git a/app/ui_layer/commands/builtin/reset.py b/app/ui_layer/commands/builtin/reset.py index c31d218a..fc042bee 100644 --- a/app/ui_layer/commands/builtin/reset.py +++ b/app/ui_layer/commands/builtin/reset.py @@ -1,6 +1,5 @@ """Reset command implementation.""" -from __future__ import annotations import asyncio from typing import List diff --git a/app/ui_layer/commands/builtin/skill.py b/app/ui_layer/commands/builtin/skill.py 
index 2e6e2207..90cfecb2 100644 --- a/app/ui_layer/commands/builtin/skill.py +++ b/app/ui_layer/commands/builtin/skill.py @@ -1,6 +1,5 @@ """Skill command implementation.""" -from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/skill_invoke.py b/app/ui_layer/commands/builtin/skill_invoke.py index 0c077e00..b9c7e0e4 100644 --- a/app/ui_layer/commands/builtin/skill_invoke.py +++ b/app/ui_layer/commands/builtin/skill_invoke.py @@ -1,6 +1,5 @@ """Skill invocation command - allows invoking skills as slash commands.""" -from __future__ import annotations from typing import List, TYPE_CHECKING diff --git a/app/ui_layer/commands/builtin/update.py b/app/ui_layer/commands/builtin/update.py index 81156848..dae409d6 100644 --- a/app/ui_layer/commands/builtin/update.py +++ b/app/ui_layer/commands/builtin/update.py @@ -1,6 +1,5 @@ """Update command implementation.""" -from __future__ import annotations import asyncio from typing import List diff --git a/app/ui_layer/commands/executor.py b/app/ui_layer/commands/executor.py index 475e8ad5..e002b5be 100644 --- a/app/ui_layer/commands/executor.py +++ b/app/ui_layer/commands/executor.py @@ -1,6 +1,5 @@ """Command executor for parsing and executing commands.""" -from __future__ import annotations from typing import TYPE_CHECKING diff --git a/app/ui_layer/events/event_bus.py b/app/ui_layer/events/event_bus.py index 54f0ef3a..a8c4b5f3 100644 --- a/app/ui_layer/events/event_bus.py +++ b/app/ui_layer/events/event_bus.py @@ -1,6 +1,5 @@ """Publish/subscribe event bus for UI events.""" -from __future__ import annotations import asyncio from collections import defaultdict diff --git a/app/ui_layer/events/event_types.py b/app/ui_layer/events/event_types.py index 7dbd9c25..516645a6 100644 --- a/app/ui_layer/events/event_types.py +++ b/app/ui_layer/events/event_types.py @@ -1,6 +1,5 @@ """UI Event types and data structures.""" -from __future__ import annotations from dataclasses import dataclass, 
field from datetime import datetime diff --git a/app/ui_layer/events/transformer.py b/app/ui_layer/events/transformer.py index 3bca0d10..f4b96be4 100644 --- a/app/ui_layer/events/transformer.py +++ b/app/ui_layer/events/transformer.py @@ -1,6 +1,5 @@ """Transform agent events to UI events.""" -from __future__ import annotations from datetime import datetime from typing import Optional, Any, TYPE_CHECKING diff --git a/app/ui_layer/onboarding/controller.py b/app/ui_layer/onboarding/controller.py index 04b6631f..ced24261 100644 --- a/app/ui_layer/onboarding/controller.py +++ b/app/ui_layer/onboarding/controller.py @@ -1,6 +1,5 @@ """Onboarding flow controller.""" -from __future__ import annotations from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type from dataclasses import dataclass, field diff --git a/app/ui_layer/themes/base.py b/app/ui_layer/themes/base.py index b7cc7967..c1acd4f0 100644 --- a/app/ui_layer/themes/base.py +++ b/app/ui_layer/themes/base.py @@ -1,6 +1,5 @@ """Theme base classes and protocols.""" -from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/app/usage/action_storage.py b/app/usage/action_storage.py index 1c41c154..7e839389 100644 --- a/app/usage/action_storage.py +++ b/app/usage/action_storage.py @@ -6,7 +6,6 @@ Provides local persistence for action history across agent restarts. """ -from __future__ import annotations import logging import sqlite3 diff --git a/app/usage/chat_storage.py b/app/usage/chat_storage.py index da85aa3e..da5b3981 100644 --- a/app/usage/chat_storage.py +++ b/app/usage/chat_storage.py @@ -6,7 +6,6 @@ Provides local persistence for chat history across agent restarts. 
""" -from __future__ import annotations import json import logging diff --git a/app/usage/reporter.py b/app/usage/reporter.py index 3a06c0d5..5254bb8d 100644 --- a/app/usage/reporter.py +++ b/app/usage/reporter.py @@ -6,7 +6,6 @@ Adapts the WhiteCollarAgent UsageReporter pattern for local SQLite storage. """ -from __future__ import annotations import asyncio import logging diff --git a/app/usage/session_storage.py b/app/usage/session_storage.py index 9eac006c..9edb36be 100644 --- a/app/usage/session_storage.py +++ b/app/usage/session_storage.py @@ -7,7 +7,6 @@ event context can be restored. """ -from __future__ import annotations import json import logging diff --git a/app/usage/storage.py b/app/usage/storage.py index a2be57d7..14b1ae13 100644 --- a/app/usage/storage.py +++ b/app/usage/storage.py @@ -6,7 +6,6 @@ Provides local persistence for LLM/VLM token usage tracking. """ -from __future__ import annotations import json import logging diff --git a/app/usage/task_storage.py b/app/usage/task_storage.py index a5e59f6a..4b65dd01 100644 --- a/app/usage/task_storage.py +++ b/app/usage/task_storage.py @@ -6,7 +6,6 @@ Provides local persistence for task execution history. 
""" -from __future__ import annotations import json import logging diff --git a/diagnostic/action_diagnose.py b/diagnostic/action_diagnose.py index 010c3371..39368257 100644 --- a/diagnostic/action_diagnose.py +++ b/diagnostic/action_diagnose.py @@ -1,5 +1,4 @@ """Diagnostic tool for validating action implementations.""" -from __future__ import annotations import argparse import json diff --git a/diagnostic/environments/create_and_run_python_script.py b/diagnostic/environments/create_and_run_python_script.py index 777f74ad..05b37949 100644 --- a/diagnostic/environments/create_and_run_python_script.py +++ b/diagnostic/environments/create_and_run_python_script.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "create and run python script" action.""" -from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/create_pdf_file.py b/diagnostic/environments/create_pdf_file.py index 45987c9d..1e9f8a63 100644 --- a/diagnostic/environments/create_pdf_file.py +++ b/diagnostic/environments/create_pdf_file.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "create pdf file" action.""" -from __future__ import annotations import types from pathlib import Path diff --git a/diagnostic/environments/find_file_by_name.py b/diagnostic/environments/find_file_by_name.py index 4739e591..91d82a6d 100644 --- a/diagnostic/environments/find_file_by_name.py +++ b/diagnostic/environments/find_file_by_name.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "find file by name" action.""" -from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/find_in_file_content.py b/diagnostic/environments/find_in_file_content.py index feb4410a..5d9450f9 100644 --- a/diagnostic/environments/find_in_file_content.py +++ b/diagnostic/environments/find_in_file_content.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "find in file content" 
action.""" -from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/ignore.py b/diagnostic/environments/ignore.py index f89b434f..55901b5c 100644 --- a/diagnostic/environments/ignore.py +++ b/diagnostic/environments/ignore.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "ignore" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/keyboard_input.py b/diagnostic/environments/keyboard_input.py index 8ea9422e..b2d34492 100644 --- a/diagnostic/environments/keyboard_input.py +++ b/diagnostic/environments/keyboard_input.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "keyboard input" action.""" -from __future__ import annotations import types from typing import Any, List, Mapping, Tuple diff --git a/diagnostic/environments/keyboard_typing.py b/diagnostic/environments/keyboard_typing.py index b1fc5c37..ca01f38f 100644 --- a/diagnostic/environments/keyboard_typing.py +++ b/diagnostic/environments/keyboard_typing.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "keyboard typing" action.""" -from __future__ import annotations import types from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/list_folder.py b/diagnostic/environments/list_folder.py index de48c633..1733c9c3 100644 --- a/diagnostic/environments/list_folder.py +++ b/diagnostic/environments/list_folder.py @@ -1,5 +1,4 @@ """Environment and validation for the "list folder" action.""" -from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/mouse_drag.py b/diagnostic/environments/mouse_drag.py index d640d9f3..812c8a8e 100644 --- a/diagnostic/environments/mouse_drag.py +++ b/diagnostic/environments/mouse_drag.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "mouse drag" action.""" -from __future__ import annotations from diagnostic.framework 
import ActionTestCase diff --git a/diagnostic/environments/mouse_move.py b/diagnostic/environments/mouse_move.py index 7198cd2e..5cf54096 100644 --- a/diagnostic/environments/mouse_move.py +++ b/diagnostic/environments/mouse_move.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "mouse move" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/open_application.py b/diagnostic/environments/open_application.py index fd9efad2..c85ecd2b 100644 --- a/diagnostic/environments/open_application.py +++ b/diagnostic/environments/open_application.py @@ -1,6 +1,5 @@ """Diagnostic environment for the "open application" action.""" -from __future__ import annotations import types from pathlib import Path diff --git a/diagnostic/environments/read_pdf_file.py b/diagnostic/environments/read_pdf_file.py index 0d75c18e..c954aa01 100644 --- a/diagnostic/environments/read_pdf_file.py +++ b/diagnostic/environments/read_pdf_file.py @@ -1,5 +1,4 @@ """Environment and validation for the "read pdf file" action.""" -from __future__ import annotations import textwrap import types diff --git a/diagnostic/environments/scroll.py b/diagnostic/environments/scroll.py index 6567de82..7fe6fa40 100644 --- a/diagnostic/environments/scroll.py +++ b/diagnostic/environments/scroll.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "scroll" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/send_http_requests.py b/diagnostic/environments/send_http_requests.py index 155814ba..84b03573 100644 --- a/diagnostic/environments/send_http_requests.py +++ b/diagnostic/environments/send_http_requests.py @@ -1,6 +1,5 @@ """Diagnostic environment for the "send HTTP requests" action.""" -from __future__ import annotations import types from typing import Any, Mapping diff --git a/diagnostic/environments/send_message.py b/diagnostic/environments/send_message.py 
index fea58745..08f61d26 100644 --- a/diagnostic/environments/send_message.py +++ b/diagnostic/environments/send_message.py @@ -1,6 +1,5 @@ """Diagnostic environment for the "send_message" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/shell_exec_windows.py b/diagnostic/environments/shell_exec_windows.py index fcddc987..869b4ed7 100644 --- a/diagnostic/environments/shell_exec_windows.py +++ b/diagnostic/environments/shell_exec_windows.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "shell exec (windows)" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/switch_to_cli_mode.py b/diagnostic/environments/switch_to_cli_mode.py index a732bc04..57102556 100644 --- a/diagnostic/environments/switch_to_cli_mode.py +++ b/diagnostic/environments/switch_to_cli_mode.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "switch to CLI mode" action.""" -from __future__ import annotations import types from pathlib import Path diff --git a/diagnostic/environments/trace_mouse.py b/diagnostic/environments/trace_mouse.py index 902bb72c..caabf182 100644 --- a/diagnostic/environments/trace_mouse.py +++ b/diagnostic/environments/trace_mouse.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "trace mouse" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/view_image.py b/diagnostic/environments/view_image.py index 4d8ffbd7..028dc077 100644 --- a/diagnostic/environments/view_image.py +++ b/diagnostic/environments/view_image.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "view image" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/window_close.py b/diagnostic/environments/window_close.py index ecf824d7..6b003c1d 100644 --- 
a/diagnostic/environments/window_close.py +++ b/diagnostic/environments/window_close.py @@ -1,5 +1,4 @@ """Diagnostic environment for the "window close" action.""" -from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/framework.py b/diagnostic/framework.py index f508a025..ea85ee9d 100644 --- a/diagnostic/framework.py +++ b/diagnostic/framework.py @@ -1,5 +1,4 @@ """Common utilities for diagnostic action harnesses.""" -from __future__ import annotations import dataclasses import io diff --git a/mkdocs/scripts/gen_ref_pages.py b/mkdocs/scripts/gen_ref_pages.py index f3699ed9..0a4788a9 100644 --- a/mkdocs/scripts/gen_ref_pages.py +++ b/mkdocs/scripts/gen_ref_pages.py @@ -1,4 +1,3 @@ -from __future__ import annotations from pathlib import Path import mkdocs_gen_files diff --git a/skills/model-usage/scripts/model_usage.py b/skills/model-usage/scripts/model_usage.py index 0b71f96e..7db15b61 100644 --- a/skills/model-usage/scripts/model_usage.py +++ b/skills/model-usage/scripts/model_usage.py @@ -5,7 +5,6 @@ Defaults to current model (most recent daily entry), or list all models. 
""" -from __future__ import annotations import argparse import json diff --git a/skills/stock-market-pro/scripts/options_links.py b/skills/stock-market-pro/scripts/options_links.py index 5e87ae0e..4839311d 100644 --- a/skills/stock-market-pro/scripts/options_links.py +++ b/skills/stock-market-pro/scripts/options_links.py @@ -8,7 +8,6 @@ - python3 scripts/options_links.py NVDA """ -from __future__ import annotations import argparse From fdf91711c99fcf11d08628190c1e780df1437372 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Tue, 21 Apr 2026 16:42:01 +0530 Subject: [PATCH 21/30] fix: unify mime-type to image/jpeg in generate_multimodal --- agent_core/core/impl/vlm/interface.py | 2 +- agent_core/core/llm/google_gemini_client.py | 16 ++++------------ app/config/mcp_config.json | 16 ++++++++-------- app/config/settings.json | 10 +++++----- app/config/skills_config.json | 16 +++++++++++----- 5 files changed, 29 insertions(+), 31 deletions(-) diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index cd82b7b5..5049c7f3 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -601,7 +601,7 @@ def _gemini_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) result = self._gemini_client.generate_multimodal( self.model, text=usr, - image_bytes=image_bytes, + image_bytes_list=[image_bytes], system_prompt=sys, temperature=self.temperature, json_mode=False, diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index d8ce43d7..1d274fde 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -168,8 +168,7 @@ def generate_multimodal( model: str, *, text: str, - image_bytes: Optional[bytes] = None, - image_bytes_list: Optional[List[bytes]] = None, + image_bytes_list: List[bytes], system_prompt: Optional[str] = None, temperature: Optional[float] = None, json_mode: bool = False, @@ -189,7 
+188,7 @@ def generate_multimodal( Args: model: Model identifier text: The text prompt - image_bytes: Single PNG image data (for backward compatibility) + image_bytes_list: List of image data (PNG/JPEG) system_prompt: Optional system instruction temperature: Sampling temperature @@ -198,16 +197,9 @@ def generate_multimodal( Returns: Dict with generation results and token counts """ - # Normalise: single image wraps into list; list takes priority if both provided - images = image_bytes_list if image_bytes_list is not None else ([image_bytes] if image_bytes else []) - if not images: - raise ValueError("At least one of `image_bytes` or `image_bytes_list` must be provided.") - parts: List[Dict[str, Any]] = [{"text": text}] - for img in images: - # Preserve existing mime-type logic: single-image callers stay PNG index, - # multi-image callers (video frames) use JPEG. - mime = "image/jpeg" if image_bytes_list is not None else "image/png" + for img in image_bytes_list: + mime = "image/jpeg" parts.append({ "inlineData": { "mimeType": mime, diff --git a/app/config/mcp_config.json b/app/config/mcp_config.json index d9040e06..0b80e85c 100644 --- a/app/config/mcp_config.json +++ b/app/config/mcp_config.json @@ -25,7 +25,7 @@ "env": { "GITHUB_PERSONAL_ACCESS_TOKEN": "" }, - "enabled": false + "enabled": true }, { "name": "google-workspace-mcp", @@ -57,7 +57,7 @@ "GOOGLE_CLIENT_ID": "", "GOOGLE_CLIENT_SECRET": "" }, - "enabled": false + "enabled": true }, { "name": "outlook-mcp", @@ -145,7 +145,7 @@ "SLACK_BOT_TOKEN": "", "SLACK_TEAM_ID": "" }, - "enabled": false + "enabled": true }, { "name": "discord-mcp", @@ -481,7 +481,7 @@ "GOOGLE_CLIENT_ID": "", "GOOGLE_CLIENT_SECRET": "" }, - "enabled": false + "enabled": true }, { "name": "apple-notes-mcp", @@ -552,7 +552,7 @@ "env": { "NOTION_API_KEY": "" }, - "enabled": false + "enabled": true }, { "name": "obsidian-mcp", @@ -568,7 +568,7 @@ "OBSIDIAN_VERIFY_SSL": "false", "OBSIDIAN_ENABLE_CACHE": "true" }, - "enabled": false + 
"enabled": true }, { "name": "todoist-mcp", @@ -582,7 +582,7 @@ "env": { "TODOIST_API_TOKEN": "" }, - "enabled": false + "enabled": true }, { "name": "jira-mcp", @@ -2109,7 +2109,7 @@ "env": { "BRAVE_API_KEY": "" }, - "enabled": false + "enabled": true }, { "name": "google-maps", diff --git a/app/config/settings.json b/app/config/settings.json index 4d5efca0..11ae4db5 100644 --- a/app/config/settings.json +++ b/app/config/settings.json @@ -11,10 +11,10 @@ "enabled": true }, "model": { - "llm_provider": "byteplus", - "vlm_provider": "byteplus", - "llm_model": "kimi-k2-250905", - "vlm_model": "seed-1-6-250915", + "llm_provider": "gemini", + "vlm_provider": "gemini", + "llm_model": null, + "vlm_model": null, "slow_mode": true, "slow_mode_tpm_limit": 25000 }, @@ -76,4 +76,4 @@ "google": true, "byteplus": true } -} +} \ No newline at end of file diff --git a/app/config/skills_config.json b/app/config/skills_config.json index a1917de2..ebfc570b 100644 --- a/app/config/skills_config.json +++ b/app/config/skills_config.json @@ -6,7 +6,17 @@ "pdf", "playwright-mcp", "pptx", - "xlsx" + "xlsx", + "research-assistant", + "writing-assistant", + "brave-search", + "task-planner", + "gmail", + "google-drive", + "notion", + "obsidian", + "github", + "google-sheets" ], "disabled_skills": [ "cli-anything", @@ -27,7 +37,6 @@ "bbc-news", "better-notion", "blogwatcher", - "brave-search", "calctl", "calendly", "clickup", @@ -48,15 +57,12 @@ "gifgrep", "github-api", "gkeep", - "gmail", "gmail-manager", "gog", "gogcli", - "google-drive", "google-meet", "google-play", "google-sheet", - "google-sheets", "google-slides", "google-workspace-admin", "goplaces", From 29ee8a5b249c977432a0c41316d65894f21ee0dc Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 00:51:25 +0530 Subject: [PATCH 22/30] chore(config): revert personal dev config changes from settings, mcp_config, skills_config --- app/config/mcp_config.json | 16 ++++++++-------- app/config/settings.json | 10 +++++----- 
app/config/skills_config.json | 16 +++++----------- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/app/config/mcp_config.json b/app/config/mcp_config.json index 0b80e85c..d9040e06 100644 --- a/app/config/mcp_config.json +++ b/app/config/mcp_config.json @@ -25,7 +25,7 @@ "env": { "GITHUB_PERSONAL_ACCESS_TOKEN": "" }, - "enabled": true + "enabled": false }, { "name": "google-workspace-mcp", @@ -57,7 +57,7 @@ "GOOGLE_CLIENT_ID": "", "GOOGLE_CLIENT_SECRET": "" }, - "enabled": true + "enabled": false }, { "name": "outlook-mcp", @@ -145,7 +145,7 @@ "SLACK_BOT_TOKEN": "", "SLACK_TEAM_ID": "" }, - "enabled": true + "enabled": false }, { "name": "discord-mcp", @@ -481,7 +481,7 @@ "GOOGLE_CLIENT_ID": "", "GOOGLE_CLIENT_SECRET": "" }, - "enabled": true + "enabled": false }, { "name": "apple-notes-mcp", @@ -552,7 +552,7 @@ "env": { "NOTION_API_KEY": "" }, - "enabled": true + "enabled": false }, { "name": "obsidian-mcp", @@ -568,7 +568,7 @@ "OBSIDIAN_VERIFY_SSL": "false", "OBSIDIAN_ENABLE_CACHE": "true" }, - "enabled": true + "enabled": false }, { "name": "todoist-mcp", @@ -582,7 +582,7 @@ "env": { "TODOIST_API_TOKEN": "" }, - "enabled": true + "enabled": false }, { "name": "jira-mcp", @@ -2109,7 +2109,7 @@ "env": { "BRAVE_API_KEY": "" }, - "enabled": true + "enabled": false }, { "name": "google-maps", diff --git a/app/config/settings.json b/app/config/settings.json index 11ae4db5..4d5efca0 100644 --- a/app/config/settings.json +++ b/app/config/settings.json @@ -11,10 +11,10 @@ "enabled": true }, "model": { - "llm_provider": "gemini", - "vlm_provider": "gemini", - "llm_model": null, - "vlm_model": null, + "llm_provider": "byteplus", + "vlm_provider": "byteplus", + "llm_model": "kimi-k2-250905", + "vlm_model": "seed-1-6-250915", "slow_mode": true, "slow_mode_tpm_limit": 25000 }, @@ -76,4 +76,4 @@ "google": true, "byteplus": true } -} \ No newline at end of file +} diff --git a/app/config/skills_config.json b/app/config/skills_config.json index ebfc570b..a1917de2 
100644 --- a/app/config/skills_config.json +++ b/app/config/skills_config.json @@ -6,17 +6,7 @@ "pdf", "playwright-mcp", "pptx", - "xlsx", - "research-assistant", - "writing-assistant", - "brave-search", - "task-planner", - "gmail", - "google-drive", - "notion", - "obsidian", - "github", - "google-sheets" + "xlsx" ], "disabled_skills": [ "cli-anything", @@ -37,6 +27,7 @@ "bbc-news", "better-notion", "blogwatcher", + "brave-search", "calctl", "calendly", "clickup", @@ -57,12 +48,15 @@ "gifgrep", "github-api", "gkeep", + "gmail", "gmail-manager", "gog", "gogcli", + "google-drive", "google-meet", "google-play", "google-sheet", + "google-sheets", "google-slides", "google-workspace-admin", "goplaces", From 9552308b41c61cf49e82a70067a21d03bc5694ad Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 01:02:52 +0530 Subject: [PATCH 23/30] chore(actions): remove unused execute alias from perform_ocr and understand_video --- app/data/action/perform_ocr.py | 2 -- app/data/action/understand_video.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py index 85c2a5d6..fc3e3e39 100644 --- a/app/data/action/perform_ocr.py +++ b/app/data/action/perform_ocr.py @@ -78,5 +78,3 @@ def perform_ocr(input_data: dict) -> dict: return {**result, 'message': ''} except Exception as e: return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} - -execute = perform_ocr diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index 12a19804..5b922947 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -130,5 +130,3 @@ def understand_video(input_data: dict) -> dict: return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} except Exception as e: return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': str(e)} - -execute = understand_video 
From 5e0a957d94f3615e42ee4568ed0019ea1e4e3f60 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 01:02:57 +0530 Subject: [PATCH 24/30] refactor(vlm): merge _openai_describe_bytes_plain into _openai_describe_bytes via json_mode param --- agent_core/core/impl/vlm/interface.py | 35 +++------------------------ 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index 5049c7f3..dc86d82b 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -237,10 +237,7 @@ def describe_image_bytes( if self.provider == "deepseek": raise RuntimeError("DeepSeek does not support vision/VLM. Use a different provider for image description.") elif self.provider in ("openai", "minimax", "moonshot", "grok"): - if json_mode: - response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) - else: - response = self._openai_describe_bytes_plain(image_bytes, system_prompt, user_prompt) + response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt, json_mode=json_mode) elif self.provider == "remote": response = self._ollama_describe_bytes(image_bytes, system_prompt, user_prompt) elif self.provider == "gemini": @@ -429,29 +426,6 @@ def _report_usage_async( except Exception as e: logger.warning(f"[VLM] Failed to report usage: {e}") - def _openai_describe_bytes_plain(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: - """OpenAI vision request WITHOUT json_object enforcement — for raw text output (OCR).""" - img_b64 = base64.b64encode(image_bytes).decode() - messages: list[Dict[str, Any]] = [] - if sys: - messages.append({"role": "system", "content": sys}) - messages.append({ - "role": "user", - "content": [ - {"type": "text", "text": usr}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, - ], - }) - response = self.client.chat.completions.create( - model=self.model, - 
messages=messages, - temperature=self.temperature, - max_tokens=4096, # OCR may return large amounts of text - # NOTE: No response_format — OCR returns plain text - ) - content = response.choices[0].message.content.strip() - total_tokens = response.usage.prompt_tokens + response.usage.completion_tokens - return {"tokens_used": total_tokens, "content": content} def _gemini_describe_video_frames( self, frame_bytes_list: list[bytes], sys: str | None, usr: str @@ -496,7 +470,7 @@ def _multi_frame_describe_fallback( ) return synthesis - def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: + def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str, json_mode: bool = True) -> Dict[str, Any]: """OpenAI/Grok vision request with automatic prompt caching metrics.""" img_b64 = base64.b64encode(image_bytes).decode() mime_type = self._detect_mime_type(image_bytes) @@ -514,14 +488,13 @@ def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) ) # Newer OpenAI models (o1, o3, o4, gpt-5, etc.) require # 'max_completion_tokens' instead of the legacy 'max_tokens' parameter. - # Note: response_format=json_object is intentionally NOT set here because - # describe_image returns plain text descriptions, not JSON. Enabling JSON - # mode would also require the prompt to contain the word "json". 
request_kwargs: Dict[str, Any] = { "model": self.model, "messages": messages, "temperature": self.temperature, } + if json_mode: + request_kwargs["response_format"] = {"type": "json_object"} model_lower = (self.model or "").lower() uses_max_completion_tokens = ( model_lower.startswith("o1") From c97536e7722b0063664bcd70eb6328be626e023a Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 01:03:01 +0530 Subject: [PATCH 25/30] fix(decorators): restore correct position of from __future__ import annotations in log_events and profiler --- agent_core/decorators/log_events.py | 3 +-- agent_core/decorators/profiler.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/agent_core/decorators/log_events.py b/agent_core/decorators/log_events.py index 305b96a1..41a84547 100644 --- a/agent_core/decorators/log_events.py +++ b/agent_core/decorators/log_events.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import annotations """ Flexible function-level logging: - logs start @@ -9,7 +8,7 @@ {id}, {name}, {args}, {kwargs}, {result}, {exception}, {duration_ms} """ - +from __future__ import annotations import logging import time diff --git a/agent_core/decorators/profiler.py b/agent_core/decorators/profiler.py index 0c083605..ca35a343 100644 --- a/agent_core/decorators/profiler.py +++ b/agent_core/decorators/profiler.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import annotations """ Profiler Module - Comprehensive performance tracking for the agent. @@ -29,7 +28,7 @@ Set "auto_save_interval" to N to save after every N loops (0 = only at exit). 
""" - +from __future__ import annotations import atexit import asyncio From 932faad785b24f8c44990bef349177cc0f6a4fdc Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 01:04:47 +0530 Subject: [PATCH 26/30] chore: restore from __future__ import annotations across all files (reverted bulk deletion) --- .restore_future_annotations.sh | 143 ++++++++++++++++++ agent_core/core/impl/llm/cache/config.py | 1 + agent_core/core/impl/llm/cache/metrics.py | 1 + agent_core/core/impl/llm/errors.py | 1 + agent_core/core/impl/llm/types.py | 1 + agent_core/core/impl/memory/manager.py | 1 + agent_core/core/llm/google_gemini_client.py | 1 + agent_core/core/protocols/trigger.py | 1 + agent_core/core/task/todo.py | 1 + agent_core/core/trigger.py | 1 + app/browser/interface.py | 1 + app/cli/interface.py | 1 + app/credentials/handlers.py | 1 + app/external_comms/base.py | 1 + app/external_comms/config.py | 1 + app/external_comms/integration_settings.py | 1 + app/external_comms/manager.py | 1 + .../platforms/google_workspace.py | 1 + app/external_comms/platforms/linkedin.py | 1 + app/external_comms/platforms/notion.py | 1 + app/external_comms/platforms/outlook.py | 1 + app/external_comms/platforms/slack.py | 1 + app/external_comms/platforms/telegram_bot.py | 1 + app/external_comms/platforms/telegram_user.py | 1 + app/external_comms/platforms/twitter.py | 1 + .../platforms/whatsapp_bridge/client.py | 1 + .../platforms/whatsapp_business.py | 1 + app/external_comms/platforms/whatsapp_web.py | 1 + app/external_comms/registry.py | 1 + app/ui_layer/commands/base.py | 1 + .../commands/builtin/agent_command.py | 1 + app/ui_layer/commands/builtin/clear.py | 1 + app/ui_layer/commands/builtin/cred.py | 1 + app/ui_layer/commands/builtin/exit.py | 1 + app/ui_layer/commands/builtin/help.py | 1 + app/ui_layer/commands/builtin/integrations.py | 1 + app/ui_layer/commands/builtin/mcp.py | 1 + app/ui_layer/commands/builtin/menu.py | 1 + app/ui_layer/commands/builtin/provider.py | 1 + 
app/ui_layer/commands/builtin/reset.py | 1 + app/ui_layer/commands/builtin/skill.py | 1 + app/ui_layer/commands/builtin/skill_invoke.py | 1 + app/ui_layer/commands/builtin/update.py | 1 + app/ui_layer/commands/executor.py | 1 + app/ui_layer/events/event_bus.py | 1 + app/ui_layer/events/event_types.py | 1 + app/ui_layer/events/transformer.py | 1 + app/ui_layer/onboarding/controller.py | 1 + app/ui_layer/themes/base.py | 1 + app/usage/action_storage.py | 1 + app/usage/chat_storage.py | 1 + app/usage/reporter.py | 1 + app/usage/session_storage.py | 1 + app/usage/storage.py | 1 + app/usage/task_storage.py | 1 + diagnostic/action_diagnose.py | 1 + .../create_and_run_python_script.py | 1 + diagnostic/environments/create_pdf_file.py | 1 + diagnostic/environments/find_file_by_name.py | 1 + .../environments/find_in_file_content.py | 1 + diagnostic/environments/ignore.py | 1 + diagnostic/environments/keyboard_input.py | 1 + diagnostic/environments/keyboard_typing.py | 1 + diagnostic/environments/list_folder.py | 1 + diagnostic/environments/mouse_drag.py | 1 + diagnostic/environments/mouse_move.py | 1 + diagnostic/environments/open_application.py | 1 + diagnostic/environments/read_pdf_file.py | 1 + diagnostic/environments/scroll.py | 1 + diagnostic/environments/send_http_requests.py | 1 + diagnostic/environments/send_message.py | 1 + diagnostic/environments/shell_exec_windows.py | 1 + diagnostic/environments/switch_to_cli_mode.py | 1 + diagnostic/environments/trace_mouse.py | 1 + diagnostic/environments/view_image.py | 1 + diagnostic/environments/window_close.py | 1 + diagnostic/framework.py | 1 + mkdocs/scripts/gen_ref_pages.py | 1 + skills/model-usage/scripts/model_usage.py | 1 + .../stock-market-pro/scripts/options_links.py | 1 + 80 files changed, 222 insertions(+) create mode 100755 .restore_future_annotations.sh diff --git a/.restore_future_annotations.sh b/.restore_future_annotations.sh new file mode 100755 index 00000000..cec97862 --- /dev/null +++ 
b/.restore_future_annotations.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +# Restore `from __future__ import annotations` in files that lost it vs dev. +# For each file: find the line number after the module docstring in the dev version, +# then insert it in the same position in the working tree using Python (cross-platform safe). + +set -euo pipefail + +FILES=( + "agent_core/core/impl/llm/cache/config.py" + "agent_core/core/impl/llm/cache/metrics.py" + "agent_core/core/impl/llm/errors.py" + "agent_core/core/impl/llm/types.py" + "agent_core/core/impl/memory/manager.py" + "agent_core/core/llm/google_gemini_client.py" + "agent_core/core/protocols/trigger.py" + "agent_core/core/task/todo.py" + "agent_core/core/trigger.py" + "app/browser/interface.py" + "app/cli/interface.py" + "app/credentials/handlers.py" + "app/external_comms/base.py" + "app/external_comms/config.py" + "app/external_comms/integration_settings.py" + "app/external_comms/manager.py" + "app/external_comms/platforms/google_workspace.py" + "app/external_comms/platforms/linkedin.py" + "app/external_comms/platforms/notion.py" + "app/external_comms/platforms/outlook.py" + "app/external_comms/platforms/slack.py" + "app/external_comms/platforms/telegram_bot.py" + "app/external_comms/platforms/telegram_user.py" + "app/external_comms/platforms/twitter.py" + "app/external_comms/platforms/whatsapp_bridge/client.py" + "app/external_comms/platforms/whatsapp_business.py" + "app/external_comms/platforms/whatsapp_web.py" + "app/external_comms/registry.py" + "app/ui_layer/commands/base.py" + "app/ui_layer/commands/builtin/agent_command.py" + "app/ui_layer/commands/builtin/clear.py" + "app/ui_layer/commands/builtin/cred.py" + "app/ui_layer/commands/builtin/exit.py" + "app/ui_layer/commands/builtin/help.py" + "app/ui_layer/commands/builtin/integrations.py" + "app/ui_layer/commands/builtin/mcp.py" + "app/ui_layer/commands/builtin/menu.py" + "app/ui_layer/commands/builtin/provider.py" + 
"app/ui_layer/commands/builtin/reset.py" + "app/ui_layer/commands/builtin/skill.py" + "app/ui_layer/commands/builtin/skill_invoke.py" + "app/ui_layer/commands/builtin/update.py" + "app/ui_layer/commands/executor.py" + "app/ui_layer/events/event_bus.py" + "app/ui_layer/events/event_types.py" + "app/ui_layer/events/transformer.py" + "app/ui_layer/onboarding/controller.py" + "app/ui_layer/themes/base.py" + "app/usage/action_storage.py" + "app/usage/chat_storage.py" + "app/usage/reporter.py" + "app/usage/session_storage.py" + "app/usage/storage.py" + "app/usage/task_storage.py" + "diagnostic/action_diagnose.py" + "diagnostic/environments/create_and_run_python_script.py" + "diagnostic/environments/create_pdf_file.py" + "diagnostic/environments/find_file_by_name.py" + "diagnostic/environments/find_in_file_content.py" + "diagnostic/environments/ignore.py" + "diagnostic/environments/keyboard_input.py" + "diagnostic/environments/keyboard_typing.py" + "diagnostic/environments/list_folder.py" + "diagnostic/environments/mouse_drag.py" + "diagnostic/environments/mouse_move.py" + "diagnostic/environments/open_application.py" + "diagnostic/environments/read_pdf_file.py" + "diagnostic/environments/scroll.py" + "diagnostic/environments/send_http_requests.py" + "diagnostic/environments/send_message.py" + "diagnostic/environments/shell_exec_windows.py" + "diagnostic/environments/switch_to_cli_mode.py" + "diagnostic/environments/trace_mouse.py" + "diagnostic/environments/view_image.py" + "diagnostic/environments/window_close.py" + "diagnostic/framework.py" + "mkdocs/scripts/gen_ref_pages.py" + "skills/model-usage/scripts/model_usage.py" + "skills/stock-market-pro/scripts/options_links.py" +) + +ANNOTATION="from __future__ import annotations" +RESTORED=0 +SKIPPED=0 +ERRORS=0 + +for FILE in "${FILES[@]}"; do + if [ ! -f "$FILE" ]; then + echo "SKIP (not found): $FILE" + ((SKIPPED++)) || true + continue + fi + + # Skip if annotation already present (e.g. 
vlm/interface.py which we kept) + if grep -q "^from __future__ import annotations" "$FILE"; then + echo "SKIP (already has it): $FILE" + ((SKIPPED++)) || true + continue + fi + + # Get the line number of from __future__ import annotations in the dev version + DEV_LINE=$(git show "dev:$FILE" 2>/dev/null | grep -n "^from __future__ import annotations" | head -1 | cut -d: -f1) + + if [ -z "$DEV_LINE" ]; then + echo "SKIP (not in dev either): $FILE" + ((SKIPPED++)) || true + continue + fi + + # Insert the annotation at that line number in the current file using Python + python3 - "$FILE" "$DEV_LINE" "$ANNOTATION" <<'PYEOF' +import sys + +filepath = sys.argv[1] +insert_at = int(sys.argv[2]) # 1-indexed line number from dev +annotation = sys.argv[3] + +with open(filepath, "r", encoding="utf-8") as f: + lines = f.readlines() + +# Insert at insert_at - 1 (0-indexed), preserving newlines +insert_idx = insert_at - 1 +lines.insert(insert_idx, annotation + "\n") + +with open(filepath, "w", encoding="utf-8") as f: + f.writelines(lines) + +print(f"OK: inserted '{annotation}' at line {insert_at} in {filepath}") +PYEOF + + ((RESTORED++)) || true +done + +echo "" +echo "Done: $RESTORED restored, $SKIPPED skipped, $ERRORS errors." diff --git a/agent_core/core/impl/llm/cache/config.py b/agent_core/core/impl/llm/cache/config.py index 57efc4f8..aacc411e 100644 --- a/agent_core/core/impl/llm/cache/config.py +++ b/agent_core/core/impl/llm/cache/config.py @@ -5,6 +5,7 @@ Shared cache configuration for all LLM providers. """ +from __future__ import annotations import os diff --git a/agent_core/core/impl/llm/cache/metrics.py b/agent_core/core/impl/llm/cache/metrics.py index d0af1f85..0e1bbc6b 100644 --- a/agent_core/core/impl/llm/cache/metrics.py +++ b/agent_core/core/impl/llm/cache/metrics.py @@ -5,6 +5,7 @@ Cache effectiveness metrics tracking for all LLM providers. 
""" +from __future__ import annotations import logging diff --git a/agent_core/core/impl/llm/errors.py b/agent_core/core/impl/llm/errors.py index f0cd6655..052e2611 100644 --- a/agent_core/core/impl/llm/errors.py +++ b/agent_core/core/impl/llm/errors.py @@ -6,6 +6,7 @@ Uses proper exception types and HTTP status codes - no string pattern matching. """ +from __future__ import annotations from typing import Optional diff --git a/agent_core/core/impl/llm/types.py b/agent_core/core/impl/llm/types.py index a4925ccb..4f51eabe 100644 --- a/agent_core/core/impl/llm/types.py +++ b/agent_core/core/impl/llm/types.py @@ -3,6 +3,7 @@ Shared types and enums for the LLM interface module. """ +from __future__ import annotations from enum import Enum diff --git a/agent_core/core/impl/memory/manager.py b/agent_core/core/impl/memory/manager.py index 745bd2d4..5b491864 100644 --- a/agent_core/core/impl/memory/manager.py +++ b/agent_core/core/impl/memory/manager.py @@ -13,6 +13,7 @@ the full content directly. This keeps retrieval lightweight. """ +from __future__ import annotations import hashlib import re diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index 1d274fde..114734fb 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -7,6 +7,7 @@ SDK emits during import/initialisation (e.g. the ``ALTS creds ignored`` message that was polluting the CLI output). """ +from __future__ import annotations import base64 diff --git a/agent_core/core/protocols/trigger.py b/agent_core/core/protocols/trigger.py index d563b369..e6afc8aa 100644 --- a/agent_core/core/protocols/trigger.py +++ b/agent_core/core/protocols/trigger.py @@ -2,6 +2,7 @@ """ Protocol definition for TriggerQueue. 
""" +from __future__ import annotations from typing import List, Protocol, Optional, runtime_checkable diff --git a/agent_core/core/task/todo.py b/agent_core/core/task/todo.py index 89da9891..d51afa92 100644 --- a/agent_core/core/task/todo.py +++ b/agent_core/core/task/todo.py @@ -6,6 +6,7 @@ todo list mechanism similar to Claude Code's TodoWrite tool. """ +from __future__ import annotations import uuid from dataclasses import dataclass, field from typing import Dict, Any, Optional, Literal diff --git a/agent_core/core/trigger.py b/agent_core/core/trigger.py index 23a0666f..c4970ec8 100644 --- a/agent_core/core/trigger.py +++ b/agent_core/core/trigger.py @@ -4,6 +4,7 @@ Trigger dataclass - the entry point for all agent reactions. """ +from __future__ import annotations from dataclasses import dataclass, field from typing import Dict, Any, Optional diff --git a/app/browser/interface.py b/app/browser/interface.py index 02551fc4..e61d596c 100644 --- a/app/browser/interface.py +++ b/app/browser/interface.py @@ -5,6 +5,7 @@ the centralized UI layer components. """ +from __future__ import annotations from typing import TYPE_CHECKING diff --git a/app/cli/interface.py b/app/cli/interface.py index 2f783a69..8af4ce66 100644 --- a/app/cli/interface.py +++ b/app/cli/interface.py @@ -6,6 +6,7 @@ the centralized UI layer components. """ +from __future__ import annotations from typing import TYPE_CHECKING diff --git a/app/credentials/handlers.py b/app/credentials/handlers.py index 2f488a19..aa683d10 100644 --- a/app/credentials/handlers.py +++ b/app/credentials/handlers.py @@ -1,4 +1,5 @@ """All integration credential handlers + registry.""" +from __future__ import annotations import base64 diff --git a/app/external_comms/base.py b/app/external_comms/base.py index e7006662..cab67f78 100644 --- a/app/external_comms/base.py +++ b/app/external_comms/base.py @@ -5,6 +5,7 @@ Base classes for platform clients. 
""" +from __future__ import annotations import logging from abc import ABC, abstractmethod diff --git a/app/external_comms/config.py b/app/external_comms/config.py index 54d70a63..bec98efc 100644 --- a/app/external_comms/config.py +++ b/app/external_comms/config.py @@ -5,6 +5,7 @@ Configuration for external communication channels (WhatsApp, Telegram). """ +from __future__ import annotations import json import logging diff --git a/app/external_comms/integration_settings.py b/app/external_comms/integration_settings.py index 35eb6d0b..25dfd390 100644 --- a/app/external_comms/integration_settings.py +++ b/app/external_comms/integration_settings.py @@ -1,4 +1,5 @@ """Integration settings management — shared by browser and TUI frontends.""" +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/manager.py b/app/external_comms/manager.py index 4c9bcda3..f54fc5a9 100644 --- a/app/external_comms/manager.py +++ b/app/external_comms/manager.py @@ -6,6 +6,7 @@ Uses the platform registry to discover and start all platforms that support listening. 
""" +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/google_workspace.py b/app/external_comms/platforms/google_workspace.py index fddfd564..bb1e9976 100644 --- a/app/external_comms/platforms/google_workspace.py +++ b/app/external_comms/platforms/google_workspace.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Google Workspace client — Gmail + Calendar + Drive via httpx.""" +from __future__ import annotations import asyncio import base64 diff --git a/app/external_comms/platforms/linkedin.py b/app/external_comms/platforms/linkedin.py index 53d0f5d9..865f2517 100644 --- a/app/external_comms/platforms/linkedin.py +++ b/app/external_comms/platforms/linkedin.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """LinkedIn REST API v2 client — direct HTTP via httpx.""" +from __future__ import annotations import time from dataclasses import dataclass diff --git a/app/external_comms/platforms/notion.py b/app/external_comms/platforms/notion.py index b873d073..d05974d5 100644 --- a/app/external_comms/platforms/notion.py +++ b/app/external_comms/platforms/notion.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Notion API client — direct HTTP via httpx.""" +from __future__ import annotations from dataclasses import dataclass from typing import Any, Dict, List, Optional diff --git a/app/external_comms/platforms/outlook.py b/app/external_comms/platforms/outlook.py index 933ce8c0..139421df 100644 --- a/app/external_comms/platforms/outlook.py +++ b/app/external_comms/platforms/outlook.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Outlook email client — Microsoft Graph API via httpx.""" +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/slack.py b/app/external_comms/platforms/slack.py index 5634a86c..efb848ee 100644 --- a/app/external_comms/platforms/slack.py +++ b/app/external_comms/platforms/slack.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Slack API client — direct HTTP via 
httpx.""" +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/telegram_bot.py b/app/external_comms/platforms/telegram_bot.py index 9928b803..bc895b4b 100644 --- a/app/external_comms/platforms/telegram_bot.py +++ b/app/external_comms/platforms/telegram_bot.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Telegram Bot API client — direct HTTP via httpx.""" +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/telegram_user.py b/app/external_comms/platforms/telegram_user.py index 3aa4fdc8..d80e068b 100644 --- a/app/external_comms/platforms/telegram_user.py +++ b/app/external_comms/platforms/telegram_user.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Telegram MTProto (user account) client — uses Telethon with StringSession.""" +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/platforms/twitter.py b/app/external_comms/platforms/twitter.py index f05cba9b..9fc76143 100644 --- a/app/external_comms/platforms/twitter.py +++ b/app/external_comms/platforms/twitter.py @@ -6,6 +6,7 @@ mention triggers to those containing a specific keyword. """ +from __future__ import annotations import asyncio import hashlib diff --git a/app/external_comms/platforms/whatsapp_bridge/client.py b/app/external_comms/platforms/whatsapp_bridge/client.py index 62c1a525..905d7f40 100644 --- a/app/external_comms/platforms/whatsapp_bridge/client.py +++ b/app/external_comms/platforms/whatsapp_bridge/client.py @@ -6,6 +6,7 @@ sending commands and receiving events via stdin/stdout JSON lines. 
""" +from __future__ import annotations import asyncio import json diff --git a/app/external_comms/platforms/whatsapp_business.py b/app/external_comms/platforms/whatsapp_business.py index 384e7a27..52057bee 100644 --- a/app/external_comms/platforms/whatsapp_business.py +++ b/app/external_comms/platforms/whatsapp_business.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """WhatsApp Business Cloud API client — direct HTTP via httpx.""" +from __future__ import annotations from dataclasses import dataclass from typing import Any, Dict, List, Optional diff --git a/app/external_comms/platforms/whatsapp_web.py b/app/external_comms/platforms/whatsapp_web.py index 75a1b628..3ce612c4 100644 --- a/app/external_comms/platforms/whatsapp_web.py +++ b/app/external_comms/platforms/whatsapp_web.py @@ -9,6 +9,7 @@ ``app.external_comms.platforms.whatsapp_bridge.client``. """ +from __future__ import annotations import asyncio import logging diff --git a/app/external_comms/registry.py b/app/external_comms/registry.py index 433559c9..c55ed745 100644 --- a/app/external_comms/registry.py +++ b/app/external_comms/registry.py @@ -5,6 +5,7 @@ Simple registry of platform clients. 
""" +from __future__ import annotations import logging from typing import Dict, Optional, Type diff --git a/app/ui_layer/commands/base.py b/app/ui_layer/commands/base.py index 6c7aa063..c63c7253 100644 --- a/app/ui_layer/commands/base.py +++ b/app/ui_layer/commands/base.py @@ -1,5 +1,6 @@ """Base command class and result type.""" +from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass, field diff --git a/app/ui_layer/commands/builtin/agent_command.py b/app/ui_layer/commands/builtin/agent_command.py index eaba2939..0d3bf7e0 100644 --- a/app/ui_layer/commands/builtin/agent_command.py +++ b/app/ui_layer/commands/builtin/agent_command.py @@ -1,5 +1,6 @@ """Wrapper for agent-provided commands.""" +from __future__ import annotations from typing import TYPE_CHECKING, List diff --git a/app/ui_layer/commands/builtin/clear.py b/app/ui_layer/commands/builtin/clear.py index 550d736e..bf6d4796 100644 --- a/app/ui_layer/commands/builtin/clear.py +++ b/app/ui_layer/commands/builtin/clear.py @@ -1,5 +1,6 @@ """Clear command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/cred.py b/app/ui_layer/commands/builtin/cred.py index 4a1428b5..d9809136 100644 --- a/app/ui_layer/commands/builtin/cred.py +++ b/app/ui_layer/commands/builtin/cred.py @@ -1,5 +1,6 @@ """Credential management command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/exit.py b/app/ui_layer/commands/builtin/exit.py index 2c47ec2c..5cad002c 100644 --- a/app/ui_layer/commands/builtin/exit.py +++ b/app/ui_layer/commands/builtin/exit.py @@ -1,5 +1,6 @@ """Exit command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/help.py b/app/ui_layer/commands/builtin/help.py index 77c6a68d..99030c72 100644 --- a/app/ui_layer/commands/builtin/help.py +++ 
b/app/ui_layer/commands/builtin/help.py @@ -1,5 +1,6 @@ """Help command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/integrations.py b/app/ui_layer/commands/builtin/integrations.py index c50a2490..b924da95 100644 --- a/app/ui_layer/commands/builtin/integrations.py +++ b/app/ui_layer/commands/builtin/integrations.py @@ -5,6 +5,7 @@ share the same logic and side-effects (e.g. platform-listener startup). """ +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/mcp.py b/app/ui_layer/commands/builtin/mcp.py index cf8de2bd..da8cc203 100644 --- a/app/ui_layer/commands/builtin/mcp.py +++ b/app/ui_layer/commands/builtin/mcp.py @@ -1,5 +1,6 @@ """MCP (Model Context Protocol) command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/menu.py b/app/ui_layer/commands/builtin/menu.py index f6f8f28a..27c5ad5d 100644 --- a/app/ui_layer/commands/builtin/menu.py +++ b/app/ui_layer/commands/builtin/menu.py @@ -1,5 +1,6 @@ """Menu command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/provider.py b/app/ui_layer/commands/builtin/provider.py index 9be63d13..e9c1d9b7 100644 --- a/app/ui_layer/commands/builtin/provider.py +++ b/app/ui_layer/commands/builtin/provider.py @@ -1,5 +1,6 @@ """Provider command implementation.""" +from __future__ import annotations import os from typing import List diff --git a/app/ui_layer/commands/builtin/reset.py b/app/ui_layer/commands/builtin/reset.py index fc042bee..c31d218a 100644 --- a/app/ui_layer/commands/builtin/reset.py +++ b/app/ui_layer/commands/builtin/reset.py @@ -1,5 +1,6 @@ """Reset command implementation.""" +from __future__ import annotations import asyncio from typing import List diff --git a/app/ui_layer/commands/builtin/skill.py b/app/ui_layer/commands/builtin/skill.py 
index 90cfecb2..2e6e2207 100644 --- a/app/ui_layer/commands/builtin/skill.py +++ b/app/ui_layer/commands/builtin/skill.py @@ -1,5 +1,6 @@ """Skill command implementation.""" +from __future__ import annotations from typing import List diff --git a/app/ui_layer/commands/builtin/skill_invoke.py b/app/ui_layer/commands/builtin/skill_invoke.py index b9c7e0e4..0c077e00 100644 --- a/app/ui_layer/commands/builtin/skill_invoke.py +++ b/app/ui_layer/commands/builtin/skill_invoke.py @@ -1,5 +1,6 @@ """Skill invocation command - allows invoking skills as slash commands.""" +from __future__ import annotations from typing import List, TYPE_CHECKING diff --git a/app/ui_layer/commands/builtin/update.py b/app/ui_layer/commands/builtin/update.py index dae409d6..81156848 100644 --- a/app/ui_layer/commands/builtin/update.py +++ b/app/ui_layer/commands/builtin/update.py @@ -1,5 +1,6 @@ """Update command implementation.""" +from __future__ import annotations import asyncio from typing import List diff --git a/app/ui_layer/commands/executor.py b/app/ui_layer/commands/executor.py index e002b5be..475e8ad5 100644 --- a/app/ui_layer/commands/executor.py +++ b/app/ui_layer/commands/executor.py @@ -1,5 +1,6 @@ """Command executor for parsing and executing commands.""" +from __future__ import annotations from typing import TYPE_CHECKING diff --git a/app/ui_layer/events/event_bus.py b/app/ui_layer/events/event_bus.py index a8c4b5f3..54f0ef3a 100644 --- a/app/ui_layer/events/event_bus.py +++ b/app/ui_layer/events/event_bus.py @@ -1,5 +1,6 @@ """Publish/subscribe event bus for UI events.""" +from __future__ import annotations import asyncio from collections import defaultdict diff --git a/app/ui_layer/events/event_types.py b/app/ui_layer/events/event_types.py index 516645a6..7dbd9c25 100644 --- a/app/ui_layer/events/event_types.py +++ b/app/ui_layer/events/event_types.py @@ -1,5 +1,6 @@ """UI Event types and data structures.""" +from __future__ import annotations from dataclasses import dataclass, 
field from datetime import datetime diff --git a/app/ui_layer/events/transformer.py b/app/ui_layer/events/transformer.py index f4b96be4..3bca0d10 100644 --- a/app/ui_layer/events/transformer.py +++ b/app/ui_layer/events/transformer.py @@ -1,5 +1,6 @@ """Transform agent events to UI events.""" +from __future__ import annotations from datetime import datetime from typing import Optional, Any, TYPE_CHECKING diff --git a/app/ui_layer/onboarding/controller.py b/app/ui_layer/onboarding/controller.py index ced24261..04b6631f 100644 --- a/app/ui_layer/onboarding/controller.py +++ b/app/ui_layer/onboarding/controller.py @@ -1,5 +1,6 @@ """Onboarding flow controller.""" +from __future__ import annotations from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type from dataclasses import dataclass, field diff --git a/app/ui_layer/themes/base.py b/app/ui_layer/themes/base.py index c1acd4f0..b7cc7967 100644 --- a/app/ui_layer/themes/base.py +++ b/app/ui_layer/themes/base.py @@ -1,5 +1,6 @@ """Theme base classes and protocols.""" +from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/app/usage/action_storage.py b/app/usage/action_storage.py index 7e839389..1c41c154 100644 --- a/app/usage/action_storage.py +++ b/app/usage/action_storage.py @@ -6,6 +6,7 @@ Provides local persistence for action history across agent restarts. """ +from __future__ import annotations import logging import sqlite3 diff --git a/app/usage/chat_storage.py b/app/usage/chat_storage.py index da5b3981..da85aa3e 100644 --- a/app/usage/chat_storage.py +++ b/app/usage/chat_storage.py @@ -6,6 +6,7 @@ Provides local persistence for chat history across agent restarts. 
""" +from __future__ import annotations import json import logging diff --git a/app/usage/reporter.py b/app/usage/reporter.py index 5254bb8d..3a06c0d5 100644 --- a/app/usage/reporter.py +++ b/app/usage/reporter.py @@ -6,6 +6,7 @@ Adapts the WhiteCollarAgent UsageReporter pattern for local SQLite storage. """ +from __future__ import annotations import asyncio import logging diff --git a/app/usage/session_storage.py b/app/usage/session_storage.py index 9edb36be..9eac006c 100644 --- a/app/usage/session_storage.py +++ b/app/usage/session_storage.py @@ -7,6 +7,7 @@ event context can be restored. """ +from __future__ import annotations import json import logging diff --git a/app/usage/storage.py b/app/usage/storage.py index 14b1ae13..a2be57d7 100644 --- a/app/usage/storage.py +++ b/app/usage/storage.py @@ -6,6 +6,7 @@ Provides local persistence for LLM/VLM token usage tracking. """ +from __future__ import annotations import json import logging diff --git a/app/usage/task_storage.py b/app/usage/task_storage.py index 4b65dd01..a5e59f6a 100644 --- a/app/usage/task_storage.py +++ b/app/usage/task_storage.py @@ -6,6 +6,7 @@ Provides local persistence for task execution history. 
""" +from __future__ import annotations import json import logging diff --git a/diagnostic/action_diagnose.py b/diagnostic/action_diagnose.py index 39368257..010c3371 100644 --- a/diagnostic/action_diagnose.py +++ b/diagnostic/action_diagnose.py @@ -1,4 +1,5 @@ """Diagnostic tool for validating action implementations.""" +from __future__ import annotations import argparse import json diff --git a/diagnostic/environments/create_and_run_python_script.py b/diagnostic/environments/create_and_run_python_script.py index 05b37949..777f74ad 100644 --- a/diagnostic/environments/create_and_run_python_script.py +++ b/diagnostic/environments/create_and_run_python_script.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "create and run python script" action.""" +from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/create_pdf_file.py b/diagnostic/environments/create_pdf_file.py index 1e9f8a63..45987c9d 100644 --- a/diagnostic/environments/create_pdf_file.py +++ b/diagnostic/environments/create_pdf_file.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "create pdf file" action.""" +from __future__ import annotations import types from pathlib import Path diff --git a/diagnostic/environments/find_file_by_name.py b/diagnostic/environments/find_file_by_name.py index 91d82a6d..4739e591 100644 --- a/diagnostic/environments/find_file_by_name.py +++ b/diagnostic/environments/find_file_by_name.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "find file by name" action.""" +from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/find_in_file_content.py b/diagnostic/environments/find_in_file_content.py index 5d9450f9..feb4410a 100644 --- a/diagnostic/environments/find_in_file_content.py +++ b/diagnostic/environments/find_in_file_content.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "find in file content" 
action.""" +from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/ignore.py b/diagnostic/environments/ignore.py index 55901b5c..f89b434f 100644 --- a/diagnostic/environments/ignore.py +++ b/diagnostic/environments/ignore.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "ignore" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/keyboard_input.py b/diagnostic/environments/keyboard_input.py index b2d34492..8ea9422e 100644 --- a/diagnostic/environments/keyboard_input.py +++ b/diagnostic/environments/keyboard_input.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "keyboard input" action.""" +from __future__ import annotations import types from typing import Any, List, Mapping, Tuple diff --git a/diagnostic/environments/keyboard_typing.py b/diagnostic/environments/keyboard_typing.py index ca01f38f..b1fc5c37 100644 --- a/diagnostic/environments/keyboard_typing.py +++ b/diagnostic/environments/keyboard_typing.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "keyboard typing" action.""" +from __future__ import annotations import types from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/list_folder.py b/diagnostic/environments/list_folder.py index 1733c9c3..de48c633 100644 --- a/diagnostic/environments/list_folder.py +++ b/diagnostic/environments/list_folder.py @@ -1,4 +1,5 @@ """Environment and validation for the "list folder" action.""" +from __future__ import annotations from pathlib import Path from typing import Any, Mapping, Tuple diff --git a/diagnostic/environments/mouse_drag.py b/diagnostic/environments/mouse_drag.py index 812c8a8e..d640d9f3 100644 --- a/diagnostic/environments/mouse_drag.py +++ b/diagnostic/environments/mouse_drag.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "mouse drag" action.""" +from __future__ import annotations from diagnostic.framework 
import ActionTestCase diff --git a/diagnostic/environments/mouse_move.py b/diagnostic/environments/mouse_move.py index 5cf54096..7198cd2e 100644 --- a/diagnostic/environments/mouse_move.py +++ b/diagnostic/environments/mouse_move.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "mouse move" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/open_application.py b/diagnostic/environments/open_application.py index c85ecd2b..fd9efad2 100644 --- a/diagnostic/environments/open_application.py +++ b/diagnostic/environments/open_application.py @@ -1,5 +1,6 @@ """Diagnostic environment for the "open application" action.""" +from __future__ import annotations import types from pathlib import Path diff --git a/diagnostic/environments/read_pdf_file.py b/diagnostic/environments/read_pdf_file.py index c954aa01..0d75c18e 100644 --- a/diagnostic/environments/read_pdf_file.py +++ b/diagnostic/environments/read_pdf_file.py @@ -1,4 +1,5 @@ """Environment and validation for the "read pdf file" action.""" +from __future__ import annotations import textwrap import types diff --git a/diagnostic/environments/scroll.py b/diagnostic/environments/scroll.py index 7fe6fa40..6567de82 100644 --- a/diagnostic/environments/scroll.py +++ b/diagnostic/environments/scroll.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "scroll" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/send_http_requests.py b/diagnostic/environments/send_http_requests.py index 84b03573..155814ba 100644 --- a/diagnostic/environments/send_http_requests.py +++ b/diagnostic/environments/send_http_requests.py @@ -1,5 +1,6 @@ """Diagnostic environment for the "send HTTP requests" action.""" +from __future__ import annotations import types from typing import Any, Mapping diff --git a/diagnostic/environments/send_message.py b/diagnostic/environments/send_message.py 
index 08f61d26..fea58745 100644 --- a/diagnostic/environments/send_message.py +++ b/diagnostic/environments/send_message.py @@ -1,5 +1,6 @@ """Diagnostic environment for the "send_message" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/shell_exec_windows.py b/diagnostic/environments/shell_exec_windows.py index 869b4ed7..fcddc987 100644 --- a/diagnostic/environments/shell_exec_windows.py +++ b/diagnostic/environments/shell_exec_windows.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "shell exec (windows)" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/switch_to_cli_mode.py b/diagnostic/environments/switch_to_cli_mode.py index 57102556..a732bc04 100644 --- a/diagnostic/environments/switch_to_cli_mode.py +++ b/diagnostic/environments/switch_to_cli_mode.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "switch to CLI mode" action.""" +from __future__ import annotations import types from pathlib import Path diff --git a/diagnostic/environments/trace_mouse.py b/diagnostic/environments/trace_mouse.py index caabf182..902bb72c 100644 --- a/diagnostic/environments/trace_mouse.py +++ b/diagnostic/environments/trace_mouse.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "trace mouse" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/view_image.py b/diagnostic/environments/view_image.py index 028dc077..4d8ffbd7 100644 --- a/diagnostic/environments/view_image.py +++ b/diagnostic/environments/view_image.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "view image" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/environments/window_close.py b/diagnostic/environments/window_close.py index 6b003c1d..ecf824d7 100644 --- 
a/diagnostic/environments/window_close.py +++ b/diagnostic/environments/window_close.py @@ -1,4 +1,5 @@ """Diagnostic environment for the "window close" action.""" +from __future__ import annotations from diagnostic.framework import ActionTestCase diff --git a/diagnostic/framework.py b/diagnostic/framework.py index ea85ee9d..f508a025 100644 --- a/diagnostic/framework.py +++ b/diagnostic/framework.py @@ -1,4 +1,5 @@ """Common utilities for diagnostic action harnesses.""" +from __future__ import annotations import dataclasses import io diff --git a/mkdocs/scripts/gen_ref_pages.py b/mkdocs/scripts/gen_ref_pages.py index 0a4788a9..f3699ed9 100644 --- a/mkdocs/scripts/gen_ref_pages.py +++ b/mkdocs/scripts/gen_ref_pages.py @@ -1,3 +1,4 @@ +from __future__ import annotations from pathlib import Path import mkdocs_gen_files diff --git a/skills/model-usage/scripts/model_usage.py b/skills/model-usage/scripts/model_usage.py index 7db15b61..0b71f96e 100644 --- a/skills/model-usage/scripts/model_usage.py +++ b/skills/model-usage/scripts/model_usage.py @@ -5,6 +5,7 @@ Defaults to current model (most recent daily entry), or list all models. 
""" +from __future__ import annotations import argparse import json diff --git a/skills/stock-market-pro/scripts/options_links.py b/skills/stock-market-pro/scripts/options_links.py index 4839311d..5e87ae0e 100644 --- a/skills/stock-market-pro/scripts/options_links.py +++ b/skills/stock-market-pro/scripts/options_links.py @@ -8,6 +8,7 @@ - python3 scripts/options_links.py NVDA """ +from __future__ import annotations import argparse From 76e8c29cce3bb14f26dae38a9e1091cc168d2946 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 01:04:59 +0530 Subject: [PATCH 27/30] chore: remove temporary restore script --- .restore_future_annotations.sh | 143 --------------------------------- 1 file changed, 143 deletions(-) delete mode 100755 .restore_future_annotations.sh diff --git a/.restore_future_annotations.sh b/.restore_future_annotations.sh deleted file mode 100755 index cec97862..00000000 --- a/.restore_future_annotations.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env bash -# Restore `from __future__ import annotations` in files that lost it vs dev. -# For each file: find the line number after the module docstring in the dev version, -# then insert it in the same position in the working tree using Python (cross-platform safe). 
- -set -euo pipefail - -FILES=( - "agent_core/core/impl/llm/cache/config.py" - "agent_core/core/impl/llm/cache/metrics.py" - "agent_core/core/impl/llm/errors.py" - "agent_core/core/impl/llm/types.py" - "agent_core/core/impl/memory/manager.py" - "agent_core/core/llm/google_gemini_client.py" - "agent_core/core/protocols/trigger.py" - "agent_core/core/task/todo.py" - "agent_core/core/trigger.py" - "app/browser/interface.py" - "app/cli/interface.py" - "app/credentials/handlers.py" - "app/external_comms/base.py" - "app/external_comms/config.py" - "app/external_comms/integration_settings.py" - "app/external_comms/manager.py" - "app/external_comms/platforms/google_workspace.py" - "app/external_comms/platforms/linkedin.py" - "app/external_comms/platforms/notion.py" - "app/external_comms/platforms/outlook.py" - "app/external_comms/platforms/slack.py" - "app/external_comms/platforms/telegram_bot.py" - "app/external_comms/platforms/telegram_user.py" - "app/external_comms/platforms/twitter.py" - "app/external_comms/platforms/whatsapp_bridge/client.py" - "app/external_comms/platforms/whatsapp_business.py" - "app/external_comms/platforms/whatsapp_web.py" - "app/external_comms/registry.py" - "app/ui_layer/commands/base.py" - "app/ui_layer/commands/builtin/agent_command.py" - "app/ui_layer/commands/builtin/clear.py" - "app/ui_layer/commands/builtin/cred.py" - "app/ui_layer/commands/builtin/exit.py" - "app/ui_layer/commands/builtin/help.py" - "app/ui_layer/commands/builtin/integrations.py" - "app/ui_layer/commands/builtin/mcp.py" - "app/ui_layer/commands/builtin/menu.py" - "app/ui_layer/commands/builtin/provider.py" - "app/ui_layer/commands/builtin/reset.py" - "app/ui_layer/commands/builtin/skill.py" - "app/ui_layer/commands/builtin/skill_invoke.py" - "app/ui_layer/commands/builtin/update.py" - "app/ui_layer/commands/executor.py" - "app/ui_layer/events/event_bus.py" - "app/ui_layer/events/event_types.py" - "app/ui_layer/events/transformer.py" - 
"app/ui_layer/onboarding/controller.py" - "app/ui_layer/themes/base.py" - "app/usage/action_storage.py" - "app/usage/chat_storage.py" - "app/usage/reporter.py" - "app/usage/session_storage.py" - "app/usage/storage.py" - "app/usage/task_storage.py" - "diagnostic/action_diagnose.py" - "diagnostic/environments/create_and_run_python_script.py" - "diagnostic/environments/create_pdf_file.py" - "diagnostic/environments/find_file_by_name.py" - "diagnostic/environments/find_in_file_content.py" - "diagnostic/environments/ignore.py" - "diagnostic/environments/keyboard_input.py" - "diagnostic/environments/keyboard_typing.py" - "diagnostic/environments/list_folder.py" - "diagnostic/environments/mouse_drag.py" - "diagnostic/environments/mouse_move.py" - "diagnostic/environments/open_application.py" - "diagnostic/environments/read_pdf_file.py" - "diagnostic/environments/scroll.py" - "diagnostic/environments/send_http_requests.py" - "diagnostic/environments/send_message.py" - "diagnostic/environments/shell_exec_windows.py" - "diagnostic/environments/switch_to_cli_mode.py" - "diagnostic/environments/trace_mouse.py" - "diagnostic/environments/view_image.py" - "diagnostic/environments/window_close.py" - "diagnostic/framework.py" - "mkdocs/scripts/gen_ref_pages.py" - "skills/model-usage/scripts/model_usage.py" - "skills/stock-market-pro/scripts/options_links.py" -) - -ANNOTATION="from __future__ import annotations" -RESTORED=0 -SKIPPED=0 -ERRORS=0 - -for FILE in "${FILES[@]}"; do - if [ ! -f "$FILE" ]; then - echo "SKIP (not found): $FILE" - ((SKIPPED++)) || true - continue - fi - - # Skip if annotation already present (e.g. 
vlm/interface.py which we kept) - if grep -q "^from __future__ import annotations" "$FILE"; then - echo "SKIP (already has it): $FILE" - ((SKIPPED++)) || true - continue - fi - - # Get the line number of from __future__ import annotations in the dev version - DEV_LINE=$(git show "dev:$FILE" 2>/dev/null | grep -n "^from __future__ import annotations" | head -1 | cut -d: -f1) - - if [ -z "$DEV_LINE" ]; then - echo "SKIP (not in dev either): $FILE" - ((SKIPPED++)) || true - continue - fi - - # Insert the annotation at that line number in the current file using Python - python3 - "$FILE" "$DEV_LINE" "$ANNOTATION" <<'PYEOF' -import sys - -filepath = sys.argv[1] -insert_at = int(sys.argv[2]) # 1-indexed line number from dev -annotation = sys.argv[3] - -with open(filepath, "r", encoding="utf-8") as f: - lines = f.readlines() - -# Insert at insert_at - 1 (0-indexed), preserving newlines -insert_idx = insert_at - 1 -lines.insert(insert_idx, annotation + "\n") - -with open(filepath, "w", encoding="utf-8") as f: - f.writelines(lines) - -print(f"OK: inserted '{annotation}' at line {insert_at} in {filepath}") -PYEOF - - ((RESTORED++)) || true -done - -echo "" -echo "Done: $RESTORED restored, $SKIPPED skipped, $ERRORS errors." 
From b71dee2c0ebca71678a6d551255fb9e199c493e8 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 15:10:52 +0530 Subject: [PATCH 28/30] add comment on understand_video.py explaining dual path execution instead of delegating entirely to InternalActionInterface --- app/data/action/understand_video.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index 5b922947..b38d8094 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -83,6 +83,26 @@ def understand_video(input_data: dict) -> dict: from app.config import get_api_key, get_vlm_model api_key = get_api_key('gemini') +# --- Dual-path execution --- +# This is the only video action that contains its own dispatch logic rather than +# delegating entirely to InternalActionInterface. The reason is architectural: +# +# PATH 1 — Gemini Native (below, runs when api_key is present): +# Uses the Gemini Files API (genai.upload_file) for true native video +# understanding. The full video is uploaded and processed by the model with +# temporal context — no frame sampling needed. The uploaded file is deleted +# from Gemini servers after the call. The full summary is saved to disk. +# This path is preferred: more accurate, handles long videos, no OpenCV dep. +# +# PATH 2 — OpenCV Keyframe Fallback (bottom of function): +# Used when no Gemini API key is configured, or if PATH 1 raises any exception. +# Delegates to InternalActionInterface.understand_video(), which extracts +# evenly-spaced keyframes using OpenCV and sends them to whatever VLM provider +# is currently configured. Results are returned directly without saving to disk. +# +# The Gemini Files API is not accessible through VLMInterface, which is why +# this action cannot follow the standard single-delegation pattern. 
+ if api_key: try: import google.generativeai as genai From ecac8517aee3ae3681e0b95cc692757e3839c378 Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 15:23:42 +0530 Subject: [PATCH 29/30] add video to the action sets and define video in the DEFAULT_ST_DESCRIPTIONS --- app/action/action_set.py | 2 ++ app/data/action/perform_ocr.py | 2 +- app/data/action/understand_video.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/app/action/action_set.py b/app/action/action_set.py index 60adc8e3..a307f1b7 100644 --- a/app/action/action_set.py +++ b/app/action/action_set.py @@ -20,6 +20,8 @@ "file_operations": "File and folder manipulation (read, write, search, edit)", "web_research": "Internet search and browsing (web search, fetch URLs)", "document_processing": "PDF and document handling (read, create, convert)", + "image": "Image viewing, analysis, and OCR (screenshots, photos, diagrams)", + "video": "Video analysis and understanding — describe, summarise, or answer questions about video files (MP4, AVI, MOV)", # [V1.2.2] GUI mode temporarily disabled. Uncomment to re-enable. # "gui_interaction": "Mouse, keyboard, and screen operations", "clipboard": "Clipboard read/write operations", diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py index fc3e3e39..ba83d2fb 100644 --- a/app/data/action/perform_ocr.py +++ b/app/data/action/perform_ocr.py @@ -4,7 +4,7 @@ name="perform_ocr", description="Extracts all text from an image using OCR via a Vision Language Model. Use this when the user wants to read text from a screenshot, scanned document, photo of a receipt, whiteboard, sign, or any image containing text. 
Returns extracted text saved to a file in workspace.", mode="CLI", - action_sets=["document_processing", "image"], + action_sets=["document_processing", "image", "video"], input_schema={ "image_path": { "type": "string", diff --git a/app/data/action/understand_video.py b/app/data/action/understand_video.py index b38d8094..10f5cc71 100644 --- a/app/data/action/understand_video.py +++ b/app/data/action/understand_video.py @@ -4,7 +4,7 @@ name="understand_video", description="Uses the configured VLM model (default: Gemini 1.5 Pro) for native video understanding when a Google API key is configured. Falls back to keyframe extraction via OpenCV if no Google API key is available.", mode="CLI", - action_sets=["document_processing", "image"], + action_sets=["document_processing", "image", "video"], requirement=["google-generativeai"], input_schema={ "video_path": { From 2448bf689ade7695af59778813afb32b8dc0af6e Mon Sep 17 00:00:00 2001 From: AlanAAG Date: Wed, 22 Apr 2026 15:47:53 +0530 Subject: [PATCH 30/30] modify import from google.generativeai to the new supported google.genai --- app/data/action/generate_image.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/data/action/generate_image.py b/app/data/action/generate_image.py index e692db32..f2aa6987 100644 --- a/app/data/action/generate_image.py +++ b/app/data/action/generate_image.py @@ -77,7 +77,7 @@ "description": "Status message or error message." 
} }, - requirement=["google-generativeai", "Pillow"], + requirement=["google-genai", "Pillow"], test_payload={ "prompt": "A cute cartoon cat sitting on a rainbow", "resolution": "1K", @@ -241,7 +241,7 @@ def _ensure_package(pkg_name): subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name, '--quiet']) try: - _ensure_package('google-generativeai') + _ensure_package('google-genai') _ensure_package('Pillow') except Exception as e: return { @@ -253,7 +253,7 @@ def _ensure_package(pkg_name): } try: - import google.generativeai as genai + from google import genai from PIL import Image import io import base64