diff --git a/agent_core/core/embedding_interface.py b/agent_core/core/embedding_interface.py index b9894cbd..17acfa99 100644 --- a/agent_core/core/embedding_interface.py +++ b/agent_core/core/embedding_interface.py @@ -12,6 +12,8 @@ - GOOGLE_API_KEY (for provider="gemini") """ +from __future__ import annotations + import os from typing import List, Optional diff --git a/agent_core/core/impl/action/router.py b/agent_core/core/impl/action/router.py index 12f1fef9..210c2458 100644 --- a/agent_core/core/impl/action/router.py +++ b/agent_core/core/impl/action/router.py @@ -6,6 +6,8 @@ based on user queries using LLM reasoning. """ +from __future__ import annotations + import json import ast from typing import Optional, List, Dict, Any, Tuple diff --git a/agent_core/core/impl/llm/cache/config.py b/agent_core/core/impl/llm/cache/config.py index f958738c..aacc411e 100644 --- a/agent_core/core/impl/llm/cache/config.py +++ b/agent_core/core/impl/llm/cache/config.py @@ -7,6 +7,7 @@ from __future__ import annotations + import os from dataclasses import dataclass from typing import Optional diff --git a/agent_core/core/impl/llm/cache/metrics.py b/agent_core/core/impl/llm/cache/metrics.py index 8f390825..0e1bbc6b 100644 --- a/agent_core/core/impl/llm/cache/metrics.py +++ b/agent_core/core/impl/llm/cache/metrics.py @@ -7,6 +7,7 @@ from __future__ import annotations + import logging from dataclasses import dataclass from typing import Dict, Optional diff --git a/agent_core/core/impl/llm/errors.py b/agent_core/core/impl/llm/errors.py index e310f686..052e2611 100644 --- a/agent_core/core/impl/llm/errors.py +++ b/agent_core/core/impl/llm/errors.py @@ -8,6 +8,7 @@ from __future__ import annotations + from typing import Optional # Import provider exception types diff --git a/agent_core/core/impl/llm/interface.py b/agent_core/core/impl/llm/interface.py index 84dec178..ca010f94 100644 --- a/agent_core/core/impl/llm/interface.py +++ b/agent_core/core/impl/llm/interface.py @@ -1184,9 
+1184,10 @@ def _generate_openai( # Always enforce JSON output format request_kwargs["response_format"] = {"type": "json_object"} - # Add prompt_cache_key when call_type is provided for better cache routing - # This helps when alternating between different call types (reasoning, action_selection) - if call_type and system_prompt and len(system_prompt) >= config.min_cache_tokens: + # Add prompt_cache_key for OpenAI/DeepSeek cache routing. + # Grok (xAI) does not support prompt_cache_key — it uses automatic + # prefix caching and ignores this parameter, so skip it for Grok. + if self.provider != "grok" and call_type and system_prompt and len(system_prompt) >= config.min_cache_tokens: prompt_hash = hashlib.sha256(system_prompt.encode()).hexdigest()[:16] cache_key = f"{call_type}_{prompt_hash}" request_kwargs["extra_body"] = {"prompt_cache_key": cache_key} @@ -1197,21 +1198,26 @@ def _generate_openai( token_count_input = response.usage.prompt_tokens token_count_output = response.usage.completion_tokens - # Extract cached tokens from prompt_tokens_details (OpenAI automatic caching) - # Available for prompts ≥1024 tokens - prompt_tokens_details = getattr(response.usage, "prompt_tokens_details", None) - if prompt_tokens_details: - cached_tokens = getattr(prompt_tokens_details, "cached_tokens", 0) or 0 + # Extract cached tokens — field name differs by provider: + # - OpenAI: response.usage.prompt_tokens_details.cached_tokens + # - Grok (xAI): response.usage.prompt_cache_hit_tokens + if self.provider == "grok": + cached_tokens = getattr(response.usage, "prompt_cache_hit_tokens", 0) or 0 + else: + prompt_tokens_details = getattr(response.usage, "prompt_tokens_details", None) + if prompt_tokens_details: + cached_tokens = getattr(prompt_tokens_details, "cached_tokens", 0) or 0 # Record cache metrics + provider_label = self.provider # "openai", "grok", "deepseek", etc. 
metrics = get_cache_metrics() if cached_tokens > 0: - logger.info(f"[CACHE] OpenAI {cache_type} cache hit: {cached_tokens}/{token_count_input} tokens from cache") - metrics.record_hit("openai", cache_type, cached_tokens=cached_tokens, total_tokens=token_count_input) + logger.info(f"[CACHE] {provider_label} {cache_type} cache hit: {cached_tokens}/{token_count_input} tokens from cache") + metrics.record_hit(provider_label, cache_type, cached_tokens=cached_tokens, total_tokens=token_count_input) elif system_prompt and len(system_prompt) >= config.min_cache_tokens: # Caching should have been attempted (prompt long enough) # This is a miss - either first call or cache expired - metrics.record_miss("openai", cache_type, total_tokens=token_count_input) + metrics.record_miss(provider_label, cache_type, total_tokens=token_count_input) status = "success" except Exception as exc: @@ -1262,22 +1268,24 @@ def _generate_ollama(self, system_prompt: str | None, user_prompt: str) -> Dict[ try: payload = { "model": self.model, - "system": system_prompt, "prompt": user_prompt, "stream": False, + "format": "json", "options": { "temperature": self.temperature, } } + if system_prompt: + payload["system"] = system_prompt url: str = f"{self.remote_url.rstrip('/')}/api/generate" response = requests.post(url, json=payload, timeout=600) response.raise_for_status() result = response.json() content = result.get("response", "").strip() - total_tokens = result.get("usage", {}).get("total_tokens", 0) token_count_input = result.get("prompt_eval_count", 0) token_count_output = result.get("eval_count", 0) + total_tokens = token_count_input + token_count_output status = "success" except Exception as exc: exc_obj = exc diff --git a/agent_core/core/impl/llm/types.py b/agent_core/core/impl/llm/types.py index 1b942525..4f51eabe 100644 --- a/agent_core/core/impl/llm/types.py +++ b/agent_core/core/impl/llm/types.py @@ -5,6 +5,7 @@ from __future__ import annotations + from enum import Enum diff --git 
a/agent_core/core/impl/vlm/interface.py b/agent_core/core/impl/vlm/interface.py index 0e1a7e4d..dc86d82b 100644 --- a/agent_core/core/impl/vlm/interface.py +++ b/agent_core/core/impl/vlm/interface.py @@ -217,6 +217,7 @@ def describe_image_bytes( system_prompt: str | None = None, user_prompt: str | None = "Describe this image in detail.", log_response: bool = True, + json_mode: bool = True, ) -> str: """Describe an image from raw bytes using the VLM. @@ -233,8 +234,10 @@ def describe_image_bytes( if log_response: logger.info(f"[LLM SEND] system={system_prompt} | user={user_prompt}") - if self.provider in ("openai", "minimax", "deepseek", "moonshot", "grok"): - response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt) + if self.provider == "deepseek": + raise RuntimeError("DeepSeek does not support vision/VLM. Use a different provider for image description.") + elif self.provider in ("openai", "minimax", "moonshot", "grok"): + response = self._openai_describe_bytes(image_bytes, system_prompt, user_prompt, json_mode=json_mode) elif self.provider == "remote": response = self._ollama_describe_bytes(image_bytes, system_prompt, user_prompt) elif self.provider == "gemini": @@ -286,8 +289,114 @@ async def generate_response_async( log_response, ) + def describe_image_ocr( + self, + image_path: str, + user_prompt: str | None = None, + ) -> str: + """ + Run OCR on an image. Returns raw extracted text, not a description. + Uses a structured extraction system prompt regardless of provider. + """ + if not os.path.isfile(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + + with open(image_path, "rb") as f: + image_bytes = f.read() + + system_prompt = ( + "You are a precise OCR engine. Extract ALL text from this image exactly as it appears. " + "Preserve line breaks, indentation, and formatting. " + "Do NOT add commentary, interpretation, or markdown. " + "Output only the raw extracted text. 
If no text is present, output an empty string." + ) + effective_user = user_prompt or "Extract all text from this image." + + logger.info(f"[LLM SEND] OCR request | path={image_path}") + + cleaned = self.describe_image_bytes( + image_bytes, + system_prompt=system_prompt, + user_prompt=effective_user, + log_response=False, # Logged below + json_mode=False, + ) + + logger.info(f"[LLM RECV OCR] {cleaned[:120]}...") + return cleaned + + def describe_video_frames( + self, + video_path: str, + query: str | None = None, + max_frames: int = 8, + ) -> str: + """ + Analyse video by extracting evenly-spaced keyframes and sending to VLM. + Falls back to graceful error if OpenCV is unavailable. + """ + try: + import cv2 + except ImportError: + raise RuntimeError( + "opencv-python-headless is required for video analysis. " + "Install with: pip install opencv-python-headless" + ) + + if not os.path.isfile(video_path): + raise FileNotFoundError(f"Video file not found: {video_path}") + + cap = cv2.VideoCapture(video_path) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + if total_frames == 0: + cap.release() + raise ValueError("Video has 0 frames or could not be read.") + + indices = [int(i * total_frames / max_frames) for i in range(max_frames)] + frame_bytes_list: list[bytes] = [] + + for idx in indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if ret: + success, buf = cv2.imencode(".jpg", frame) + if success: + frame_bytes_list.append(buf.tobytes()) + cap.release() + + if not frame_bytes_list: + raise ValueError("Could not extract any frames from the video.") + + system_prompt = ( + f"You are analysing a video represented by {len(frame_bytes_list)} evenly-spaced keyframes. " + "Provide: 1) An overall narrative summary of what is happening, " + "2) Any visible text or titles, " + "3) Key objects, people, or scenes, " + "4) Notable transitions between frames." + ) + effective_user = query or "Summarise the content of this video." 
+ + # For multi-frame, send frames sequentially (all providers support single-image per call) + # Gemini 1.5 Pro supports native multi-image; others receive concatenated descriptions + if self.provider == "gemini" and len(frame_bytes_list) > 1: + return self._gemini_describe_video_frames(frame_bytes_list, system_prompt, effective_user) + else: + # Universal fallback: describe each frame, then synthesise + return self._multi_frame_describe_fallback(frame_bytes_list, system_prompt, effective_user) + # ───────────────────── Provider Helpers ───────────────────── + @staticmethod + def _detect_mime_type(image_bytes: bytes) -> str: + """Detect image MIME type from the first few bytes of image data.""" + if image_bytes[:8] == b'\x89PNG\r\n\x1a\n': + return "image/png" + if image_bytes[:4] == b'GIF8': + return "image/gif" + if image_bytes[:4] == b'RIFF' and image_bytes[8:12] == b'WEBP': + return "image/webp" + return "image/jpeg" + def _report_usage_async( self, service_type: str, @@ -317,9 +426,54 @@ def _report_usage_async( except Exception as e: logger.warning(f"[VLM] Failed to report usage: {e}") - def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: - """OpenAI vision request with automatic prompt caching metrics.""" + + def _gemini_describe_video_frames( + self, frame_bytes_list: list[bytes], sys: str | None, usr: str + ) -> str: + """Gemini-specific multi-image frame analysis in a single API call.""" + result = self._gemini_client.generate_multimodal( + self.model, + text=usr, + image_bytes_list=frame_bytes_list, + system_prompt=sys, + temperature=self.temperature, + json_mode=False, + ) + tokens_used = result.get("tokens_used", 0) + if tokens_used: + self._set_token_count(self._get_token_count() + tokens_used) + return re.sub(self._CODE_BLOCK_RE, "", result.get("content", "").strip()) + + def _multi_frame_describe_fallback( + self, frame_bytes_list: list[bytes], system_prompt: str, user_prompt: str + ) -> str: + 
"""Describe each frame individually, then synthesise into a narrative.""" + frame_descriptions = [] + for i, fb in enumerate(frame_bytes_list): + desc = self.describe_image_bytes( + fb, + system_prompt=f"Frame {i+1} of {len(frame_bytes_list)}: Describe what you see.", + user_prompt=user_prompt, + log_response=False, + ) + frame_descriptions.append(f"[Frame {i+1}]: {desc}") + + synthesis_prompt = ( + "You received descriptions of video keyframes. Write a coherent video summary:\n\n" + + "\n".join(frame_descriptions) + ) + synthesis = self.describe_image_bytes( + frame_bytes_list[-1], # anchor with last frame for context + system_prompt=system_prompt, + user_prompt=synthesis_prompt, + log_response=True, + ) + return synthesis + + def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str, json_mode: bool = True) -> Dict[str, Any]: + """OpenAI/Grok vision request with automatic prompt caching metrics.""" img_b64 = base64.b64encode(image_bytes).decode() + mime_type = self._detect_mime_type(image_bytes) messages: list[Dict[str, Any]] = [] if sys: messages.append({"role": "system", "content": sys}) @@ -328,20 +482,19 @@ def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) "role": "user", "content": [ {"type": "text", "text": usr}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_b64}"}}, ], } ) # Newer OpenAI models (o1, o3, o4, gpt-5, etc.) require # 'max_completion_tokens' instead of the legacy 'max_tokens' parameter. - # Note: response_format=json_object is intentionally NOT set here because - # describe_image returns plain text descriptions, not JSON. Enabling JSON - # mode would also require the prompt to contain the word "json". 
request_kwargs: Dict[str, Any] = { "model": self.model, "messages": messages, "temperature": self.temperature, } + if json_mode: + request_kwargs["response_format"] = {"type": "json_object"} model_lower = (self.model or "").lower() uses_max_completion_tokens = ( model_lower.startswith("o1") @@ -375,9 +528,9 @@ def _openai_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) elif sys and len(sys) >= config.min_cache_tokens: metrics.record_miss("openai", "automatic_vlm", total_tokens=token_count_input) - # Report usage via hook + # Report usage via hook (use actual provider name, e.g. "grok", "minimax") self._report_usage_async( - "vlm_openai", "openai", self.model, + f"vlm_{self.provider}", self.provider, self.model, token_count_input, token_count_output, cached_tokens ) @@ -393,16 +546,20 @@ def _ollama_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) payload = { "model": self.model, "prompt": usr, - "system": sys, "images": [img_b64], "stream": False, - "temperature": self.temperature, + "options": {"temperature": self.temperature}, } + if sys: + payload["system"] = sys url: str = f"{self.remote_url.rstrip('/')}/api/generate" r = requests.post(url, json=payload, timeout=600) r.raise_for_status() - content = r.json().get("response", "").strip() - total_tokens = r.json().get("usage", {}).get("total_tokens", 0) + result = r.json() + content = result.get("response", "").strip() + token_count_input = result.get("prompt_eval_count", 0) + token_count_output = result.get("eval_count", 0) + total_tokens = token_count_input + token_count_output return { "tokens_used": total_tokens or 0, @@ -417,10 +574,10 @@ def _gemini_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) result = self._gemini_client.generate_multimodal( self.model, text=usr, - image_bytes=image_bytes, + image_bytes_list=[image_bytes], system_prompt=sys, temperature=self.temperature, - json_mode=True, + json_mode=False, ) # Record cache metrics @@ -447,6 
+604,7 @@ def _gemini_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str) -> Dict[str, Any]: """BytePlus vision request.""" img_b64 = base64.b64encode(image_bytes).decode() + mime_type = self._detect_mime_type(image_bytes) messages: list[Dict[str, Any]] = [] if sys: messages.append({"role": "system", "content": sys}) @@ -456,7 +614,7 @@ def _byteplus_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: str "role": "user", "content": [ {"type": "text", "text": usr}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_b64}"}}, ], } ) @@ -501,14 +659,7 @@ def _anthropic_describe_bytes(self, image_bytes: bytes, sys: str | None, usr: st img_b64 = base64.b64encode(image_bytes).decode() config = get_cache_config() - # Detect media type from image bytes - media_type = "image/jpeg" - if image_bytes[:8] == b'\x89PNG\r\n\x1a\n': - media_type = "image/png" - elif image_bytes[:4] == b'GIF8': - media_type = "image/gif" - elif image_bytes[:4] == b'RIFF' and image_bytes[8:12] == b'WEBP': - media_type = "image/webp" + media_type = self._detect_mime_type(image_bytes) message_content = [ { diff --git a/agent_core/core/llm/google_gemini_client.py b/agent_core/core/llm/google_gemini_client.py index f6d1688b..114734fb 100644 --- a/agent_core/core/llm/google_gemini_client.py +++ b/agent_core/core/llm/google_gemini_client.py @@ -9,6 +9,7 @@ """ from __future__ import annotations + import base64 import logging import os @@ -168,12 +169,15 @@ def generate_multimodal( model: str, *, text: str, - image_bytes: bytes, + image_bytes_list: List[bytes], system_prompt: Optional[str] = None, temperature: Optional[float] = None, json_mode: bool = False, ) -> Dict[str, Any]: - """Generate text from a prompt that also contains an inline image. 
+ """Generate text from a prompt that contains one or more inline images. + + Normalises both single-image and multi-image inputs into a consistent + request format for the Gemini API. Returns a dict containing: - tokens_used: Total tokens consumed @@ -185,7 +189,8 @@ def generate_multimodal( Args: model: Model identifier text: The text prompt - image_bytes: PNG image data + + image_bytes_list: List of image data (PNG/JPEG) system_prompt: Optional system instruction temperature: Sampling temperature json_mode: If True, enforce JSON output format @@ -193,12 +198,16 @@ def generate_multimodal( Returns: Dict with generation results and token counts """ - inline_data = { - "mimeType": "image/png", - "data": base64.b64encode(image_bytes).decode("utf-8"), - } + parts: List[Dict[str, Any]] = [{"text": text}] + for img in image_bytes_list: + mime = "image/jpeg" + parts.append({ + "inlineData": { + "mimeType": mime, + "data": base64.b64encode(img).decode("utf-8"), + } + }) - parts: List[Dict[str, Any]] = [{"text": text}, {"inlineData": inline_data}] contents = [{"role": "user", "parts": parts}] payload: Dict[str, Any] = {"contents": contents} @@ -236,6 +245,8 @@ def generate_multimodal( "cached_tokens": cached_tokens, } + + def embed_text(self, model: str, *, text: str) -> List[float]: """Fetch an embedding vector for the supplied text. 
diff --git a/agent_core/core/registry/action.py b/agent_core/core/registry/action.py index 956c9dba..46478333 100644 --- a/agent_core/core/registry/action.py +++ b/agent_core/core/registry/action.py @@ -19,6 +19,8 @@ result = await executor.execute_action(action, input_data) """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/context.py b/agent_core/core/registry/context.py index fe3aef47..4ba203d5 100644 --- a/agent_core/core/registry/context.py +++ b/agent_core/core/registry/context.py @@ -16,6 +16,8 @@ system_prompt, user_prompt = engine.make_prompt(query="...") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/database.py b/agent_core/core/registry/database.py index ab04e20d..cb5a3827 100644 --- a/agent_core/core/registry/database.py +++ b/agent_core/core/registry/database.py @@ -18,6 +18,8 @@ db.list_actions() """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/event_stream.py b/agent_core/core/registry/event_stream.py index 041ff55d..fec9e3e3 100644 --- a/agent_core/core/registry/event_stream.py +++ b/agent_core/core/registry/event_stream.py @@ -16,6 +16,8 @@ manager.log("INFO", "Something happened") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/llm.py b/agent_core/core/registry/llm.py index 4e82fb67..be8d40ab 100644 --- a/agent_core/core/registry/llm.py +++ b/agent_core/core/registry/llm.py @@ -18,6 +18,8 @@ response = await llm.generate_response_async(prompt) """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import 
ComponentRegistry diff --git a/agent_core/core/registry/memory.py b/agent_core/core/registry/memory.py index c1586d69..cf774336 100644 --- a/agent_core/core/registry/memory.py +++ b/agent_core/core/registry/memory.py @@ -21,6 +21,8 @@ pointers = memory.retrieve("user preferences") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/state.py b/agent_core/core/registry/state.py index 3b869851..45571b50 100644 --- a/agent_core/core/registry/state.py +++ b/agent_core/core/registry/state.py @@ -19,6 +19,8 @@ await manager.start_session() """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/core/registry/task_manager.py b/agent_core/core/registry/task_manager.py index ce87f4e8..da57db77 100644 --- a/agent_core/core/registry/task_manager.py +++ b/agent_core/core/registry/task_manager.py @@ -16,6 +16,8 @@ task_id = manager.create_task("My Task", "Do something") """ +from __future__ import annotations + from typing import TYPE_CHECKING from agent_core.core.registry.base import ComponentRegistry diff --git a/agent_core/decorators/log_events.py b/agent_core/decorators/log_events.py index ab9a7cfe..41a84547 100644 --- a/agent_core/decorators/log_events.py +++ b/agent_core/decorators/log_events.py @@ -8,6 +8,8 @@ {id}, {name}, {args}, {kwargs}, {result}, {exception}, {duration_ms} """ +from __future__ import annotations + import logging import time import uuid diff --git a/agent_core/decorators/profiler.py b/agent_core/decorators/profiler.py index 38e5e77c..ca35a343 100644 --- a/agent_core/decorators/profiler.py +++ b/agent_core/decorators/profiler.py @@ -28,6 +28,8 @@ Set "auto_save_interval" to N to save after every N loops (0 = only at exit). 
""" +from __future__ import annotations + import atexit import asyncio import functools diff --git a/app/action/action_set.py b/app/action/action_set.py index 60adc8e3..a307f1b7 100644 --- a/app/action/action_set.py +++ b/app/action/action_set.py @@ -20,6 +20,8 @@ "file_operations": "File and folder manipulation (read, write, search, edit)", "web_research": "Internet search and browsing (web search, fetch URLs)", "document_processing": "PDF and document handling (read, create, convert)", + "image": "Image viewing, analysis, and OCR (screenshots, photos, diagrams)", + "video": "Video analysis and understanding — describe, summarise, or answer questions about video files (MP4, AVI, MOV)", # [V1.2.2] GUI mode temporarily disabled. Uncomment to re-enable. # "gui_interaction": "Mouse, keyboard, and screen operations", "clipboard": "Clipboard read/write operations", diff --git a/app/agent_base.py b/app/agent_base.py index aa1b85de..49586e25 100644 --- a/app/agent_base.py +++ b/app/agent_base.py @@ -143,7 +143,7 @@ def __init__( llm_api_key: API key for the LLM provider. llm_base_url: Base URL for the LLM provider (optional). llm_model: Model name override (None = use registry default). - vlm_provider: Provider name for VLM (defaults to llm_provider). + vlm_provider: Provider name for VLM (defaults to llm_provider if None). vlm_model: VLM model name override (None = use registry default). deferred_init: If True, allow LLM/VLM initialization to be deferred until API key is configured (useful for first-time setup). 
@@ -162,11 +162,11 @@ def __init__( base_url=llm_base_url, deferred=deferred_init, ) - # VLM uses its own provider/model settings, falling back to LLM values - _vlm_provider = vlm_provider or llm_provider - _vlm_api_key = get_api_key(_vlm_provider) if vlm_provider else llm_api_key - _vlm_base_url = get_base_url(_vlm_provider) if vlm_provider else llm_base_url + _vlm_provider = vlm_provider or llm_provider + _vlm_api_key = get_api_key(_vlm_provider) if vlm_provider else llm_api_key + _vlm_base_url = get_base_url(_vlm_provider) if vlm_provider else llm_base_url + self.vlm = VLMInterface( provider=_vlm_provider, model=vlm_model, diff --git a/app/config/settings.json b/app/config/settings.json index 669d7ebd..4d5efca0 100644 --- a/app/config/settings.json +++ b/app/config/settings.json @@ -1,5 +1,5 @@ { - "version": "1.2.2", + "version": "1.2.3", "general": { "agent_name": "CraftBot", "os_language": "en" @@ -76,4 +76,4 @@ "google": true, "byteplus": true } -} \ No newline at end of file +} diff --git a/app/config/skills_config.json b/app/config/skills_config.json index 09aa5d49..a1917de2 100644 --- a/app/config/skills_config.json +++ b/app/config/skills_config.json @@ -9,6 +9,7 @@ "xlsx" ], "disabled_skills": [ + "cli-anything", "agentmail", "ai-news-collector", "ai-ppt-generator", diff --git a/app/credentials/handlers.py b/app/credentials/handlers.py index e4c2c40e..aa683d10 100644 --- a/app/credentials/handlers.py +++ b/app/credentials/handlers.py @@ -1,6 +1,7 @@ """All integration credential handlers + registry.""" from __future__ import annotations + import base64 import hashlib import logging diff --git a/app/data/action/describe_image.py b/app/data/action/describe_image.py index 67e58e20..6ab2cade 100644 --- a/app/data/action/describe_image.py +++ b/app/data/action/describe_image.py @@ -48,9 +48,36 @@ def view_image(input_data: dict) -> dict: prompt = str(input_data.get('prompt', '')).strip() or "Describe the content of this image in detail." 
if simulated_mode: - # Return mock result for testing return {'status': 'success', 'description': 'A simulated image description showing various objects and colors.', 'message': ''} + # ── VLM availability guard ────────────────────────────────────────── + import app.internal_action_interface as iai + from agent_core.core.models.model_registry import MODEL_REGISTRY + from agent_core.core.models.types import InterfaceType + from app.config import get_vlm_provider + + vlm = iai.InternalActionInterface.vlm_interface + current_provider = get_vlm_provider() + registry_vlm = MODEL_REGISTRY.get(current_provider, {}).get(InterfaceType.VLM) + + if vlm is None or not registry_vlm: + return { + 'status': 'error', + 'description': '', + 'message': ( + f"The current VLM provider '{current_provider}' does not support vision/image analysis. " + "Please inform the user and suggest switching to a provider that supports VLM.\n\n" + "Providers with VLM support: openai, anthropic, gemini, byteplus.\n\n" + "To switch provider, edit 'app/config/settings.json' and update:\n" + ' "vlm_provider": "" (e.g. "anthropic")\n' + ' "vlm_model": "" (e.g. "claude-sonnet-4-6" for anthropic)\n\n' + "Make sure the corresponding API key is configured under 'api_keys' in the same file. " + "If no API key is set, ask the user to provide one. " + "The system will automatically detect the config change and reload." + ), + } + # ─────────────────────────────────────────────────────────────────── + if not image_path: return {'status': 'error', 'description': '', 'message': 'image_path is required.'} diff --git a/app/data/action/generate_image.py b/app/data/action/generate_image.py index e692db32..f2aa6987 100644 --- a/app/data/action/generate_image.py +++ b/app/data/action/generate_image.py @@ -77,7 +77,7 @@ "description": "Status message or error message." 
} }, - requirement=["google-generativeai", "Pillow"], + requirement=["google-genai", "Pillow"], test_payload={ "prompt": "A cute cartoon cat sitting on a rainbow", "resolution": "1K", @@ -241,7 +241,7 @@ def _ensure_package(pkg_name): subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name, '--quiet']) try: - _ensure_package('google-generativeai') + _ensure_package('google-genai') _ensure_package('Pillow') except Exception as e: return { @@ -253,7 +253,7 @@ def _ensure_package(pkg_name): } try: - import google.generativeai as genai + from google import genai from PIL import Image import io import base64 diff --git a/app/data/action/perform_ocr.py b/app/data/action/perform_ocr.py new file mode 100644 index 00000000..ba83d2fb --- /dev/null +++ b/app/data/action/perform_ocr.py @@ -0,0 +1,80 @@ +from agent_core import action + +@action( + name="perform_ocr", + description="Extracts all text from an image using OCR via a Vision Language Model. Use this when the user wants to read text from a screenshot, scanned document, photo of a receipt, whiteboard, sign, or any image containing text. Returns extracted text saved to a file in workspace.", + mode="CLI", + action_sets=["document_processing", "image", "video"], + input_schema={ + "image_path": { + "type": "string", + "example": "C:\\Users\\user\\Pictures\\receipt.jpg", + "description": "Absolute path to the image file containing text to extract." + }, + "user_prompt": { + "type": "string", + "example": "Extract all text including prices and product names.", + "description": "Optional: extra instruction to guide the OCR (e.g. focus on specific regions or text types)." + } + }, + output_schema={ + "status": { + "type": "string", + "example": "success", + "description": "'success' if OCR completed, 'error' otherwise." + }, + "summary": { + "type": "string", + "example": "OCR complete: 42 lines, 1250 characters extracted.", + "description": "Brief summary of extraction results." 
from agent_core import action

# Upper bound on how long we wait for Gemini to finish ingesting an upload,
# and how often we poll while waiting.
_GEMINI_PROCESSING_TIMEOUT_S = 300
_GEMINI_POLL_INTERVAL_S = 2


def _summarise_with_gemini(video_path: str, query, api_key: str) -> dict:
    """Summarise a video through the Gemini Files API and save the answer.

    Uploads ``video_path``, waits for the upload to leave the PROCESSING
    state, asks the configured VLM model about it, writes the full answer to
    ``AGENT_WORKSPACE_ROOT`` and returns the action's success dict.

    Raises:
        TimeoutError: the upload was still PROCESSING after the deadline.
        RuntimeError: Gemini reported the upload as FAILED.
        Exception: anything raised by the SDK or by writing the file; the
            caller treats every failure as "use the keyframe fallback".
    """
    import logging
    import os
    import time
    from datetime import datetime, timezone

    import google.generativeai as genai
    from app.config import AGENT_WORKSPACE_ROOT, get_vlm_model

    genai.configure(api_key=api_key)
    video_file = genai.upload_file(path=video_path)
    try:
        deadline = time.monotonic() + _GEMINI_PROCESSING_TIMEOUT_S
        while video_file.state.name == "PROCESSING":
            # Without a deadline a stuck upload would block the agent forever.
            if time.monotonic() >= deadline:
                raise TimeoutError("Timed out waiting for Gemini to process the uploaded video.")
            time.sleep(_GEMINI_POLL_INTERVAL_S)
            video_file = genai.get_file(video_file.name)

        if video_file.state.name == "FAILED":
            raise RuntimeError("Gemini failed to process the uploaded video.")

        vlm_model = get_vlm_model() or "gemini-1.5-pro"
        model = genai.GenerativeModel(vlm_model)
        prompt = query if query else "Understand and describe the contents of this video."
        response = model.generate_content([video_file, prompt])
        full_text = response.text
    finally:
        # Always remove the upload, including when polling or generation
        # raised, so failed runs do not leave files behind on Gemini servers.
        try:
            genai.delete_file(video_file.name)
        except Exception as cleanup_error:
            logging.getLogger(__name__).warning(
                "Could not delete uploaded Gemini file %s: %s", video_file.name, cleanup_error
            )

    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"video_summary_{ts}.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(full_text)

    return {
        'status': 'success',
        'summary': full_text[:500] + ("..." if len(full_text) > 500 else ""),
        'file_path': out_path,
        'file_saved': True,
        'message': ''
    }


@action(
    name="understand_video",
    description="Uses the configured VLM model (default: Gemini 1.5 Pro) for native video understanding when a Google API key is configured. Falls back to keyframe extraction via OpenCV if no Google API key is available.",
    mode="CLI",
    action_sets=["document_processing", "image", "video"],
    requirement=["google-generativeai"],
    input_schema={
        "video_path": {
            "type": "string",
            "example": "C:\\Users\\user\\Videos\\meeting.mp4",
            "description": "Absolute path to the video file (MP4, AVI, MOV supported)."
        },
        "query": {
            "type": "string",
            "example": "What is being presented on the slides?",
            "description": "Optional: specific question to answer about the video."
        },
        "max_frames": {
            "type": "integer",
            "example": 8,
            "description": "Number of evenly-spaced keyframes to sample (default: 8, max recommended: 16)."
        }
    },
    output_schema={
        "status": {
            "type": "string",
            "example": "success",
            "description": "'success' if analysis completed, 'error' otherwise."
        },
        "summary": {
            "type": "string",
            "example": "The video shows a person presenting slides about quarterly sales...",
            "description": "First 500 characters of the video summary. Full summary saved to file."
        },
        "file_path": {
            "type": "string",
            "example": "/workspace/video_summary_20260414_153000.txt",
            "description": "Absolute path to the .txt file containing the full video summary."
        },
        "file_saved": {
            "type": "boolean",
            "example": True,
            "description": "True if the full summary was saved to disk."
        },
        "message": {
            "type": "string",
            "example": "File not found.",
            "description": "Error message if applicable."
        }
    },
    test_payload={
        "video_path": "C:\\Users\\user\\Videos\\sample.mp4",
        "query": "Summarise the video content.",
        "max_frames": 8,
        "simulated_mode": True
    }
)
def understand_video(input_data: dict) -> dict:
    """Summarise a video file, preferring Gemini native video understanding.

    Args:
        input_data: Action payload. Reads ``video_path`` (required),
            ``query`` (optional question), ``max_frames`` (keyframes for the
            fallback path, default 8) and ``simulated_mode`` (return a canned
            success without touching the file system or any API).

    Returns:
        A dict with ``status``, ``summary``, ``file_path``, ``file_saved`` and
        ``message``. Failures are reported through ``status == 'error'`` and
        ``message``; this function does not raise for bad input.
    """
    import logging
    import os

    def _error(message: str) -> dict:
        return {'status': 'error', 'summary': '', 'file_path': '', 'file_saved': False, 'message': message}

    if input_data.get('simulated_mode', False):
        return {
            'status': 'success',
            'summary': 'The video shows a simulated presentation with 3 speakers.',
            'file_path': '/workspace/video_summary_simulated.txt',
            'file_saved': True,
            'message': ''
        }

    # `or ''` matters: an explicit null would otherwise become the literal
    # string "None" and be used as a path or sent to the model as the question.
    video_path = str(input_data.get('video_path') or '').strip()
    query = str(input_data.get('query') or '').strip() or None

    raw_max_frames = input_data.get('max_frames')
    try:
        max_frames = 8 if raw_max_frames is None else int(raw_max_frames)
    except (TypeError, ValueError):
        return _error('max_frames must be an integer.')
    if max_frames < 1:
        return _error('max_frames must be a positive integer.')

    if not video_path:
        return _error('video_path is required.')

    if not os.path.isfile(video_path):
        return _error('File not found.')

    from app.config import get_api_key
    api_key = get_api_key('gemini')

    # --- Dual-path execution ---
    # PATH 1 - Gemini native (only when a Gemini API key is configured):
    #   the whole video is uploaded through the Gemini Files API, which is not
    #   reachable through VLMInterface; that is why this action carries its own
    #   dispatch logic instead of delegating entirely.
    # PATH 2 - keyframe fallback (no key, or PATH 1 raised):
    #   delegates to InternalActionInterface.understand_video(), which samples
    #   keyframes, queries the configured VLM and persists the summary itself.
    gemini_error = None
    if api_key:
        try:
            return _summarise_with_gemini(video_path, query, api_key)
        except Exception as exc:
            # Best effort by design: any native-path failure (missing SDK,
            # quota, blocked response, timeout) drops to the fallback, but the
            # reason is logged instead of being silently discarded.
            gemini_error = exc
            logging.getLogger(__name__).warning(
                "Gemini native video path failed, falling back to keyframes: %s", exc
            )

    try:
        import app.internal_action_interface as iai
        result = iai.InternalActionInterface.understand_video(video_path, query=query, max_frames=max_frames)
        return {**result, 'message': ''}
    except Exception as e:
        # Includes the RuntimeError raised when the interface is not
        # initialised; the original comment also attributes missing OpenCV to it.
        message = str(e)
        if gemini_error is not None:
            message = f"{message} (Gemini native path also failed: {gemini_error})"
        return _error(message)
# The methods below belong to InternalActionInterface; the class header is
# outside this hunk.

@classmethod
def _save_workspace_text(cls, prefix: str, text: str) -> str:
    """Write ``text`` to ``<workspace>/<prefix>_<UTC timestamp>.txt`` and return the path."""
    import os
    from datetime import datetime, timezone

    # Imported here (the action module imports it from the same place) so the
    # method does not rely on a module-level import that this hunk does not show.
    from app.config import AGENT_WORKSPACE_ROOT

    # Same format as before; utcnow() is deprecated since Python 3.12.
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(AGENT_WORKSPACE_ROOT, f"{prefix}_{ts}.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)
    return out_path

@classmethod
def perform_ocr(cls, image_path: str, user_prompt: Optional[str] = None) -> dict:
    """
    Run OCR on an image and persist the extracted text to workspace.

    Returns a dict with ``status``, a one-line ``summary`` (line and character
    counts), the full ``text``, ``file_path`` and ``file_saved``.

    Raises:
        RuntimeError: if no VLMInterface has been attached to the class.
    """
    if cls.vlm_interface is None:
        raise RuntimeError("InternalActionInterface not initialized with VLMInterface.")

    raw_text = cls.vlm_interface.describe_image_ocr(image_path, user_prompt=user_prompt)

    # Persist to workspace to prevent token ballooning in the agent context
    out_path = cls._save_workspace_text("ocr_result", raw_text)

    # splitlines() reports 0 lines for empty text and does not count a
    # trailing newline as an extra line (count("\n") + 1 did both).
    line_count = len(raw_text.splitlines())
    char_count = len(raw_text)
    return {
        "status": "success",
        "summary": f"OCR complete: {line_count} lines, {char_count} characters extracted.",
        # NOTE(review): returning the full text works against the
        # "avoid flooding the context" goal stated above; kept because callers
        # outside this view may read it - confirm before removing.
        "text": raw_text,
        "file_path": out_path,
        "file_saved": True,
    }

@classmethod
def understand_video(
    cls,
    video_path: str,
    query: Optional[str] = None,
    max_frames: int = 8,
) -> dict:
    """
    Analyse a video by extracting keyframes and querying the VLM.

    The full summary is written to the workspace; the returned dict carries
    only its first 500 characters plus ``file_path`` and ``file_saved``.

    Raises:
        RuntimeError: if no VLMInterface has been attached to the class.
    """
    if cls.vlm_interface is None:
        raise RuntimeError("InternalActionInterface not initialized with VLMInterface.")

    summary = cls.vlm_interface.describe_video_frames(
        video_path, query=query, max_frames=max_frames
    )

    out_path = cls._save_workspace_text("video_summary", summary)

    return {
        "status": "success",
        "summary": summary[:500] + ("..." if len(summary) > 500 else ""),
        "file_path": out_path,
        "file_saved": True,
    }
def validate(self, value: Any) -> tuple[bool, Optional[str]]:
    """Check a submitted form payload; every field may be omitted.

    Returns ``(True, None)`` when the payload is acceptable, otherwise
    ``(False, reason)``. Checks run in order: payload type, ``user_name``
    length (at most 20 characters once stringified), then ``approval`` type.
    """
    if not isinstance(value, dict):
        return False, "Expected a dictionary of form values"

    problem: Optional[str] = None
    name = value.get("user_name")
    approvals = value.get("approval")

    if name and len(str(name)) > 20:
        problem = "Name must be 20 characters or fewer"
    elif not (approvals is None or isinstance(approvals, list)):
        # Approval settings are optional, but when present must be a list.
        problem = "Approval settings must be a list"

    return problem is None, problem
undefined : 20} autoFocus onKeyDown={e => { if (e.key === 'Enter' && canSubmit) handleSubmit() }} /> diff --git a/app/ui_layer/local_llm_setup.py b/app/ui_layer/local_llm_setup.py index 67437eab..e998c510 100644 --- a/app/ui_layer/local_llm_setup.py +++ b/app/ui_layer/local_llm_setup.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- """Local LLM setup utilities for Ollama.""" +from __future__ import annotations + import asyncio import json import logging diff --git a/install.py b/install.py index bfbfc982..2c346460 100644 --- a/install.py +++ b/install.py @@ -553,19 +553,43 @@ def verify_conda_env(env_name: str) -> bool: def install_nodejs_linux(): """ - Automatically install Node.js on Linux systems (including Kali). - Detects the package manager (apt, pacman, yum) and installs accordingly. + Automatically install Node.js on Linux/macOS systems (including Kali). + Detects the package manager (brew, apt, pacman, yum) and installs accordingly. """ if sys.platform == "win32": return True # Windows users should install Node.js manually from nodejs.org - + # Check if node is already installed if shutil.which("node") and shutil.which("npm"): print("✓ Node.js and npm are already installed") return True - + print("\n🔧 Installing Node.js...") - + + # macOS: try Homebrew first, then nvm + if sys.platform == "darwin": + if shutil.which("brew"): + print(" Found Homebrew, installing Node.js...") + try: + result = run_command(["brew", "install", "node"], check=False, capture=True, quiet=True, show_error=False) + if result and hasattr(result, 'returncode') and result.returncode == 0: + print("✓ Node.js installed via Homebrew") + time.sleep(1) + if shutil.which("node") and shutil.which("npm"): + return True + print("⚠ Node.js installed but not yet in PATH. Restart your terminal.") + return False + except Exception as e: + print(f" ⚠ brew install node failed: {str(e)[:100]}") + print("\n⚠ Could not automatically install Node.js on macOS") + print("\nOptions:") + print(" 1. 
Install Homebrew (https://brew.sh), then run: brew install node") + print(" 2. Download Node.js from: https://nodejs.org/ (LTS version)") + print(" 3. Use nvm: curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash") + print(" then: nvm install --lts") + print("\n After installation, restart your terminal and run: python3 install.py") + return False + # Detect package manager and prepare install commands # Format: (package_manager, update_cmd, install_cmd) package_managers = [ @@ -1148,10 +1172,127 @@ def show_api_setup_instructions(): print("="*50 + "\n") +# ========================================== +# LINUX PYTHON COMPATIBILITY CHECK +# ========================================== +def _check_linux_python() -> None: + """ + Warn Linux users who are running an old or system-managed Python. + + Common problem scenarios: + - Python < 3.9 (Ubuntu 20.04 default is 3.8) + - System Python used directly without a venv, which triggers PEP 668 + "externally-managed-environment" errors on newer distros + """ + ver = sys.version_info + + # Already gated to >= 3.9 above, but warn hard about 3.9 since + # it's the bare minimum — 3.11+ is much more reliable. 
+ if ver < (3, 10): + print("\n" + "=" * 62) + print(f" ⚠ Python {ver.major}.{ver.minor} detected — upgrade recommended") + print("=" * 62) + print(f"\n You are running Python {ver.major}.{ver.minor}.{ver.micro}.") + print(" CraftBot works on 3.9+ but runs best on Python 3.11 or newer.") + print("\n To install Python 3.11 on Ubuntu/Debian/Kali:") + print(" sudo apt update") + print(" sudo apt install -y software-properties-common") + print(" sudo add-apt-repository ppa:deadsnakes/ppa") + print(" sudo apt install -y python3.11 python3.11-venv python3.11-pip") + print(" python3.11 install.py") + print() + print(" Or use pyenv (works on any distro):") + print(" curl https://pyenv.run | bash") + print(" pyenv install 3.11.9") + print(" pyenv local 3.11.9") + print(" python install.py") + print("=" * 62) + choice = input("\n Continue with Python 3.9 anyway? (y/n): ").strip().lower() + if choice != "y": + print("\n Installation cancelled. Please upgrade Python and try again.\n") + sys.exit(1) + print() + + +# ========================================== +# MAC PYTHON COMPATIBILITY CHECK +# ========================================== +def _check_mac_python() -> None: + """ + Warn Mac users who are running a problematic Python interpreter. + + Common bad interpreters on macOS: + - Xcode bundled Python (/Applications/Xcode.app/...) + - macOS system Python (/usr/bin/python3) + + Both are difficult to install packages into and are intended as OS + tooling, not for running user applications. Homebrew or python.org + Python is recommended instead. 
+ """ + exe = sys.executable or "" + is_xcode = "Xcode.app" in exe or "Python3.framework" in exe + is_system = exe.startswith("/usr/bin/python") + + if not (is_xcode or is_system): + return # Running a proper Python — nothing to warn about + + ver = sys.version_info + label = "Xcode's built-in Python" if is_xcode else "macOS system Python" + + print("\n" + "=" * 62) + print(" ⚠ WARNING: Wrong Python interpreter detected") + print("=" * 62) + print(f"\n You are using {label}:") + print(f" {exe}") + print(f"\n This Python ({ver.major}.{ver.minor}.{ver.micro}) is reserved for macOS") + print(" system tools. Installing packages into it can be unreliable") + print(" and may break system components.") + print("\n Recommended fix — install Python via Homebrew:") + print() + print(" # 1. Install Homebrew (if not already installed):") + print(' /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"') + print() + print(" # 2. Install Python 3.11 (or newer):") + print(" brew install python@3.11") + print() + print(" # 3. Re-run the installer with Homebrew Python:") + print(" /opt/homebrew/bin/python3.11 install.py # Apple Silicon") + print(" /usr/local/bin/python3.11 install.py # Intel Mac") + print() + print(" Alternative: download Python from https://www.python.org/downloads/") + print("=" * 62) + + choice = input("\n Continue with the current interpreter anyway? (y/n): ").strip().lower() + if choice != "y": + print("\n Installation cancelled. 
Please use a Homebrew or python.org Python.\n") + sys.exit(1) + print() + + # ========================================== # MAIN # ========================================== if __name__ == "__main__": + # ── Python version gate ──────────────────────────────────────────────── + _ver = sys.version_info + if _ver < (3, 9): + print(f"\n❌ Python {_ver.major}.{_ver.minor} is not supported.") + print(" CraftBot requires Python 3.9 or newer.") + if sys.platform == "darwin": + print("\n Recommended fix:") + print(" 1. Install Homebrew: https://brew.sh") + print(" 2. Run: brew install python@3.11") + print(" 3. Re-run: /opt/homebrew/bin/python3.11 install.py") + else: + print("\n Please install Python 3.9+ from https://www.python.org/downloads/") + sys.exit(1) + + # ── platform-specific interpreter checks ────────────────────────────── + if sys.platform == "darwin": + _check_mac_python() + elif sys.platform == "linux": + _check_linux_python() + args = set(sys.argv[1:]) # Parse flags diff --git a/requirements.txt b/requirements.txt index 6cc79a4e..d9012d07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,4 +45,6 @@ watchdog telethon croniter>=2.0.0 # Cron expression parsing for scheduler playwright # WhatsApp Web browser automation +opencv-python-headless # Video analysis keyframe extraction babel>=2.14.0 # Language list for onboarding + diff --git a/skills/cli-anything/SKILL.md b/skills/cli-anything/SKILL.md new file mode 100644 index 00000000..5dbff223 --- /dev/null +++ b/skills/cli-anything/SKILL.md @@ -0,0 +1,347 @@ +--- +name: cli-anything +description: "Automatically handles image editing, document conversion, audio/video editing, 3D rendering, diagrams, AI image generation, and more — using GIMP, LibreOffice, Blender, Audacity, Inkscape, Krita, Kdenlive, Shotcut, OBS, Draw.io, Mermaid, Ollama, Stable Diffusion, ComfyUI, JupyterLab, FreeCAD, QGIS, Grafana, Gitea, GitLab, NextCloud, Jenkins, AdGuard Home, Zoom, Mubu. 
User does NOT need to mention CLI-Anything — agent auto-selects the right app for the task. Auto-installs if not present." +action-sets: ["shell", "file_operations"] +--- + +# CLI-Anything Skill + +**Core rule: Do everything yourself. Never give the user a command to run. Never explain steps. Just execute the task and report the result.** + +**Activation rule: The user does NOT need to say "CLI-Anything". If their task matches a supported app below, use it automatically — no prompting needed.** + +--- + +## Task Routing — Auto-select the right app (check this before every task) + +| If the user asks about... | Use this app | Command prefix | +|---|---|---| +| Resize / crop / filter / edit an image | **GIMP** | `cli-anything-gimp` | +| Convert image format (JPG→PNG, PNG→WEBP, etc.) | **GIMP** | `cli-anything-gimp` | +| SVG, vector graphics, logos | **Inkscape** | `cli-anything-inkscape` | +| Digital painting, .kra files | **Krita** | `cli-anything-krita` | +| Convert DOCX / XLSX / PPTX → PDF | **LibreOffice** | `cli-anything-libreoffice` | +| Writer / Calc / Impress / spreadsheet macros | **LibreOffice** | `cli-anything-libreoffice` | +| Trim / convert / export audio (MP3, WAV, FLAC) | **Audacity** | `cli-anything-audacity` | +| Render / edit video | **Kdenlive** or **Shotcut** | `cli-anything-kdenlive` | +| Record screen or live stream | **OBS Studio** | `cli-anything-obs` | +| 3D modeling / rendering / .blend files | **Blender** | `cli-anything-blender` | +| Create or export diagrams (.drawio) | **Draw.io** | `cli-anything-draw-io` | +| Render Mermaid diagram code | **Mermaid** | `cli-anything-mermaid` | +| Generate image from text prompt (AI) | **Stable Diffusion** or **ComfyUI** | `cli-anything-stable-diffusion` | +| Run a local LLM | **Ollama** | `cli-anything-ollama` | +| AI content generation | **AnyGen** | `cli-anything-anygen` | +| AI research / summarize PDF | **NotebookLM** | `cli-anything-notebooklm` | +| Execute a Jupyter notebook | **JupyterLab** | 
`cli-anything-jupyterlab` | +| CAD / 3D design, .fcstd files | **FreeCAD** | `cli-anything-freecad` | +| GIS / maps, .qgz files | **QGIS** | `cli-anything-qgis` | +| Monitoring dashboards | **Grafana** | `cli-anything-grafana` | +| Git hosting, create repos | **Gitea** or **GitLab** | `cli-anything-gitea` | +| CI/CD pipelines | **Jenkins** | `cli-anything-jenkins` | +| Cloud file sync | **NextCloud** | `cli-anything-nextcloud` | +| Network-wide ad blocking | **AdGuard Home** | `cli-anything-adguard-home` | +| Video conferencing | **Zoom** | `cli-anything-zoom` | +| Knowledge outlines | **Mubu** | `cli-anything-mubu` | + +--- + +## Smart Fallback — When CLI-Anything fails + +CLI-Anything is the first choice, but if it fails the agent must still complete the task: + +1. **Try CLI-Anything first** — always attempt the harness (`cli-anything-{app}`, where `{app}` is the cli-hub name of the selected app) +2. **If harness fails after 1 retry** — fall back to Python (PIL, python-docx, pydub, moviepy, etc.) and complete the task anyway +3. **Always tell the user** what was actually used and suggest installing the app for better results + +Example: +> "Done — resized using Python PIL as a fallback (GIMP harness failed). Install GIMP for higher quality results next time." + +Never leave the user with no result. Always complete the task one way or another. + +--- + +## FORBIDDEN — Never Do These (causes bugs on all platforms) + +These patterns are strictly banned. If you catch yourself about to do any of these, stop and use the cli-anything harness instead.
+ +| ❌ FORBIDDEN | ✅ CORRECT | +|---|---| +| `soffice.exe --headless --convert-to pdf ...` | `cli-anything-libreoffice convert doc.docx output.pdf` | +| `cd "C:\Program Files\LibreOffice\program" && soffice.exe ...` | `cli-anything-libreoffice convert doc.docx output.pdf` | +| `gimp --batch-interpreter=script-fu-use-v2 ...` | `cli-anything-gimp image resize input.jpg output.jpg 1920 1080` | +| `blender --background scene.blend --render-output ...` | `cli-anything-blender render scene.blend --output frames/ --format PNG` | +| `inkscape --export-type=png logo.svg` | `cli-anything-inkscape export logo.svg logo.png --dpi 300` | +| Chaining with `&&`: `cmd1 && cmd2` | Two separate `run_shell` calls | +| Any `.exe` extension in a command | No `.exe` — harness is cross-platform | +| Hardcoded paths like `C:\Program Files\...` | Use the harness — it finds the app automatically | + +**Why these are banned:** +- `.exe` only exists on Windows — breaks on macOS and Linux +- `C:\Program Files\...` paths break on macOS and Linux +- `&&` chaining breaks in PowerShell on Windows +- Raw app CLIs require knowing app-specific flags — the harness handles all of that + +--- + +## Help Response (no tools needed — just reply with text) + +If the user's message matches any of these (case-insensitive, any wording): +- "cli anything help" / "cli-anything help" / "cli help" +- "what apps does cli-anything support" / "what can cli-anything do" +- "show cli apps" / "cli anything guide" / "list cli apps" +- Any variation asking what CLI-Anything can do or which apps are supported + +**Do not run any tools. Reply directly with this message:** + +--- + +**CLI-Anything — What I Can Do** + +Just describe your task in plain English — you don't need to mention CLI-Anything. I'll pick the right app, install it if needed, and complete the task. Works on Windows, macOS, and Linux. 
+ +**Creative & Media** +| App | What I do | Example | +|---|---|---| +| GIMP _(image editing)_ | Resize, crop, filter, convert, export images | "Resize photo.jpg to 1920×1080" | +| Blender _(3D modeling & rendering)_ | Render scenes, export models, run scripts | "Render scene.blend to PNG frames" | +| Inkscape _(vector graphics)_ | Export SVG to PNG/PDF, convert vectors | "Export logo.svg as 300 DPI PNG" | +| Audacity _(audio production)_ | Trim, convert, export audio | "Trim first 30s from audio.mp3" | +| OBS Studio _(live streaming & recording)_ | Record screen, capture video, stream | "Record my screen for 60 seconds" | +| Kdenlive _(video editing)_ | Render video projects to MP4/MKV | "Render project.kdenlive to MP4" | +| Shotcut _(video editing)_ | Render video projects to MP4 | "Render project.mlt to MP4" | +| Krita _(digital painting)_ | Export paintings, batch convert .kra files | "Export painting.kra as PNG" | + +**Office & Productivity** +| App | What I do | Example | +|---|---|---| +| LibreOffice _(Writer, Calc, Impress)_ | Convert DOCX/XLSX/PPTX to PDF, run macros | "Convert report.docx to PDF" | +| Mubu _(knowledge management & outlining)_ | Manage outlines and knowledge bases | "Open my outline in Mubu" | + +**Communication** +| App | What I do | Example | +|---|---|---| +| Zoom _(video conferencing)_ | Start or join meetings | "Start a Zoom meeting" | + +**Diagramming** +| App | What I do | Example | +|---|---|---| +| Draw.io _(diagrams)_ | Export diagrams to PNG/SVG/PDF | "Export diagram.drawio as PNG" | +| Mermaid Live Editor _(diagrams)_ | Render diagram code to image | "Render: graph TD; A-->B; B-->C" | + +**AI & ML** +| App | What I do | Example | +|---|---|---| +| ComfyUI _(AI image generation)_ | Run AI image workflows | "Run workflow.json, save to output/" | +| AnyGen _(AI content generation)_ | Generate AI content | "Generate content using AnyGen" | +| NotebookLM _(AI research assistant)_ | Research, summarize documents | "Summarize this 
PDF in NotebookLM" | +| Ollama _(local LLM inference)_ | Run local AI models | "Run llama3: summarize this text" | +| Stable Diffusion WebUI | Generate images from text prompts | "Generate 'sunset over mountains'" | + +**Network & Infrastructure** +| App | What I do | Example | +|---|---|---| +| AdGuard Home _(network-wide ad blocking)_ | Set up DNS-level ad blocking | "Set up AdGuard Home ad blocking" | +| JupyterLab | Execute notebooks, save output | "Run notebook.ipynb and save output" | +| Jenkins | Trigger CI/CD pipelines | "Trigger my build pipeline" | +| Gitea | Git hosting, create/manage repos | "Create private repo called myrepo" | +| NextCloud | Cloud file sync | "Sync my folder to NextCloud" | +| GitLab | Projects, CI/CD pipelines | "Create a new GitLab project" | +| Grafana | Export monitoring dashboards | "Export my dashboard as JSON" | +| FreeCAD | Export 3D models to STL/STEP | "Export model.fcstd as STL" | +| QGIS | Export maps to PNG/PDF | "Export map.qgz as PNG" | + +**Tips:** +- Give me the full file path (e.g. `C:\Users\you\Desktop\photo.jpg` or `/home/user/photo.jpg`) +- If the app isn't installed, I install it automatically — no action needed from you +- If the app fails, I fall back to a Python alternative and tell you +- Works on Windows, macOS, and Linux + +--- + +## Supported Apps Reference + +Use this table to look up the correct names for every step. 
+ +| App | cli-hub name | Windows (winget) | macOS (brew cask) | Linux (apt) | +|---|---|---|---|---| +| GIMP | `gimp` | `GIMP.GIMP` | `gimp` | `gimp` | +| Blender | `blender` | `BlenderFoundation.Blender` | `blender` | `blender` | +| Inkscape | `inkscape` | `Inkscape.Inkscape` | `inkscape` | `inkscape` | +| Audacity | `audacity` | `Audacity.Audacity` | `audacity` | `audacity` | +| OBS Studio | `obs` | `OBSProject.OBSStudio` | `obs` | `obs-studio` | +| Kdenlive | `kdenlive` | `KDE.Kdenlive` | `kdenlive` | `kdenlive` | +| Shotcut | `shotcut` | `Meltytech.Shotcut` | `shotcut` | `shotcut` | +| Krita | `krita` | `KDE.Krita` | `krita` | `krita` | +| LibreOffice | `libreoffice` | `TheDocumentFoundation.LibreOffice` | `libreoffice` | `libreoffice` | +| Mubu | `mubu` | _(web app — skip winget)_ | _(web app)_ | _(web app)_ | +| Zoom | `zoom` | `Zoom.Zoom` | `zoom` | `zoom` | +| Draw.io | `draw-io` | `JGraph.Draw` | `drawio` | _(AppImage)_ | +| Mermaid | `mermaid` | `OpenJS.NodeJS` _(then npm i -g @mermaid-js/mermaid-cli)_ | `mermaid` | _(npm)_ | +| ComfyUI | `comfyui` | _(git clone — see below)_ | _(git clone)_ | _(git clone)_ | +| AnyGen | `anygen` | _(pip install)_ | _(pip install)_ | _(pip install)_ | +| NotebookLM | `notebooklm` | _(web app — Playwright)_ | _(web app)_ | _(web app)_ | +| Ollama | `ollama` | `Ollama.Ollama` | `ollama` | _(curl install)_ | +| AdGuard Home | `adguard-home` | `AdGuard.AdGuardHome` | `adguard-home` | _(binary release)_ | +| Stable Diffusion | `stable-diffusion` | _(git clone AUTOMATIC1111)_ | _(git clone)_ | _(git clone)_ | +| JupyterLab | `jupyterlab` | _(pip install jupyterlab)_ | _(pip install)_ | _(pip install)_ | +| FreeCAD | `freecad` | `FreeCAD.FreeCAD` | `freecad` | `freecad` | +| QGIS | `qgis` | `OSGeo.QGIS` | `qgis` | `qgis` | +| Grafana | `grafana` | `GrafanaLabs.Grafana` | `grafana` | `grafana` | +| Gitea | `gitea` | `Gitea.Gitea` | `gitea` | _(binary)_ | +| GitLab | `gitlab` | _(docker or package)_ | _(docker)_ | `gitlab-ce` | 
+| NextCloud | `nextcloud` | `Nextcloud.NextcloudDesktop` | `nextcloud` | _(snap/docker)_ | +| Jenkins | `jenkins` | `Jenkins.Jenkins` | `jenkins` | `jenkins` | + +--- + +## Execution Flow (follow every time — use EXACT timeouts listed) + +**CRITICAL: Always pass the timeout shown below to run_shell. Never use the default (30s). winget/brew installs take minutes — without an explicit timeout they die silently and the agent loops forever.** + +**CRITICAL: Never chain commands with `&&` or `;` in a single run_shell call. Use one separate run_shell call per command.** + +### Step 1 — Detect OS +Run with `timeout: 10`: +``` +python -c "import platform; print(platform.system())" +``` +Result: `Windows`, `Darwin`, or `Linux`. + +### Step 2 — Check if the app is installed +Run with `timeout: 10`: +``` +gimp --version +``` +(replace with the correct app: `blender --version`, `libreoffice --version`, etc.) + +- Exit 0 → already installed → skip to Step 4 +- Exit non-zero → not installed → go to Step 3 + +### Step 3 — Install the app (ONE attempt only — never retry install) + +In the commands below, replace the `{...}` placeholder with the value from the matching column of the Supported Apps Reference table. + +**Windows** — run with `timeout: 600`: +``` +winget install --id {winget-id} --silent --accept-package-agreements --accept-source-agreements +``` + +**macOS** — run with `timeout: 600`: +``` +brew install --cask {cask} +``` + +**Linux** — run with `timeout: 300`: +``` +sudo apt-get install -y {package} +``` + +**Special cases:** +- ComfyUI / Stable Diffusion: `git clone` + `pip install -r requirements.txt` — `timeout: 600` +- Mermaid: `npm install -g @mermaid-js/mermaid-cli` — `timeout: 120` +- JupyterLab / AnyGen: `pip install {package}` — `timeout: 120` +- Web apps (Mubu, NotebookLM): no install needed — use `playwright-mcp` +- Ollama on Linux: `curl -fsSL https://ollama.com/install.sh | sh` — `timeout: 300` + +After install, re-run Step 2 check once (`timeout: 10`). If still fails → tell the user, stop completely.
+ +### Step 4 — Check if CLI harness is installed +Run with `timeout: 10`: +``` +cli-anything-<cli-hub-name> --version +``` +- Found → skip to Step 6 +- Not found → go to Step 5 + +### Step 5 — Install CLI harness (ONE attempt only) + +**Always try CLI-Hub first** — run with `timeout: 120`: +``` +pip install cli-anything-hub --quiet +``` +Then run with `timeout: 120`: +``` +cli-hub install <cli-hub-name> +``` +(Two separate run_shell calls — do NOT chain with &&) + +If CLI-Hub fails → generate a minimal harness with `write_file` (a Click CLI wrapping the app's real scripting API), then run with `timeout: 60`: +``` +pip install -e cli_anything/ --quiet +``` + +If the harness install also fails → tell the user, stop completely. + +### Step 6 — Execute the user's task using the CLI harness ONLY + +**MANDATORY: Use ONLY `cli-anything-<cli-hub-name>` commands. Never call soffice, gimp, blender, or any app binary directly to perform the task (the Step 2 `--version` check is the only exception).** + +Run with `timeout: 300` (or `timeout: 600` for renders/exports): + +``` +# Image editing — GIMP +cli-anything-gimp image resize input.jpg output.jpg 1920 1080 +cli-anything-gimp filter blur input.jpg --radius 3 --output out.jpg +cli-anything-gimp export input.xcf output.png + +# 3D / rendering — Blender +cli-anything-blender render scene.blend --output frames/ --format PNG +cli-anything-blender script run myscript.py scene.blend + +# Vector — Inkscape +cli-anything-inkscape export logo.svg logo.png --dpi 300 +cli-anything-inkscape convert input.svg output.pdf + +# Painting — Krita +cli-anything-krita export painting.kra output.png + +# Audio — Audacity +cli-anything-audacity trim audio.mp3 output.mp3 --start 0 --end 30 +cli-anything-audacity export-mp3 project.aup3 output.mp3 + +# Video — Kdenlive / Shotcut +cli-anything-kdenlive render project.kdenlive output.mp4 +cli-anything-shotcut render project.mlt output.mp4 + +# Office — LibreOffice (NEVER use soffice.exe directly) +cli-anything-libreoffice convert doc.docx output.pdf +cli-anything-libreoffice convert spreadsheet.xlsx output.pdf 
+cli-anything-libreoffice calc run macro.py spreadsheet.xlsx + +# Diagrams +cli-anything-draw-io export diagram.drawio output.png +cli-anything-mermaid render diagram.mmd output.png + +# AI / ML +cli-anything-comfyui run workflow.json --output images/ +cli-anything-ollama run llama3 --prompt "summarize this" +cli-anything-stable-diffusion generate "a sunset over mountains" --output out.png + +# Dev / Infra +cli-anything-jupyterlab execute notebook.ipynb --output result.ipynb +cli-anything-grafana export-dashboard my-dashboard dashboard.json +cli-anything-gitea create-repo myrepo --private + +# GIS / Design +cli-anything-freecad export model.fcstd output.stl +cli-anything-qgis export map.qgz output.png +``` + +**Always run the task. Never print commands and ask the user to run them.** + +If the task command fails → retry once with adjusted args. If it fails again → report the error and stop. + +### Step 7 — Report result +One or two sentences only: +> "Done — rendered `output.mp4` from your Kdenlive project." +> "Converted `report.docx` to PDF at `report.pdf`." + +--- + +## Hard Stop Rules (prevents infinite loops) + +- **Never retry an install** — if `winget install` or `cli-hub install` fails, stop and tell the user. +- **Never loop on a timeout** — if a command times out once, it will time out again. Stop immediately. +- **Max 1 retry on the task command (Step 6) only** — not on installs. +- **If stuck after 3 total run_shell calls** for the same step → stop, tell the user what failed. +- **Never use `&&` or `;` to chain commands** — always use separate run_shell calls. +- **Never use `.exe` extensions** — use the cli-anything harness which is cross-platform. +- **Never hardcode app installation paths** — use the harness, it resolves the path automatically. 
\ No newline at end of file diff --git a/skills/cli-anything/_meta.json b/skills/cli-anything/_meta.json new file mode 100644 index 00000000..af8c9adc --- /dev/null +++ b/skills/cli-anything/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn70pywhg0fyz996kpa8xj89s57yhv26", + "slug": "cli-anything", + "version": "1.0.0", + "publishedAt": 1744574400000 +} diff --git a/skills/docx/scripts/comment.py b/skills/docx/scripts/comment.py index 36e1c935..35600710 100644 --- a/skills/docx/scripts/comment.py +++ b/skills/docx/scripts/comment.py @@ -13,6 +13,8 @@ """ +from __future__ import annotations + import argparse import random import shutil diff --git a/skills/docx/scripts/office/pack.py b/skills/docx/scripts/office/pack.py index db29ed8b..55b53343 100644 --- a/skills/docx/scripts/office/pack.py +++ b/skills/docx/scripts/office/pack.py @@ -10,6 +10,8 @@ python pack.py unpacked/ output.pptx --validate false """ +from __future__ import annotations + import argparse import sys import shutil diff --git a/skills/nano-banana-pro/scripts/generate_image.py b/skills/nano-banana-pro/scripts/generate_image.py index 0ceed2c2..0672c22e 100644 --- a/skills/nano-banana-pro/scripts/generate_image.py +++ b/skills/nano-banana-pro/scripts/generate_image.py @@ -1,3 +1,5 @@ + +from __future__ import annotations #!/usr/bin/env python3 # /// script # requires-python = ">=3.10" diff --git a/skills/ontology/scripts/ontology.py b/skills/ontology/scripts/ontology.py index 040b4354..2c8f8e07 100644 --- a/skills/ontology/scripts/ontology.py +++ b/skills/ontology/scripts/ontology.py @@ -1,3 +1,5 @@ + +from __future__ import annotations #!/usr/bin/env python3 """ Ontology graph operations: create, query, relate, validate. 
diff --git a/skills/pptx/scripts/office/pack.py b/skills/pptx/scripts/office/pack.py index db29ed8b..55b53343 100644 --- a/skills/pptx/scripts/office/pack.py +++ b/skills/pptx/scripts/office/pack.py @@ -10,6 +10,8 @@ python pack.py unpacked/ output.pptx --validate false """ +from __future__ import annotations + import argparse import sys import shutil diff --git a/skills/tesla-api/scripts/tesla.py b/skills/tesla-api/scripts/tesla.py index 3577107b..b5c10fd5 100644 --- a/skills/tesla-api/scripts/tesla.py +++ b/skills/tesla-api/scripts/tesla.py @@ -1,3 +1,5 @@ + +from __future__ import annotations #!/usr/bin/env python3 # /// script # requires-python = ">=3.10" diff --git a/skills/xlsx/scripts/office/pack.py b/skills/xlsx/scripts/office/pack.py index db29ed8b..55b53343 100644 --- a/skills/xlsx/scripts/office/pack.py +++ b/skills/xlsx/scripts/office/pack.py @@ -10,6 +10,8 @@ python pack.py unpacked/ output.pptx --validate false """ +from __future__ import annotations + import argparse import sys import shutil