Skip to content

Commit 247293b

Browse files
authored
feat(vlm): add streaming response handling for OpenAI VLM (volcengine#740)
Add support for handling SSE streaming responses from APIs that force streaming format even when stream=False is requested. Changes: - Add _extract_content_and_usage() for sync responses - Add _extract_content_and_usage_async() for async responses - Add _handle_response() and _handle_response_async() for response handling - Add _finalize_response() to eliminate duplicate post-processing logic - Add _process_streaming_chunks() to reduce code duplication - Add _extract_content_from_chunk() and _extract_usage_from_chunk() helpers - Add response type detection with basic type filtering (str/list/dict/bytes) - Update all completion methods to use new handlers - Remove redundant _update_token_usage_from_response calls - Add warning log for empty responses - Add comprehensive unit tests Refinements: - Add choices attribute check to _iterator detection (avoid false positives) - Add docstring warning that streaming response is consumed - Add comment explaining async version doesn't reuse _process_streaming_chunks - Fix test assertions to use correct get_token_usage_summary() method
1 parent cd87c0a commit 247293b

File tree

2 files changed

+558
-13
lines changed

2 files changed

+558
-13
lines changed

openviking/models/vlm/backends/openai_vlm.py

Lines changed: 177 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,179 @@ def get_async_client(self):
4848
self._async_client = openai.AsyncOpenAI(**client_kwargs)
4949
return self._async_client
5050

51-
def _update_token_usage_from_response(self, response):
52-
if hasattr(response, "usage") and response.usage:
53-
prompt_tokens = response.usage.prompt_tokens
54-
completion_tokens = response.usage.completion_tokens
51+
def _is_streaming_response(self, response):
52+
"""Check if response is a streaming response.
53+
54+
Streaming responses are iterators that yield chunks, while non-streaming
55+
responses have a choices attribute directly.
56+
"""
57+
# Check for async streaming first to avoid false positives
58+
if hasattr(response, "__aiter__"):
59+
return False # Async responses handled separately
60+
# Streaming responses: iterators but not strings/lists/dicts with choices
61+
if hasattr(response, "__iter__") and not hasattr(response, "choices"):
62+
# Exclude basic iterable types that might slip through
63+
if isinstance(response, (str, bytes, list, dict)):
64+
return False
65+
return True
66+
# Some streaming responses might have _iterator attribute
67+
if hasattr(response, "_iterator") and not hasattr(response, "choices"):
68+
return True
69+
return False
70+
71+
def _is_async_streaming_response(self, response):
72+
"""Check if response is an async streaming response."""
73+
if hasattr(response, "__aiter__") and not hasattr(response, "choices"):
74+
# Exclude basic types that should never be treated as streaming
75+
if isinstance(response, (str, bytes, list, dict)):
76+
return False
77+
return True
78+
if hasattr(response, "_iterator") and not hasattr(response, "choices"):
79+
return True
80+
return False
81+
82+
def _extract_content_from_chunk(self, chunk):
83+
"""Extract content string from a single chunk."""
84+
try:
85+
choices = getattr(chunk, "choices", None)
86+
if not choices:
87+
return None
88+
delta = getattr(choices[0], "delta", None)
89+
if not delta:
90+
return None
91+
return getattr(delta, "content", None)
92+
except (AttributeError, IndexError):
93+
return None
94+
95+
def _extract_usage_from_chunk(self, chunk):
96+
"""Extract token usage from a chunk. Returns (prompt_tokens, completion_tokens)."""
97+
usage = getattr(chunk, "usage", None)
98+
if not usage:
99+
return 0, 0
100+
prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0
101+
completion_tokens = getattr(usage, "completion_tokens", 0) or 0
102+
return prompt_tokens, completion_tokens
103+
104+
def _process_streaming_chunks(self, chunks):
105+
"""Process streaming chunks and extract content and token usage.
106+
107+
WARNING: This method consumes the iterator. Do not use the response
108+
object after calling this method as it will be exhausted.
109+
110+
Returns (content, prompt_tokens, completion_tokens).
111+
"""
112+
content_parts = []
113+
prompt_tokens = 0
114+
completion_tokens = 0
115+
116+
for chunk in chunks:
117+
content = self._extract_content_from_chunk(chunk)
118+
if content:
119+
content_parts.append(content)
120+
121+
pt, ct = self._extract_usage_from_chunk(chunk)
122+
if pt > 0:
123+
prompt_tokens = pt
124+
if ct > 0:
125+
completion_tokens = ct
126+
127+
return "".join(content_parts), prompt_tokens, completion_tokens
128+
129+
def _extract_content_and_usage(self, response):
    """Extract content from response, handling both streaming and non-streaming.

    Returns (content, prompt_tokens, completion_tokens, is_streaming).

    The non-streaming path is defensive: some gateways return a response
    object with an empty ``choices`` list or a message without ``content``,
    which previously raised IndexError/AttributeError here. Those malformed
    responses now degrade to empty content (and the empty-response warning
    in _finalize_response fires) instead of crashing.
    """
    logger.debug(f"[OpenAIVLM] Response type: {type(response)}")

    if self._is_streaming_response(response):
        content, prompt_tokens, completion_tokens = self._process_streaming_chunks(response)
        return content, prompt_tokens, completion_tokens, True

    # Non-streaming response — mirror the getattr-based defensive style of
    # the chunk helpers rather than indexing blindly.
    choices = getattr(response, "choices", None)
    if choices:
        message = getattr(choices[0], "message", None)
        content = (getattr(message, "content", None) if message else None) or ""
    else:
        content = ""
    usage = getattr(response, "usage", None)
    prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0
    completion_tokens = getattr(usage, "completion_tokens", 0) or 0
    return content, prompt_tokens, completion_tokens, False
146+
147+
async def _extract_content_and_usage_async(self, response):
    """Extract content from async response, handling both streaming and non-streaming.

    Returns (content, prompt_tokens, completion_tokens, is_streaming).

    The non-streaming path is defensive (same as the sync variant): an
    empty ``choices`` list or a message without ``content`` yields empty
    content instead of raising IndexError/AttributeError.
    """
    logger.debug(f"[OpenAIVLM] Async response type: {type(response)}")

    if self._is_async_streaming_response(response):
        # Note: This logic mirrors _process_streaming_chunks but uses
        # async for to handle async iterators. Python's async for and
        # sync for cannot be unified in a single method.
        content_parts = []
        prompt_tokens = 0
        completion_tokens = 0

        async for chunk in response:
            content = self._extract_content_from_chunk(chunk)
            if content:
                content_parts.append(content)
            # Keep the last positive usage values seen (usage usually
            # arrives once, in a trailer chunk).
            pt, ct = self._extract_usage_from_chunk(chunk)
            if pt > 0:
                prompt_tokens = pt
            if ct > 0:
                completion_tokens = ct

        return "".join(content_parts), prompt_tokens, completion_tokens, True

    # Non-streaming response — defensive extraction, consistent with the
    # sync variant and the chunk helpers.
    choices = getattr(response, "choices", None)
    if choices:
        message = getattr(choices[0], "message", None)
        content = (getattr(message, "content", None) if message else None) or ""
    else:
        content = ""
    usage = getattr(response, "usage", None)
    prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0
    completion_tokens = getattr(usage, "completion_tokens", 0) or 0
    return content, prompt_tokens, completion_tokens, False
181+
182+
def _finalize_response(
183+
self, content, prompt_tokens, completion_tokens, is_streaming, operation_name="completion"
184+
):
185+
"""Finalize response: log warnings and update token usage.
186+
187+
Common post-processing for both sync and async responses.
188+
"""
189+
if not content:
190+
logger.warning(
191+
f"[OpenAIVLM] Empty {operation_name} response received (streaming={is_streaming})"
192+
)
193+
194+
if prompt_tokens > 0 or completion_tokens > 0:
55195
self.update_token_usage(
56196
model_name=self.model or "gpt-4o-mini",
57197
provider=self.provider,
58198
prompt_tokens=prompt_tokens,
59199
completion_tokens=completion_tokens,
60200
)
61-
return
201+
202+
return content
203+
204+
def _handle_response(self, response, operation_name="completion"):
205+
"""Handle response extraction and token usage update."""
206+
content, prompt_tokens, completion_tokens, is_streaming = self._extract_content_and_usage(
207+
response
208+
)
209+
return self._finalize_response(
210+
content, prompt_tokens, completion_tokens, is_streaming, operation_name
211+
)
212+
213+
async def _handle_response_async(self, response, operation_name="completion"):
214+
"""Handle async response extraction and token usage update."""
215+
(
216+
content,
217+
prompt_tokens,
218+
completion_tokens,
219+
is_streaming,
220+
) = await self._extract_content_and_usage_async(response)
221+
return self._finalize_response(
222+
content, prompt_tokens, completion_tokens, is_streaming, operation_name
223+
)
62224

63225
def get_completion(self, prompt: str, thinking: bool = False) -> str:
64226
"""Get text completion"""
@@ -72,8 +234,8 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str:
72234
kwargs["max_tokens"] = self.max_tokens
73235

74236
response = client.chat.completions.create(**kwargs)
75-
self._update_token_usage_from_response(response)
76-
return self._clean_response(response.choices[0].message.content or "")
237+
content = self._handle_response(response, operation_name="text completion")
238+
return self._clean_response(content)
77239

78240
async def get_completion_async(
79241
self, prompt: str, thinking: bool = False, max_retries: int = 0
@@ -92,8 +254,10 @@ async def get_completion_async(
92254
for attempt in range(max_retries + 1):
93255
try:
94256
response = await client.chat.completions.create(**kwargs)
95-
self._update_token_usage_from_response(response)
96-
return self._clean_response(response.choices[0].message.content or "")
257+
content = await self._handle_response_async(
258+
response, operation_name="text completion"
259+
)
260+
return self._clean_response(content)
97261
except Exception as e:
98262
last_error = e
99263
if attempt < max_retries:
@@ -179,8 +343,8 @@ def get_vision_completion(
179343
kwargs["max_tokens"] = self.max_tokens
180344

181345
response = client.chat.completions.create(**kwargs)
182-
self._update_token_usage_from_response(response)
183-
return self._clean_response(response.choices[0].message.content or "")
346+
content = self._handle_response(response, operation_name="vision completion")
347+
return self._clean_response(content)
184348

185349
async def get_vision_completion_async(
186350
self,
@@ -205,5 +369,5 @@ async def get_vision_completion_async(
205369
kwargs["max_tokens"] = self.max_tokens
206370

207371
response = await client.chat.completions.create(**kwargs)
208-
self._update_token_usage_from_response(response)
209-
return self._clean_response(response.choices[0].message.content or "")
372+
content = await self._handle_response_async(response, operation_name="vision completion")
373+
return self._clean_response(content)

0 commit comments

Comments
 (0)