Skip to content

Commit 24f6025

Browse files
committed
fix: stream thinking blocks inline instead of buffering them
Previously, all reasoning_content (agent_thought_chunk) was buffered and flushed as a single consolidated block before the first text chunk. This caused OpenCode to display thinking at the bottom instead of inline.

Changes:
- Stream reasoning chunks immediately as they arrive, preserving the natural interleaved order (think → text → think → text)
- Use consistent chunk IDs per response (chatcmpl-{requestId}) instead of random UUIDs per chunk, per the OpenAI streaming spec
- Remove the unused createStreamChunk helper and the flush/buffering logic
- Simplify the StreamCallbackResult interface
1 parent 1e49ed1 commit 24f6025

File tree

1 file changed

+48
-108
lines changed

1 file changed

+48
-108
lines changed

src/server.ts

Lines changed: 48 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,29 +1040,6 @@ function createChatResponse(content: string, model: string, promptText?: string)
10401040
// System fingerprint for reproducibility tracking
10411041
const SYSTEM_FINGERPRINT = `auggie-wrapper-${process.env['npm_package_version'] ?? '1.0.0'}`;
10421042
1043-
function createStreamChunk(
1044-
content: string,
1045-
model: string,
1046-
isLast = false,
1047-
extraDelta: Record<string, unknown> = {}
1048-
): string {
1049-
const chunk = {
1050-
id: `chatcmpl-${randomUUID()}`,
1051-
object: 'chat.completion.chunk',
1052-
created: Math.floor(Date.now() / 1000),
1053-
model: model || DEFAULT_MODEL,
1054-
system_fingerprint: SYSTEM_FINGERPRINT,
1055-
choices: [
1056-
{
1057-
index: 0,
1058-
delta: isLast ? {} : { content, ...extraDelta },
1059-
finish_reason: isLast ? 'stop' : null,
1060-
logprobs: null,
1061-
},
1062-
],
1063-
};
1064-
return `data: ${JSON.stringify(chunk)}\n\n`;
1065-
}
10661043
10671044
// Helper to format tool call content for streaming
10681045
function formatToolCallContent(toolContent?: ToolCallContent[]): string {
@@ -1096,7 +1073,6 @@ let toolCallCounter = 0;
10961073
10971074
interface StreamCallbackResult {
10981075
callback: (notification: SessionNotification) => void;
1099-
flush: () => void;
11001076
}
11011077
11021078
function createStreamCallback(res: ServerResponse, model: string, requestId: string): StreamCallbackResult {
@@ -1108,46 +1084,8 @@ function createStreamCallback(res: ServerResponse, model: string, requestId: str
11081084
let chunkCount = 0;
11091085
let lastChunkTime = Date.now();
11101086
1111-
// Buffer for reasoning content - we need to send all reasoning BEFORE any text
1112-
// to ensure proper ordering in OpenCode's UI
1113-
let reasoningBuffer: string[] = [];
1114-
let hasStartedTextContent = false;
1115-
let hasFlushedReasoning = false;
1116-
1117-
// Helper to flush buffered reasoning content
1118-
const flushReasoningBuffer = (): void => {
1119-
if (hasFlushedReasoning || reasoningBuffer.length === 0) return;
1120-
hasFlushedReasoning = true;
1121-
1122-
const combinedReasoning = reasoningBuffer.join('');
1123-
const timestamp = Math.floor(Date.now() / 1000);
1124-
1125-
console.log(`[${requestId}] 💭 Flushing ${String(reasoningBuffer.length)} reasoning chunks (${String(combinedReasoning.length)} chars)`);
1126-
1127-
// Send all reasoning as a single chunk before text starts
1128-
const thoughtChunk = {
1129-
id: `chatcmpl-${requestId}`,
1130-
object: 'chat.completion.chunk',
1131-
created: timestamp,
1132-
model,
1133-
system_fingerprint: SYSTEM_FINGERPRINT,
1134-
choices: [
1135-
{
1136-
index: 0,
1137-
delta: {
1138-
role: 'assistant',
1139-
reasoning_content: combinedReasoning,
1140-
},
1141-
finish_reason: null,
1142-
logprobs: null,
1143-
},
1144-
],
1145-
};
1146-
res.write(`data: ${JSON.stringify(thoughtChunk)}\n\n`);
1147-
1148-
// Clear the buffer
1149-
reasoningBuffer = [];
1150-
};
1087+
// Use a consistent chunk ID for all chunks in this response (per OpenAI spec)
1088+
const chunkId = `chatcmpl-${requestId}`;
11511089
11521090
const callback = (notification: SessionNotification): void => {
11531091
const update = notification.update;
@@ -1186,13 +1124,23 @@ function createStreamCallback(res: ServerResponse, model: string, requestId: str
11861124
11871125
case 'agent_message_chunk':
11881126
if (update.content?.type === 'text' && update.content.text) {
1189-
// Flush any buffered reasoning before sending text content
1190-
// This ensures reasoning appears BEFORE text in the UI
1191-
if (!hasStartedTextContent) {
1192-
hasStartedTextContent = true;
1193-
flushReasoningBuffer();
1194-
}
1195-
res.write(createStreamChunk(update.content.text, model));
1127+
// Send text content immediately - preserves natural ordering
1128+
const textChunk = {
1129+
id: chunkId,
1130+
object: 'chat.completion.chunk',
1131+
created: timestamp,
1132+
model,
1133+
system_fingerprint: SYSTEM_FINGERPRINT,
1134+
choices: [
1135+
{
1136+
index: 0,
1137+
delta: { content: update.content.text },
1138+
finish_reason: null,
1139+
logprobs: null,
1140+
},
1141+
],
1142+
};
1143+
res.write(`data: ${JSON.stringify(textChunk)}\n\n`);
11961144
}
11971145
break;
11981146
@@ -1203,38 +1151,26 @@ function createStreamCallback(res: ServerResponse, model: string, requestId: str
12031151
`[${requestId}] 💭 Thinking: ${text.substring(0, 100)}${text.length > 100 ? '...' : ''}`
12041152
);
12051153
1206-
// If we haven't started text content yet, buffer the reasoning
1207-
// This handles the case where thought chunks come interleaved with message chunks
1208-
if (!hasStartedTextContent) {
1209-
reasoningBuffer.push(text);
1210-
} else {
1211-
// If text has already started and we get more reasoning,
1212-
// we need to send it immediately as a new reasoning block.
1213-
// However, this is suboptimal - the UI may show it at the end.
1214-
// Log a warning for debugging.
1215-
console.log(
1216-
`[${requestId}] ⚠️ Late reasoning chunk received after text started - may appear at end of output`
1217-
);
1218-
const thoughtChunk = {
1219-
id: `chatcmpl-${requestId}`,
1220-
object: 'chat.completion.chunk',
1221-
created: timestamp,
1222-
model,
1223-
system_fingerprint: SYSTEM_FINGERPRINT,
1224-
choices: [
1225-
{
1226-
index: 0,
1227-
delta: {
1228-
role: 'assistant',
1229-
reasoning_content: text,
1230-
},
1231-
finish_reason: null,
1232-
logprobs: null,
1154+
// Stream reasoning chunks immediately to preserve interleaved ordering
1155+
// (think → text → think → text) so they appear inline in OpenCode
1156+
const thoughtChunk = {
1157+
id: chunkId,
1158+
object: 'chat.completion.chunk',
1159+
created: timestamp,
1160+
model,
1161+
system_fingerprint: SYSTEM_FINGERPRINT,
1162+
choices: [
1163+
{
1164+
index: 0,
1165+
delta: {
1166+
reasoning_content: text,
12331167
},
1234-
],
1235-
};
1236-
res.write(`data: ${JSON.stringify(thoughtChunk)}\n\n`);
1237-
}
1168+
finish_reason: null,
1169+
logprobs: null,
1170+
},
1171+
],
1172+
};
1173+
res.write(`data: ${JSON.stringify(thoughtChunk)}\n\n`);
12381174
}
12391175
break;
12401176
@@ -1333,7 +1269,6 @@ function createStreamCallback(res: ServerResponse, model: string, requestId: str
13331269
13341270
return {
13351271
callback,
1336-
flush: flushReasoningBuffer,
13371272
};
13381273
}
13391274
@@ -1405,10 +1340,6 @@ async function callAugmentAPIStreamingInternal(
14051340
hasError = true;
14061341
caughtError = err as Error;
14071342
} finally {
1408-
// Flush any buffered reasoning content before ending the stream
1409-
// This handles the case where reasoning was received but no text content followed
1410-
streamHandler.flush();
1411-
14121343
client.onSessionUpdate(null);
14131344
// Discard client on session errors or aborts, otherwise return to pool
14141345
if (hasError && caughtError) {
@@ -1655,7 +1586,16 @@ async function handleChatCompletions(req: IncomingMessage, res: ServerResponse):
16551586
16561587
try {
16571588
await callAugmentAPIStreaming(prompt, model, res, requestId, model, workspaceRoot ?? undefined, abortController.signal);
1658-
res.write(createStreamChunk('', model, true));
1589+
// Send final stop chunk with consistent ID
1590+
const stopChunk = {
1591+
id: `chatcmpl-${requestId}`,
1592+
object: 'chat.completion.chunk',
1593+
created: Math.floor(Date.now() / 1000),
1594+
model: model || DEFAULT_MODEL,
1595+
system_fingerprint: SYSTEM_FINGERPRINT,
1596+
choices: [{ index: 0, delta: {}, finish_reason: 'stop', logprobs: null }],
1597+
};
1598+
res.write(`data: ${JSON.stringify(stopChunk)}\n\n`);
16591599
res.write('data: [DONE]\n\n');
16601600
cleanup(true);
16611601
} catch (err) {

0 commit comments

Comments
 (0)