VLLMModel fix whitespace stripping and unwarranted spaces (#70)

bxyu-nvidia · abhibha-nvidia · commit c70fea486cad · 2025-09-28T11:05:03.000-07:00
Signed-off-by: Brian Yu <bxyu@nvidia.com>
Signed-off-by: Abhibha Gupta <abhibhag@nvidia.com>
diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
@@ -259,7 +259,7 @@ def _parse_think_tags(cls, content: str) -> Tuple[List[str], str]:
         # Extract reasoning content from between <think></think> tags.
         matches = cls.THINK_TAG_PATTERN.findall(content)
         # Remove reasoning from main content
-        cleaned = cls.THINK_TAG_PATTERN.sub("", content).strip()
+        cleaned = cls.THINK_TAG_PATTERN.sub("", content)
         return matches, cleaned
 
     # =======================================================
@@ -375,7 +375,7 @@ def _format_message(
                 # Handle reasoning
                 final_content = ""
                 if isinstance(m["content"], list):
-                    content_str = " ".join([part.get("text", "") for part in m["content"]])
+                    content_str = "".join([part.get("text", "") for part in m["content"]])
                     final_content += content_str
                 elif isinstance(m["content"], str):
                     final_content += m["content"]
@@ -463,8 +463,7 @@ def postprocess_chat_response(self, choice: NeMoGymChoice) -> List[NeMoGymRespon
                 id=f"rs_{uuid4().hex}",
                 type="reasoning",
                 summary=[
-                    NeMoGymSummary(text=reasoning_text.strip(), type="summary_text")
-                    for reasoning_text in reasoning_matches
+                    NeMoGymSummary(text=reasoning_text, type="summary_text") for reasoning_text in reasoning_matches
                 ],
                 status="completed",
             )
diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py
@@ -18,6 +18,7 @@
 from fastapi.testclient import TestClient
 from pytest import MonkeyPatch, mark
 
+import nemo_gym.server_utils
 from nemo_gym import PARENT_DIR
 from nemo_gym.openai_utils import (
     NeMoGymAsyncOpenAI,
@@ -659,7 +660,7 @@ class FakeUUID:
 
 
 class TestApp:
-    def _setup_server(self):
+    def _setup_server(self, monkeypatch: MonkeyPatch):
         config = VLLMModelConfig(
             host="0.0.0.0",
             port=8081,
@@ -670,13 +671,18 @@ def _setup_server(self):
             name="",
             return_token_id_information=False,
         )
+
+        get_global_config_dict_mock = MagicMock()
+        get_global_config_dict_mock.return_value = dict()
+        monkeypatch.setattr(nemo_gym.server_utils, "get_global_config_dict", get_global_config_dict_mock)
+
         return VLLMModel(config=config, server_client=MagicMock(spec=ServerClient))
 
-    async def test_sanity(self) -> None:
-        self._setup_server()
+    async def test_sanity(self, monkeypatch: MonkeyPatch) -> None:
+        self._setup_server(monkeypatch)
 
     def test_responses_multistep(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server()
+        server = self._setup_server(monkeypatch)
         app = server.setup_webserver()
         client = TestClient(app)
 
@@ -881,7 +887,7 @@ def _standardize(messages: list) -> list:
         assert expected_sent_tools == actual_sent_tools
 
     def test_responses_multiturn(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server()
+        server = self._setup_server(monkeypatch)
         app = server.setup_webserver()
         client = TestClient(app)
 
@@ -1019,7 +1025,7 @@ def test_responses_multiturn(self, monkeypatch: MonkeyPatch):
         assert expected_sent_messages == sent_messages
 
     def test_responses_multistep_multiturn(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server()
+        server = self._setup_server(monkeypatch)
         app = server.setup_webserver()
         client = TestClient(app)
 
@@ -1375,7 +1381,7 @@ def test_responses_e2e(
         Test entire pipeline from api endpoint -> final output:
         Response Create Params -> Response
         """
-        server = self._setup_server()
+        server = self._setup_server(monkeypatch)
         app = server.setup_webserver()
         client = TestClient(app)
 
@@ -1425,7 +1431,7 @@ def test_responses_to_chat_completion_create_params(
         Tests conversion from api endpoint -> internal request schema
         Response Params -> Chat Completion Params
         """
-        server = self._setup_server()
+        server = self._setup_server(monkeypatch)
         app = server.setup_webserver()
         client = TestClient(app)
 
@@ -2026,3 +2032,115 @@ def test_round_trip_chat_completions_return_token_id_information(self) -> None:
 
         expected_output = test_data["expected_output_return_token_id_information"]
         assert expected_output == chat_completion_create_params.model_dump()
+
+    def test_whitespace_round_trip_chat_completions(self, monkeypatch: MonkeyPatch) -> None:
+        monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID())
+
+        message = NeMoGymChatCompletionMessage(
+            content="<think> \n \n I'm thinking \n \n </think> \n \n I'm chatting! \n \n ",
+            role="assistant",
+            tool_calls=[
+                NeMoGymChatCompletionMessageToolCall(
+                    id="tool call 1",
+                    function=NeMoGymFunction(name="get_weather", arguments='{"city_name": "new york"}'),
+                    type="function",
+                ),
+                NeMoGymChatCompletionMessageToolCall(
+                    id="tool call 2",
+                    function=NeMoGymFunction(name="get_weather", arguments='{"city_name": "boston"}'),
+                    type="function",
+                ),
+            ],
+        )
+        actual_response_output_items = self.converter.postprocess_chat_response(
+            choice=NeMoGymChoice(
+                finish_reason="tool_calls",
+                index=0,
+                message=message,
+            )
+        )
+        expected_response_output_items = [
+            NeMoGymResponseReasoningItem(
+                id="rs_123",
+                summary=[NeMoGymSummary(text=" \n \n I'm thinking \n \n ", type="summary_text")],
+                type="reasoning",
+                encrypted_content=None,
+            ),
+            NeMoGymResponseOutputMessage(
+                id="msg_123",
+                content=[
+                    NeMoGymResponseOutputText(
+                        annotations=[], text=" \n \n I'm chatting! \n \n ", type="output_text", logprobs=None
+                    )
+                ],
+                role="assistant",
+                status="completed",
+                type="message",
+            ),
+            NeMoGymResponseFunctionToolCall(
+                arguments='{"city_name": "new york"}',
+                call_id="tool call 1",
+                name="get_weather",
+                type="function_call",
+                id="tool call 1",
+                status="completed",
+            ),
+            NeMoGymResponseFunctionToolCall(
+                arguments='{"city_name": "boston"}',
+                call_id="tool call 2",
+                name="get_weather",
+                type="function_call",
+                id="tool call 2",
+                status="completed",
+            ),
+        ]
+        assert expected_response_output_items == actual_response_output_items
+
+        chat_completion_create_params = self.converter.responses_to_chat_completion_create_params(
+            responses_create_params=NeMoGymResponseCreateParamsNonStreaming(
+                input=[
+                    NeMoGymEasyInputMessage(
+                        content=" \n \n system \n \n ",
+                        role="system",
+                    ),
+                    NeMoGymEasyInputMessage(
+                        content=" \n \n hello! \n \n ",
+                        role="user",
+                    ),
+                    *actual_response_output_items,
+                ],
+            )
+        )
+        actual_messages = chat_completion_create_params.messages
+
+        expected_messages = [
+            NeMoGymChatCompletionSystemMessageParam(
+                content=" \n \n system \n \n ",
+                role="system",
+            ),
+            NeMoGymChatCompletionUserMessageParam(
+                content=" \n \n hello! \n \n ",
+                role="user",
+            ),
+            NeMoGymChatCompletionAssistantMessageParam(
+                role="assistant",
+                content="<think> \n \n I'm thinking \n \n </think> \n \n I'm chatting! \n \n ",
+                tool_calls=[
+                    NeMoGymChatCompletionMessageToolCallParam(
+                        id="tool call 1",
+                        function=NeMoGymChatCompletionMessageToolCallFunctionParam(
+                            name="get_weather", arguments='{"city_name": "new york"}'
+                        ),
+                        type="function",
+                    ),
+                    NeMoGymChatCompletionMessageToolCallParam(
+                        id="tool call 2",
+                        function=NeMoGymChatCompletionMessageToolCallFunctionParam(
+                            name="get_weather", arguments='{"city_name": "boston"}'
+                        ),
+                        type="function",
+                    ),
+                ],
+            ),
+        ]
+        assert expected_messages == actual_messages