 import asyncio
 import atexit
 import json
+import resource
 from abc import abstractmethod
 from contextlib import asynccontextmanager
 from io import StringIO
 from os import getenv
 from pathlib import Path
 from threading import Thread
+from traceback import print_exc
 from typing import Literal, Optional, Tuple, Type, Union, Unpack
 from uuid import uuid4

 from aiohttp import ClientResponse, ClientSession, ClientTimeout, DummyCookieJar, ServerDisconnectedError, TCPConnector
 from aiohttp.client import _RequestOptions
 from fastapi import FastAPI, Request, Response
+from fastapi.responses import JSONResponse
 from omegaconf import DictConfig, OmegaConf
 from pydantic import BaseModel, ConfigDict
 from requests.exceptions import ConnectionError
@@ -62,7 +65,7 @@ class GlobalAIOHTTPAsyncClientConfig(BaseModel):
 def get_global_aiohttp_client(
     global_config_dict_parser_config: Optional[GlobalConfigDictParserConfig] = None,
     global_config_dict_parser_cls: Type[GlobalConfigDictParser] = GlobalConfigDictParser,
-) -> ClientSession:
+) -> ClientSession:  # pragma: no cover
     global _GLOBAL_AIOHTTP_CLIENT

     if _GLOBAL_AIOHTTP_CLIENT is not None:
@@ -77,7 +80,7 @@ def get_global_aiohttp_client(
     return set_global_aiohttp_client(cfg)


-def set_global_aiohttp_client(cfg: GlobalAIOHTTPAsyncClientConfig) -> ClientSession:
+def set_global_aiohttp_client(cfg: GlobalAIOHTTPAsyncClientConfig) -> ClientSession:  # pragma: no cover
     assert not is_global_aiohttp_client_setup(), (
         "There is already a global aiohttp client set up. Please refactor your code, or call `global_aiohttp_client_exit` if you want to explicitly re-create the client!"
     )
@@ -97,11 +100,11 @@ def set_global_aiohttp_client(cfg: GlobalAIOHTTPAsyncClientConfig) -> ClientSession:
     return _GLOBAL_AIOHTTP_CLIENT


-def is_global_aiohttp_client_setup() -> bool:
+def is_global_aiohttp_client_setup() -> bool:  # pragma: no cover
     return _GLOBAL_AIOHTTP_CLIENT is not None


-def global_aiohttp_client_exit():
+def global_aiohttp_client_exit():  # pragma: no cover
     if not is_global_aiohttp_client_setup():
         return

@@ -118,7 +121,9 @@ def global_aiohttp_client_exit():
 MAX_NUM_TRIES = 3


-async def request(method: str, url: str, **kwargs: Unpack[_RequestOptions]) -> ClientResponse:
+async def request(
+    method: str, url: str, _internal: bool = False, **kwargs: Unpack[_RequestOptions]
+) -> ClientResponse:  # pragma: no cover
     client = get_global_aiohttp_client()
     num_tries = 1
     while True:
@@ -127,18 +132,27 @@ async def request(method: str, url: str, **kwargs: Unpack[_RequestOptions]) -> ClientResponse:
         except ServerDisconnectedError:
             await asyncio.sleep(0.5)
         except Exception as e:
-            print(
-                f"""Hit an exception while making a request (try {num_tries}): {type(e)}: {e}
+            # Don't count this retry for internal requests since we know we are ok; if we are not, the head server will shut everything down anyway.
+            if not _internal:
+                print(
+                    f"""Hit an exception while making a request (try {num_tries}): {type(e)}: {e}
 Sleeping 0.5s and retrying...
 """
-            )
-            if num_tries >= MAX_NUM_TRIES:
-                raise e
+                )
+                if num_tries >= MAX_NUM_TRIES:
+                    raise e
+
+                num_tries += 1

-            num_tries += 1
             await asyncio.sleep(0.5)


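+# Log the failing response before raising so the error is visible in the server logs.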
+def raise_for_status(response: ClientResponse) -> None:  # pragma: no cover
+    if not response.ok:
+        print(response.content)
+        response.raise_for_status()
+
+
 DEFAULT_HEAD_SERVER_PORT = 11000

 ServerStatus = Union[Literal["success"], Literal["connection_error"], Literal["timeout"], Literal["unknown_error"]]
@@ -193,7 +207,7 @@ async def request(
         if isinstance(json_obj, BaseModel):
             kwargs["json"] = json_obj.model_dump(exclude_unset=True)

-        return await request(method=method, url=f"{base_url}{url_path}", **kwargs)
+        return await request(method=method, url=f"{base_url}{url_path}", _internal=True, **kwargs)

     async def get(
         self,
@@ -324,6 +338,24 @@ async def add_session_id(request: Request, call_next): # pragma: no cover
         session_middleware_key = self.get_session_middleware_key()
         app.add_middleware(SessionMiddleware, secret_key=session_middleware_key, session_cookie=session_middleware_key)

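+    # Catch-all middleware: log any unhandled exception and turn it into a 500 JSON response instead of letting the request fail silently.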
+    def setup_exception_middleware(self, app: FastAPI) -> None:  # pragma: no cover
+        @app.middleware("http")
+        async def exception_handling_middleware(request: Request, call_next):
+            try:
+                return await call_next(request)
+            except Exception as e:
+                print_exc()
+                print(
+                    f"🚨 Caught an exception (printed above) in {self.config.name} ({self.__class__.__name__}). If you expect this to be fed back into the model, the exception repr, i.e. `repr(e)`, is what gets returned. Please make sure this exception is caught in your server and returned to the model as appropriate. See https://fastapi.tiangolo.com/tutorial/handling-errors/#use-httpexception"
+                )
+                return JSONResponse(content=repr(e), status_code=500)
+            except:
+                print_exc()
+                print(
+                    f"🚨 Caught an unknown exception (printed above) in {self.config.name} ({self.__class__.__name__}). If you expect this to be fed back into the model, nothing meaningful is returned. Please make sure this exception is caught in your server and returned to the model as appropriate. See https://fastapi.tiangolo.com/tutorial/handling-errors/#use-httpexception"
+                )
+                return JSONResponse(content="An unknown error occurred", status_code=500)
+
     def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None:  # pragma: no cover
         base_profile_dir = Path(PARENT_DIR) / profiling_config.profiling_results_dirpath
         server_profile_path = (base_profile_dir / self.get_session_middleware_key()).with_suffix(".log")
@@ -332,18 +364,7 @@ def setup_profiling(self, app: FastAPI, profiling_config: ProfilingMiddlewareConfig) -> None:

         main_app_lifespan = app.router.lifespan_context

-        @asynccontextmanager
-        async def lifespan_wrapper(app):
-            yappi.set_clock_type("WALL")
-            yappi.start()
-            print(f"🔍 Enabled profiling for {self.config.name}")
-
-            async with main_app_lifespan(app) as maybe_state:
-                yield maybe_state
-
-            print(f"🛑 Stopping profiler for {self.config.name}. Check {server_profile_path} for the metrics!")
-            yappi.stop()
-
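+        # Render yappi's stats to a string, keeping the header plus only the rows that mention this server's entrypoint.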
+        def _dump_yappi_stats() -> str:
             buffer = StringIO()
             yappi.get_func_stats().print_all(
                 out=buffer,
@@ -357,17 +378,56 @@ async def lifespan_wrapper(app):
             )

             buffer.seek(0)
-            with open(server_profile_path, "w") as f:
-                past_header = False
-                for line in buffer:
-                    if not past_header or self.config.entrypoint in line:
-                        f.write(line)
+            res = ""
+            past_header = False
+            for line in buffer:
+                if not past_header or self.config.entrypoint in line:
+                    res += line
+
+                if line.startswith("name"):
+                    past_header = True

-                    if line.startswith("name"):
-                        past_header = True
+            return res
+
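+        # Profile the whole app lifespan: start yappi at startup, stop it at shutdown, and persist the filtered stats to the per-server log file.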
+        @asynccontextmanager
+        async def lifespan_wrapper(app):
+            yappi.set_clock_type("CPU")
+            yappi.start()
+            print(f"🔍 Enabled profiling for {self.config.name}")
+
+            async with main_app_lifespan(app) as maybe_state:
+                yield maybe_state
+
+            print(f"🛑 Stopping profiler for {self.config.name}. Check {server_profile_path} for the metrics!")
+            yappi.stop()
+
+            with open(server_profile_path, "w") as f:
+                f.write(_dump_yappi_stats())

         app.router.lifespan_context = lifespan_wrapper

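+        # Also expose the current profile over HTTP so it can be inspected while the server is running.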
+        @app.get("/stats")
+        def stats():
+            return Response(_dump_yappi_stats())
+
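+    # Raise the soft RLIMIT_NOFILE so the server does not run out of file descriptors under many concurrent connections.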
+    def set_ulimit(self, target_soft_limit: int = 65535):  # pragma: no cover
+        # From https://github.com/vllm-project/vllm/blob/fed8a9b107df3e27d57728c6911c7d308b871477/vllm/utils/__init__.py#L2790
+        resource_type = resource.RLIMIT_NOFILE
+        current_soft, current_hard = resource.getrlimit(resource_type)
+
+        if current_soft < target_soft_limit:
+            try:
+                resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+            except ValueError as e:
+                print(
+                    f"Found ulimit of {current_soft} and failed to automatically increase "
+                    f"with error {e}. This can cause fd limit errors like "
+                    "`OSError: [Errno 24] Too many open files`. Consider "
+                    "increasing it with `ulimit -n`."
+                )
+
     @classmethod
     def run_webserver(cls) -> None:  # pragma: no cover
         global_config_dict = get_global_config_dict()
@@ -380,6 +440,8 @@ def run_webserver(cls) -> None: # pragma: no cover
         server = cls(config=server_config, server_client=server_client)

         app = server.setup_webserver()
+        server.set_ulimit()
+        server.setup_exception_middleware(app)

         profiling_config = ProfilingMiddlewareConfig.model_validate(global_config_dict)
         if profiling_config.profiling_enabled: