From c9276a8009df1bfff899caeb10410d736e3ca867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 11 Jan 2026 22:27:32 -0800 Subject: [PATCH 01/12] feat: add GET /manifest endpoint for mothership service discovery Implement manifest endpoint on LoadBalancer handlers to serve flash_manifest.json for cross-endpoint routing. The endpoint is conditionally registered when FLASH_IS_MOTHERSHIP=true environment variable is set, enabling child endpoints to fetch function/resource metadata from the mothership. Changes: - Add /manifest to reserved paths in manifest builder - Implement conditional GET /manifest endpoint in lb_handler factory - Returns 200 with manifest JSON on success, 404 if not found - Endpoint only registers for LoadBalancer resources with env var set - Add comprehensive unit and integration tests (18 unit, 4 integration) --- .../cli/commands/build_utils/manifest.py | 2 +- src/tetra_rp/runtime/lb_handler.py | 38 ++- tests/integration/test_lb_remote_execution.py | 146 +++++++++ tests/unit/runtime/test_lb_handler.py | 307 ++++++++++++++++++ 4 files changed, 490 insertions(+), 3 deletions(-) create mode 100644 tests/unit/runtime/test_lb_handler.py diff --git a/src/tetra_rp/cli/commands/build_utils/manifest.py b/src/tetra_rp/cli/commands/build_utils/manifest.py index 2664150f..bd0261cd 100644 --- a/src/tetra_rp/cli/commands/build_utils/manifest.py +++ b/src/tetra_rp/cli/commands/build_utils/manifest.py @@ -8,7 +8,7 @@ from .scanner import RemoteFunctionMetadata -RESERVED_PATHS = ["/execute", "/ping"] +RESERVED_PATHS = ["/execute", "/ping", "/manifest"] @dataclass diff --git a/src/tetra_rp/runtime/lb_handler.py b/src/tetra_rp/runtime/lb_handler.py index 82fec707..4193dd02 100644 --- a/src/tetra_rp/runtime/lb_handler.py +++ b/src/tetra_rp/runtime/lb_handler.py @@ -1,8 +1,10 @@ """Factory for creating FastAPI load-balanced handlers. This module provides the factory function for generating FastAPI applications -that handle load-balanced serverless endpoints. It supports both user-defined -HTTP routes and the framework's /execute endpoint for @remote function execution. +that handle load-balanced serverless endpoints. It supports: +- User-defined HTTP routes +- /execute endpoint for @remote function execution (LiveLoadBalancer only) +- /manifest endpoint for mothership service discovery (when FLASH_IS_MOTHERSHIP=true) Security Model: The /execute endpoint accepts and executes serialized function code. This is @@ -13,14 +15,20 @@ 4. In production, API authentication should protect the /execute endpoint Users should NOT expose the /execute endpoint to untrusted clients. + + The /manifest endpoint returns deployment metadata and is safe to expose + publicly as it contains only structural information about deployed functions. """ import inspect import logging +import os from typing import Any, Callable, Dict from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse +from .generic_handler import load_manifest from .serialization import ( deserialize_args, deserialize_kwargs, @@ -163,6 +171,32 @@ async def execute_remote_function(request: Request) -> Dict[str, Any]: logger.error(f"Unexpected error in /execute endpoint: {e}") return {"success": False, "error": f"Unexpected error: {e}"} + # Register /manifest endpoint for mothership discovery (if enabled) + if os.getenv("FLASH_IS_MOTHERSHIP", "").lower() == "true": + + @app.get("/manifest") + async def get_manifest() -> JSONResponse: + """Mothership discovery endpoint. + + Returns the flash_manifest.json content for service discovery. + Only available when FLASH_IS_MOTHERSHIP=true environment variable is set. + + Returns: + JSONResponse with manifest content or 404 if not found + """ + manifest_dict = load_manifest() + + if not manifest_dict or not manifest_dict.get("resources"): + return JSONResponse( + status_code=404, + content={ + "error": "Manifest not found", + "detail": "flash_manifest.json could not be loaded", + }, + ) + + return JSONResponse(status_code=200, content=manifest_dict) + # Register user-defined routes from registry for (method, path), handler in route_registry.items(): method_upper = method.upper() diff --git a/tests/integration/test_lb_remote_execution.py b/tests/integration/test_lb_remote_execution.py index 4d34abf3..2aca85d0 100644 --- a/tests/integration/test_lb_remote_execution.py +++ b/tests/integration/test_lb_remote_execution.py @@ -304,3 +304,149 @@ def get_status(): assert scanner.resource_types["test-api"] == "LiveLoadBalancer" assert "deployed-api" in scanner.resource_types assert scanner.resource_types["deployed-api"] == "LoadBalancerSlsResource" + + +class TestManifestEndpointIntegration: + """Integration tests for GET /manifest endpoint.""" + + def test_manifest_endpoint_in_live_load_balancer(self, monkeypatch): + """Test manifest endpoint in LiveLoadBalancer with FLASH_IS_MOTHERSHIP=true.""" + from unittest.mock import patch + from fastapi.testclient import TestClient + + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + # Create a LiveLoadBalancer + lb = LiveLoadBalancer(name="test-mothership") + + # Define a simple function on the mothership + @remote(lb, method="GET", path="/api/hello") + async def hello(): + return {"message": "hello"} + + # Create manifest data + test_manifest = { + "version": "1.0", + "generated_at": "2024-01-15T10:30:00Z", + "project_name": "test-app", + "resources": { + "test-mothership": { + "resource_type": "LiveLoadBalancer", + "handler_file": "handler_test_mothership.py", + "functions": [ + { + "name": "hello", + "module": "test_module", + "is_async": True, + "is_class": False, + "http_method": "GET", + "http_path": "/api/hello", + } + ], + } + }, + "function_registry": {"hello": "test-mothership"}, + "routes": {"test-mothership": {"GET /api/hello": "hello"}}, + } + + # Mock load_manifest to return test manifest + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest + ): + from tetra_rp.runtime.lb_handler import create_lb_handler + + # Create handler with manifest endpoint enabled + route_registry = {("GET", "/api/hello"): hello} + app = create_lb_handler(route_registry, include_execute=True) + client = TestClient(app) + + # Verify /manifest endpoint returns manifest + response = client.get("/manifest") + assert response.status_code == 200 + assert response.json() == test_manifest + + def test_manifest_endpoint_excluded_when_env_not_set(self): + """Test manifest endpoint is not available when FLASH_IS_MOTHERSHIP not set.""" + from fastapi.testclient import TestClient + from tetra_rp.runtime.lb_handler import create_lb_handler + + # Create handler without env var set + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + # Verify /manifest returns 404 + response = client.get("/manifest") + assert response.status_code == 404 + + def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): + """Test manifest endpoint with LoadBalancerSlsResource.""" + from unittest.mock import patch + from fastapi.testclient import TestClient + + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + # Create test manifest for deployed endpoint + test_manifest = { + "version": "1.0", + "generated_at": "2024-01-15T10:30:00Z", + "project_name": "deployed-app", + "resources": { + "gpu-worker": { + "resource_type": "LoadBalancerSlsResource", + "handler_file": "handler_gpu_worker.py", + "functions": [ + { + "name": "process_image", + "module": "workers.gpu", + "is_async": True, + "is_class": False, + "http_method": "POST", + "http_path": "/api/process", + } + ], + } + }, + "function_registry": {"process_image": "gpu-worker"}, + } + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest + ): + from tetra_rp.runtime.lb_handler import create_lb_handler + + # Create deployed handler (not LiveLoadBalancer) + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + # Verify /manifest endpoint is available + response = client.get("/manifest") + assert response.status_code == 200 + assert response.json() == test_manifest + + def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): + """Test that /manifest endpoint coexists with /ping health check.""" + from unittest.mock import patch + from fastapi.testclient import TestClient + + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + test_manifest = { + "version": "1.0", + "resources": {"test": {}}, + "function_registry": {}, + } + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest + ): + from tetra_rp.runtime.lb_handler import create_lb_handler + + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + # Verify both endpoints exist + manifest_response = client.get("/manifest") + assert manifest_response.status_code == 200 + + ping_response = client.get("/ping") + assert ping_response.status_code == 404 # Ping not auto-added by factory diff --git a/tests/unit/runtime/test_lb_handler.py b/tests/unit/runtime/test_lb_handler.py new file mode 100644 index 00000000..e02c6aa0 --- /dev/null +++ b/tests/unit/runtime/test_lb_handler.py @@ -0,0 +1,307 @@ +"""Unit tests for LoadBalancer handler factory.""" + +from unittest.mock import patch + +import pytest +from fastapi.testclient import TestClient + +from tetra_rp.runtime.lb_handler import create_lb_handler + + +class TestManifestEndpoint: + """Tests for GET /manifest endpoint.""" + + @pytest.fixture + def sample_manifest(self): + """Sample manifest for testing.""" + return { + "version": "1.0", + "generated_at": "2024-01-15T10:30:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": { + "resource_type": "LoadBalancerSlsResource", + "handler_file": "handler_gpu_config.py", + "functions": [ + { + "name": "process_image", + "module": "workers.gpu", + "is_async": True, + "is_class": False, + "http_method": "POST", + "http_path": "/api/process", + } + ], + } + }, + "function_registry": {"process_image": "gpu_config"}, + "routes": {"gpu_config": {"POST /api/process": "process_image"}}, + } + + def test_manifest_endpoint_registered_when_env_var_true( + self, sample_manifest, monkeypatch + ): + """Verify /manifest endpoint exists when FLASH_IS_MOTHERSHIP=true.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest + ): + app = create_lb_handler({}, include_execute=False) + routes = [route.path for route in app.routes] + + assert "/manifest" in routes + + def test_manifest_endpoint_not_registered_when_env_var_false( + self, sample_manifest, monkeypatch + ): + """Verify /manifest endpoint doesn't exist when FLASH_IS_MOTHERSHIP=false.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "false") + + app = create_lb_handler({}, include_execute=False) + routes = [route.path for route in app.routes] + + assert "/manifest" not in routes + + def test_manifest_endpoint_not_registered_when_env_var_missing( + self, sample_manifest + ): + """Verify /manifest endpoint doesn't exist when env var not set.""" + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + assert response.status_code == 404 + + def test_manifest_endpoint_returns_200_with_valid_manifest( + self, sample_manifest, monkeypatch + ): + """Test happy path - endpoint returns 200 with valid manifest.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest + ): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + + assert response.status_code == 200 + assert response.json() == sample_manifest + + def test_manifest_endpoint_returns_404_when_manifest_missing(self, monkeypatch): + """Test endpoint returns 404 when manifest file not found.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch("tetra_rp.runtime.lb_handler.load_manifest", return_value={}): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + + assert response.status_code == 404 + data = response.json() + assert data["error"] == "Manifest not found" + assert "could not be loaded" in data["detail"] + + def test_manifest_endpoint_case_insensitive_env_var_true( + self, sample_manifest, monkeypatch + ): + """Test endpoint registration with different case variations of 'true'.""" + for env_value in ["True", "TRUE", "TrUe"]: + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", env_value) + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", + return_value=sample_manifest, + ): + app = create_lb_handler({}, include_execute=False) + routes = [route.path for route in app.routes] + + assert "/manifest" in routes + + def test_manifest_endpoint_case_insensitive_env_var_false(self, monkeypatch): + """Test endpoint not registered with non-'true' values.""" + for env_value in ["False", "false", "yes", "1", ""]: + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", env_value) + + app = create_lb_handler({}, include_execute=False) + routes = [route.path for route in app.routes] + + assert "/manifest" not in routes + + def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch): + """Test that manifest response has correct structure.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest + ): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + data = response.json() + + # Verify structure + assert "version" in data + assert "generated_at" in data + assert "project_name" in data + assert "resources" in data + assert "function_registry" in data + + def test_manifest_endpoint_with_empty_resources(self, monkeypatch): + """Test endpoint behavior when manifest has no resources.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + empty_manifest = { + "version": "1.0", + "project_name": "test", + "resources": {}, + "function_registry": {}, + } + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=empty_manifest + ): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + + # Should return 404 if no resources + assert response.status_code == 404 + + def test_manifest_endpoint_with_none_manifest(self, monkeypatch): + """Test endpoint behavior when load_manifest returns None.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch("tetra_rp.runtime.lb_handler.load_manifest", return_value=None): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + + assert response.status_code == 404 + + def test_manifest_endpoint_coexists_with_execute( + self, sample_manifest, monkeypatch + ): + """Test that /manifest endpoint coexists with /execute endpoint.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest + ): + app = create_lb_handler({}, include_execute=True) + routes = [route.path for route in app.routes] + + assert "/manifest" in routes + assert "/execute" in routes + + def test_manifest_endpoint_coexists_with_user_routes( + self, sample_manifest, monkeypatch + ): + """Test that /manifest endpoint coexists with user-defined routes.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + async def dummy_handler(): + return {"result": "ok"} + + route_registry = {("GET", "/api/health"): dummy_handler} + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest + ): + app = create_lb_handler(route_registry, include_execute=False) + routes = [route.path for route in app.routes] + + assert "/manifest" in routes + assert "/api/health" in routes + + def test_manifest_endpoint_content_type(self, sample_manifest, monkeypatch): + """Test that /manifest endpoint returns proper JSON content-type.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest + ): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + + assert response.headers["content-type"] == "application/json" + + def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): + """Test endpoint with complex multi-resource manifest.""" + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + complex_manifest = { + "version": "1.0", + "generated_at": "2024-01-15T10:30:00Z", + "project_name": "complex-app", + "resources": { + "gpu_config": { + "resource_type": "LoadBalancerSlsResource", + "handler_file": "handler_gpu.py", + "functions": [ + { + "name": "process_gpu", + "module": "workers.gpu", + "is_async": True, + "is_class": False, + } + ], + }, + "cpu_config": { + "resource_type": "ServerlessEndpoint", + "handler_file": "handler_cpu.py", + "functions": [ + { + "name": "process_cpu", + "module": "workers.cpu", + "is_async": True, + "is_class": False, + } + ], + }, + }, + "function_registry": { + "process_gpu": "gpu_config", + "process_cpu": "cpu_config", + }, + } + + with patch( + "tetra_rp.runtime.lb_handler.load_manifest", return_value=complex_manifest + ): + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + response = client.get("/manifest") + + assert response.status_code == 200 + data = response.json() + assert len(data["resources"]) == 2 + assert "gpu_config" in data["resources"] + assert "cpu_config" in data["resources"] + + +class TestExecuteEndpointStillWorks: + """Tests to ensure /execute endpoint still works after manifest changes.""" + + def test_execute_endpoint_still_available_with_live_load_balancer(self): + """Verify /execute endpoint is still registered for LiveLoadBalancer.""" + app = create_lb_handler({}, include_execute=True) + routes = [route.path for route in app.routes] + + assert "/execute" in routes + + def test_execute_endpoint_not_included_for_deployed(self): + """Verify /execute endpoint is not registered for deployed LoadBalancer.""" + app = create_lb_handler({}, include_execute=False) + routes = [route.path for route in app.routes] + + assert "/execute" not in routes From 11fecb2fbb8c1bb317e3cbf25e8ea1c07208fb33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 02:30:09 -0800 Subject: [PATCH 02/12] docs: convert ASCII diagrams to MermaidJS - Local Execution Flow: Shows synchronous path for functions in manifest - Remote Execution Flow: Shows serialization, HTTP, and deserialization steps - Manifest Synchronization: Shows cache-first approach with GQL fallback Uses high-contrast MermaidJS styling with saturated colors and white text for maximum readability as per project guidelines. --- docs/Cross_Endpoint_Routing.md | 191 ++++++++++++++++++++++++++------- 1 file changed, 155 insertions(+), 36 deletions(-) diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index 6d059636..1a4330c6 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -640,46 +640,70 @@ Add new configuration by: #### Local Execution Flow -``` -Function Call - ↓ -ProductionWrapper.wrap_function_execution() - ↓ -ServiceRegistry.get_resource_for_function() - ↓ -Manifest Lookup (resource = None) - ↓ -Local Execution (original_stub_func) - ↓ -Result +```mermaid +flowchart TD + A["Function Call"] + B["ProductionWrapper.wrap_function_execution()"] + C["ServiceRegistry.get_resource_for_function()"] + D["Manifest Lookup
resource = None"] + E["Local Execution
original_stub_func"] + F["Result"] + + A --> B + B --> C + C --> D + D --> E + E --> F + + style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style B fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style C fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style D fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style E fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style F fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff ``` #### Remote Execution Flow -``` -Function Call - ↓ -ProductionWrapper.wrap_function_execution() - ↓ -ServiceRegistry.get_resource_for_function() - ↓ -Manifest Lookup (resource found) - ↓ -Ensure Directory Loaded - ↓ -DirectoryClient.get_endpoints() - ↓ -Get Remote Endpoint URL - ↓ -Serialize Arguments (cloudpickle → base64) - ↓ -HTTP POST to Remote Endpoint - ↓ -Remote Function Execution - ↓ -Deserialize Result (base64 → cloudpickle) - ↓ -Result +```mermaid +flowchart TD + A["Function Call"] + B["ProductionWrapper.wrap_function_execution()"] + C["ServiceRegistry.get_resource_for_function()"] + D["Manifest Lookup
resource found"] + E["Ensure Directory Loaded"] + F["DirectoryClient.get_endpoints()"] + G["Get Remote Endpoint URL"] + H["Serialize Arguments
cloudpickle → base64"] + I["HTTP POST to Remote Endpoint"] + J["Remote Function Execution"] + K["Deserialize Result
base64 → cloudpickle"] + L["Result"] + + A --> B + B --> C + C --> D + D --> E + E --> F + F --> G + G --> H + H --> I + I --> J + J --> K + K --> L + + style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style B fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style C fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style D fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style E fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style F fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style G fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style H fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff + style I fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style J fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style K fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff + style L fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff ``` ### Design Decisions @@ -920,6 +944,101 @@ client = DirectoryClient(mothership_url=...) endpoints = await client.get_endpoints() ``` +## Manifest Synchronization with RunPod GraphQL API + +### Overview + +The Mothership's GET /manifest endpoint pulls configuration from RunPod's GraphQL API, +which serves as the single source of truth for manifest data. This enables centralized +configuration management and ensures all child endpoints receive consistent routing +information. + +### Architecture + +```mermaid +flowchart TD + A["Child Endpoint
GET /manifest"] + B["Mothership"] + C["ManifestFetcher"] + D{Cache Valid?} + E["Serve Cached
Manifest"] + F["Fetch from RunPod
GraphQL API"] + G["Update
flash_manifest.json"] + H["Cache Result
TTL: 300s"] + I["Serve Manifest"] + J["Fallback:
Load Local File"] + + A -->|Request| B + B --> C + C --> D + D -->|Yes| E + D -->|No| F + E --> I + F --> G + G --> H + H --> I + F -->|Fails| J + J --> I + + style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style B fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style C fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style D fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff + style E fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style F fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style G fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style H fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style I fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style J fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff +``` + +### How It Works + +1. **Source of Truth**: RunPod GraphQL API holds the authoritative manifest configuration +2. **Caching Proxy**: Mothership fetches from RunPod GQL, caches locally (5 min TTL) +3. **Local Persistence**: Fetched manifest written to `flash_manifest.json` +4. **Graceful Fallback**: If RunPod GQL unavailable, serves local file +5. **Cache Invalidation**: Automatic expiry after TTL, manual invalidation supported + +### Implementation Status + +**Current (Placeholder)**: +- `ManifestFetcher` class with caching infrastructure +- Uses existing `RunpodGraphQLClient` for API communication +- Falls back to local `flash_manifest.json` (GQL fetch raises `NotImplementedError`) +- Cache TTL: 300 seconds (configurable) + +**Future (Full Implementation)**: +- Implement `getManifest` query in `ManifestFetcher._fetch_from_gql()` +- Add `saveManifest` mutation for updating manifest in RunPod +- Real-time cache invalidation via webhooks +- Health checks and retry logic + +### Configuration + +```bash +# Enable Mothership mode (required for /manifest endpoint) +export FLASH_IS_MOTHERSHIP=true + +# Optional: Identify this mothership instance +export RUNPOD_ENDPOINT_ID=mothership-prod-1 + +# Required for RunPod GraphQL API access +export RUNPOD_API_KEY=your-api-key-here +``` + +### Cache Behavior + +- **Default TTL**: 300 seconds (5 minutes) +- **Cache Key**: Per-mothership instance (no cross-instance cache) +- **Thread-Safe**: Uses `asyncio.Lock` for concurrent request handling +- **Manual Invalidation**: `fetcher.invalidate_cache()` for testing + +### Historical Context + +A previous `StateManagerClient` (commit b19bf7c) used REST API. Current placeholder +prepares for GQL-based architecture with improved caching and error handling. + ## Key Implementation Highlights ### Design Focus From 6dd03f13b5809b89e67e742667afb8600920b839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 04:03:44 -0800 Subject: [PATCH 03/12] feat: add ManifestFetcher for caching manifest from RunPod GraphQL - Add ManifestFetcher class with caching infrastructure (TTL: 300s) - Integrate ManifestFetcher into lb_handler /manifest endpoint - Use RunpodGraphQLClient for API communication - Fall back to local flash_manifest.json when API unavailable - Add comprehensive tests for ManifestFetcher and lb_handler --- src/tetra_rp/runtime/lb_handler.py | 28 ++- src/tetra_rp/runtime/manifest_fetcher.py | 192 ++++++++++++++++++ tests/integration/test_lb_remote_execution.py | 41 ++-- tests/unit/runtime/test_lb_handler.py | 148 +++++++++++--- tests/unit/runtime/test_manifest_fetcher.py | 164 +++++++++++++++ 5 files changed, 523 insertions(+), 50 deletions(-) create mode 100644 src/tetra_rp/runtime/manifest_fetcher.py create mode 100644 tests/unit/runtime/test_manifest_fetcher.py diff --git a/src/tetra_rp/runtime/lb_handler.py b/src/tetra_rp/runtime/lb_handler.py index 4193dd02..495261d2 100644 --- a/src/tetra_rp/runtime/lb_handler.py +++ b/src/tetra_rp/runtime/lb_handler.py @@ -23,12 +23,12 @@ import inspect import logging import os -from typing import Any, Callable, Dict +from typing import Any, Callable, Dict, Optional from fastapi import FastAPI, Request from fastapi.responses import JSONResponse -from .generic_handler import load_manifest +from .manifest_fetcher import ManifestFetcher from .serialization import ( deserialize_args, deserialize_kwargs, @@ -37,6 +37,17 @@ logger = logging.getLogger(__name__) +# Module-level manifest fetcher (singleton, reused across requests) +_manifest_fetcher: Optional[ManifestFetcher] = None + + +def _get_manifest_fetcher() -> ManifestFetcher: + """Get or create the manifest fetcher singleton.""" + global _manifest_fetcher + if _manifest_fetcher is None: + _manifest_fetcher = ManifestFetcher() + return _manifest_fetcher + def create_lb_handler( route_registry: Dict[tuple[str, str], Callable], include_execute: bool = False @@ -178,20 +189,27 @@ async def execute_remote_function(request: Request) -> Dict[str, Any]: async def get_manifest() -> JSONResponse: """Mothership discovery endpoint. - Returns the flash_manifest.json content for service discovery. + Fetches manifest from RunPod GraphQL API (source of truth), caches it + locally, and serves to child endpoints. Falls back to local file if + RunPod API is unavailable. + Only available when FLASH_IS_MOTHERSHIP=true environment variable is set. Returns: JSONResponse with manifest content or 404 if not found """ - manifest_dict = load_manifest() + fetcher = _get_manifest_fetcher() + mothership_id = os.getenv("RUNPOD_ENDPOINT_ID") + + # Fetch manifest (from cache, RunPod GQL, or local file) + manifest_dict = await fetcher.get_manifest(mothership_id) if not manifest_dict or not manifest_dict.get("resources"): return JSONResponse( status_code=404, content={ "error": "Manifest not found", - "detail": "flash_manifest.json could not be loaded", + "detail": "Could not load manifest from RunPod or local file", }, ) diff --git a/src/tetra_rp/runtime/manifest_fetcher.py b/src/tetra_rp/runtime/manifest_fetcher.py new file mode 100644 index 00000000..8815add7 --- /dev/null +++ b/src/tetra_rp/runtime/manifest_fetcher.py @@ -0,0 +1,192 @@ +"""Manifest fetcher with RunPod GQL integration and caching. + +This module provides manifest fetching from RunPod GraphQL API (source of truth) +with local file caching and fallback. +""" + +import asyncio +import json +import logging +import time +from pathlib import Path +from typing import Any, Dict, Optional + +from .config import DEFAULT_CACHE_TTL +from .generic_handler import load_manifest + +logger = logging.getLogger(__name__) + + +class ManifestFetcher: + """Fetches and caches manifest from RunPod GraphQL API. + + RunPod's GraphQL API is the source of truth for manifest data. This + fetcher pulls from it using RunpodGraphQLClient, caches locally, and + falls back to local file if RunPod API is unavailable. + """ + + def __init__( + self, + cache_ttl: int = DEFAULT_CACHE_TTL, + manifest_path: Optional[Path] = None, + ): + """Initialize manifest fetcher. + + Args: + cache_ttl: Cache time-to-live in seconds (default: 300) + manifest_path: Optional path to local manifest file + """ + self.cache_ttl = cache_ttl + self.manifest_path = manifest_path + + # Cache state + self._cached_manifest: Optional[Dict[str, Any]] = None + self._cache_loaded_at: float = 0 + self._cache_lock = asyncio.Lock() + + async def get_manifest( + self, + mothership_id: Optional[str] = None, + ) -> Optional[Dict[str, Any]]: + """Get manifest from cache or fetch from RunPod GraphQL API. + + Flow: + 1. Check if cached and not expired → return cached + 2. If expired/not cached → fetch from RunPod GraphQL API + 3. Update local flash_manifest.json with fetched data + 4. Cache the result + 5. Return manifest + + If RunPod GQL fetch fails, falls back to local file. + + Args: + mothership_id: Optional mothership endpoint ID for tracking + + Returns: + Manifest dictionary or None if unavailable + """ + async with self._cache_lock: + now = time.time() + cache_age = now - self._cache_loaded_at + + # Return cached if still valid + if self._cached_manifest and cache_age < self.cache_ttl: + logger.debug( + f"Serving cached manifest (age: {cache_age:.1f}s, " + f"TTL: {self.cache_ttl}s)" + ) + return self._cached_manifest + + # Cache expired or not loaded - fetch from RunPod GQL + logger.debug("Cache expired or empty, fetching from RunPod GraphQL API") + + try: + # Fetch from RunPod GraphQL API (placeholder) + manifest = await self._fetch_from_gql(mothership_id) + + # Update local flash_manifest.json + if manifest: + self._update_local_file(manifest) + + # Update cache + self._cached_manifest = manifest + self._cache_loaded_at = now + + logger.info( + f"Manifest fetched from RunPod GQL and cached " + f"({len(manifest.get('resources', {}))} resources)" + ) + return manifest + + except NotImplementedError: + logger.debug( + "RunPod GQL fetch not implemented, falling back to local file" + ) + except Exception as e: + logger.warning( + f"RunPod GQL fetch failed: {e}, falling back to local file" + ) + + # Fallback: load from local file + manifest = load_manifest(self.manifest_path) + if manifest: + # Cache the fallback manifest + self._cached_manifest = manifest + self._cache_loaded_at = now + logger.debug("Loaded and cached manifest from local file") + + return manifest + + async def _fetch_from_gql( + self, + mothership_id: Optional[str] = None, + ) -> Dict[str, Any]: + """Fetch manifest from RunPod GraphQL API. + + TBD: Future implementation will query RunPod's GraphQL API + to retrieve the manifest configuration. + + Args: + mothership_id: Optional mothership endpoint ID + + Returns: + Manifest dictionary from RunPod GQL + + Raises: + NotImplementedError: Placeholder for future implementation + + Note: + Future implementation will use RunpodGraphQLClient: + + ```python + async with RunpodGraphQLClient() as client: + query = ''' + query GetManifest($mothershipId: ID!) { + getManifest(mothershipId: $mothershipId) { + version + projectName + generatedAt + resources + functionRegistry + } + } + ''' + result = await client.execute(query, {"mothershipId": mothership_id}) + return result["data"]["getManifest"] + ``` + """ + raise NotImplementedError( + "RunPod manifest query not yet implemented. " + "Falling back to local flash_manifest.json file." + ) + + def _update_local_file(self, manifest: Dict[str, Any]) -> None: + """Update local flash_manifest.json with fetched data. + + Args: + manifest: Manifest dictionary to write + """ + try: + # Determine file path + if self.manifest_path: + file_path = self.manifest_path + else: + file_path = Path.cwd() / "flash_manifest.json" + + # Write manifest to file + with open(file_path, "w") as f: + json.dump(manifest, f, indent=2) + + logger.debug(f"Updated local manifest file: {file_path}") + + except Exception as e: + logger.warning(f"Failed to update local manifest file: {e}") + # Non-critical error - cached manifest still valid + + def invalidate_cache(self) -> None: + """Manually invalidate the cache. + + Next get_manifest() call will fetch from GQL. + """ + self._cache_loaded_at = 0 + logger.debug("Manifest cache invalidated") diff --git a/tests/integration/test_lb_remote_execution.py b/tests/integration/test_lb_remote_execution.py index 2aca85d0..8c45022f 100644 --- a/tests/integration/test_lb_remote_execution.py +++ b/tests/integration/test_lb_remote_execution.py @@ -309,9 +309,18 @@ def get_status(): class TestManifestEndpointIntegration: """Integration tests for GET /manifest endpoint.""" + @pytest.fixture(autouse=True) + def reset_manifest_fetcher(self): + """Reset the global manifest fetcher before each test.""" + import tetra_rp.runtime.lb_handler as lb_handler_module + + lb_handler_module._manifest_fetcher = None + yield + lb_handler_module._manifest_fetcher = None + def test_manifest_endpoint_in_live_load_balancer(self, monkeypatch): """Test manifest endpoint in LiveLoadBalancer with FLASH_IS_MOTHERSHIP=true.""" - from unittest.mock import patch + from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") @@ -349,10 +358,12 @@ async def hello(): "routes": {"test-mothership": {"GET /api/hello": "hello"}}, } - # Mock load_manifest to return test manifest - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest - ): + # Mock ManifestFetcher to return test manifest + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=test_manifest) + MockFetcher.return_value = mock_fetcher + from tetra_rp.runtime.lb_handler import create_lb_handler # Create handler with manifest endpoint enabled @@ -380,7 +391,7 @@ def test_manifest_endpoint_excluded_when_env_not_set(self): def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): """Test manifest endpoint with LoadBalancerSlsResource.""" - from unittest.mock import patch + from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") @@ -409,9 +420,11 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): "function_registry": {"process_image": "gpu-worker"}, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=test_manifest) + MockFetcher.return_value = mock_fetcher + from tetra_rp.runtime.lb_handler import create_lb_handler # Create deployed handler (not LiveLoadBalancer) @@ -425,7 +438,7 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): """Test that /manifest endpoint coexists with /ping health check.""" - from unittest.mock import patch + from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") @@ -436,9 +449,11 @@ def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): "function_registry": {}, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=test_manifest) + MockFetcher.return_value = mock_fetcher + from tetra_rp.runtime.lb_handler import create_lb_handler app = create_lb_handler({}, include_execute=False) diff --git a/tests/unit/runtime/test_lb_handler.py b/tests/unit/runtime/test_lb_handler.py index e02c6aa0..966a2ab6 100644 --- a/tests/unit/runtime/test_lb_handler.py +++ b/tests/unit/runtime/test_lb_handler.py @@ -11,6 +11,15 @@ class TestManifestEndpoint: """Tests for GET /manifest endpoint.""" + @pytest.fixture(autouse=True) + def reset_manifest_fetcher(self): + """Reset the global manifest fetcher before each test.""" + import tetra_rp.runtime.lb_handler as lb_handler_module + + lb_handler_module._manifest_fetcher = None + yield + lb_handler_module._manifest_fetcher = None + @pytest.fixture def sample_manifest(self): """Sample manifest for testing.""" @@ -42,11 +51,15 @@ def test_manifest_endpoint_registered_when_env_var_true( self, sample_manifest, monkeypatch ): """Verify /manifest endpoint exists when FLASH_IS_MOTHERSHIP=true.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) routes = [route.path for route in app.routes] @@ -77,11 +90,15 @@ def test_manifest_endpoint_returns_200_with_valid_manifest( self, sample_manifest, monkeypatch ): """Test happy path - endpoint returns 200 with valid manifest.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -92,9 +109,15 @@ def test_manifest_endpoint_returns_200_with_valid_manifest( def test_manifest_endpoint_returns_404_when_manifest_missing(self, monkeypatch): """Test endpoint returns 404 when manifest file not found.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch("tetra_rp.runtime.lb_handler.load_manifest", return_value={}): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value={}) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -103,19 +126,22 @@ def test_manifest_endpoint_returns_404_when_manifest_missing(self, monkeypatch): assert response.status_code == 404 data = response.json() assert data["error"] == "Manifest not found" - assert "could not be loaded" in data["detail"] + assert "Could not load" in data["detail"] def test_manifest_endpoint_case_insensitive_env_var_true( self, sample_manifest, monkeypatch ): """Test endpoint registration with different case variations of 'true'.""" + from unittest.mock import AsyncMock + for env_value in ["True", "TRUE", "TrUe"]: monkeypatch.setenv("FLASH_IS_MOTHERSHIP", env_value) - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", - return_value=sample_manifest, - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) routes = [route.path for route in app.routes] @@ -133,11 +159,15 @@ def test_manifest_endpoint_case_insensitive_env_var_false(self, monkeypatch): def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch): """Test that manifest response has correct structure.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -153,6 +183,8 @@ def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch def test_manifest_endpoint_with_empty_resources(self, monkeypatch): """Test endpoint behavior when manifest has no resources.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") empty_manifest = { @@ -162,9 +194,11 @@ def test_manifest_endpoint_with_empty_resources(self, monkeypatch): "function_registry": {}, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=empty_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=empty_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -174,10 +208,16 @@ def test_manifest_endpoint_with_empty_resources(self, monkeypatch): assert response.status_code == 404 def test_manifest_endpoint_with_none_manifest(self, monkeypatch): - """Test endpoint behavior when load_manifest returns None.""" + """Test endpoint behavior when get_manifest returns None.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch("tetra_rp.runtime.lb_handler.load_manifest", return_value=None): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=None) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -189,11 +229,15 @@ def test_manifest_endpoint_coexists_with_execute( self, sample_manifest, monkeypatch ): """Test that /manifest endpoint coexists with /execute endpoint.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=True) routes = [route.path for route in app.routes] @@ -204,6 +248,8 @@ def test_manifest_endpoint_coexists_with_user_routes( self, sample_manifest, monkeypatch ): """Test that /manifest endpoint coexists with user-defined routes.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") async def dummy_handler(): @@ -211,9 +257,11 @@ async def dummy_handler(): route_registry = {("GET", "/api/health"): dummy_handler} - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler(route_registry, include_execute=False) routes = [route.path for route in app.routes] @@ -222,11 +270,15 @@ async def dummy_handler(): def test_manifest_endpoint_content_type(self, sample_manifest, monkeypatch): """Test that /manifest endpoint returns proper JSON content-type.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -236,6 +288,8 @@ def test_manifest_endpoint_content_type(self, sample_manifest, monkeypatch): def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): """Test endpoint with complex multi-resource manifest.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") complex_manifest = { @@ -274,9 +328,11 @@ def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): }, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=complex_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=complex_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -288,6 +344,34 @@ def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): assert "gpu_config" in data["resources"] assert "cpu_config" in data["resources"] + def test_manifest_endpoint_uses_fetcher_with_caching( + self, sample_manifest, monkeypatch + ): + """Verify GET /manifest uses ManifestFetcher with caching.""" + from unittest.mock import AsyncMock + + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + # First request + response1 = client.get("/manifest") + assert response1.status_code == 200 + assert response1.json() == sample_manifest + + # Second request - should reuse fetcher + response2 = client.get("/manifest") + assert response2.status_code == 200 + + # Verify fetcher was called (once per request) + assert mock_fetcher.get_manifest.call_count == 2 + class TestExecuteEndpointStillWorks: """Tests to ensure /execute endpoint still works after manifest changes.""" diff --git a/tests/unit/runtime/test_manifest_fetcher.py b/tests/unit/runtime/test_manifest_fetcher.py new file mode 100644 index 00000000..f7ae27a1 --- /dev/null +++ b/tests/unit/runtime/test_manifest_fetcher.py @@ -0,0 +1,164 @@ +"""Unit tests for ManifestFetcher.""" + +import asyncio +import json +from pathlib import Path +from unittest.mock import patch + +import pytest + +from tetra_rp.runtime.manifest_fetcher import ManifestFetcher + + +class TestManifestFetcher: + """Test ManifestFetcher caching and GQL integration.""" + + @pytest.fixture + def sample_manifest(self): + """Sample manifest for testing.""" + return { + "version": "1.0", + "project_name": "test-app", + "resources": {"gpu_config": {"resource_type": "ServerlessEndpoint"}}, + "function_registry": {"process_gpu": "gpu_config"}, + } + + @pytest.mark.asyncio + async def test_fetch_falls_back_to_local_file_when_gql_not_implemented( + self, sample_manifest, tmp_path + ): + """Verify fetcher falls back to local file when GQL raises NotImplementedError.""" + # Write sample manifest to temp file + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(manifest_path=manifest_file) + result = await fetcher.get_manifest() + + assert result == sample_manifest + + @pytest.mark.asyncio + async def test_caching_prevents_multiple_fetches(self, sample_manifest, tmp_path): + """Verify cached manifest is reused within TTL.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(cache_ttl=300, manifest_path=manifest_file) + + # First call - loads from file + result1 = await fetcher.get_manifest() + assert result1 == sample_manifest + + # Second call immediately - should use cache + result2 = await fetcher.get_manifest() + assert result2 == sample_manifest + assert result2 is result1 # Same object reference (cached) + + @pytest.mark.asyncio + async def test_cache_expiration_triggers_refetch(self, sample_manifest, tmp_path): + """Verify expired cache triggers new fetch.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + # Very short TTL + fetcher = ManifestFetcher(cache_ttl=0.1, manifest_path=manifest_file) + + # First call + result1 = await fetcher.get_manifest() + assert result1 == sample_manifest + + # Wait for cache to expire + await asyncio.sleep(0.2) + + # Second call - cache expired, should refetch + result2 = await fetcher.get_manifest() + assert result2 == sample_manifest + + @pytest.mark.asyncio + async def test_fetch_from_gql_raises_not_implemented(self): + """Verify GQL fetch placeholder raises NotImplementedError.""" + fetcher = ManifestFetcher() + + with pytest.raises(NotImplementedError, match="not yet implemented"): + await fetcher._fetch_from_gql() + + @pytest.mark.asyncio + async def test_update_local_file_writes_manifest(self, sample_manifest, tmp_path): + """Verify manifest is written to local file.""" + manifest_file = tmp_path / "flash_manifest.json" + fetcher = ManifestFetcher(manifest_path=manifest_file) + + fetcher._update_local_file(sample_manifest) + + # Verify file was written + assert manifest_file.exists() + with open(manifest_file) as f: + written = json.load(f) + assert written == sample_manifest + + @pytest.mark.asyncio + async def test_cache_invalidation(self, sample_manifest, tmp_path): + """Verify manual cache invalidation works.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(cache_ttl=300, manifest_path=manifest_file) + + # Load and cache + await fetcher.get_manifest() + assert fetcher._cached_manifest is not None + + # Invalidate + fetcher.invalidate_cache() + + # Next call should refetch (cache_loaded_at is 0) + assert fetcher._cache_loaded_at == 0 + + @pytest.mark.asyncio + async def test_concurrent_requests_use_lock(self, sample_manifest, tmp_path): + """Verify concurrent requests are properly synchronized.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(manifest_path=manifest_file) + + # Make multiple concurrent requests + results = await asyncio.gather( + fetcher.get_manifest(), + fetcher.get_manifest(), + fetcher.get_manifest(), + ) + + # All should return the same manifest + assert all(r == sample_manifest for r in results) + + @pytest.mark.asyncio + async def test_handles_missing_local_file_gracefully(self): + """Verify fetcher handles missing local file gracefully.""" + # Point to non-existent file + fetcher = ManifestFetcher(manifest_path=Path("/nonexistent/manifest.json")) + + # Should fall back to loading from cwd (which also won't exist in test) + result = await fetcher.get_manifest() + + # load_manifest returns empty dict when no file is found + assert result == {"resources": {}, "function_registry": {}} + + @pytest.mark.asyncio + async def test_mothership_id_passed_to_gql(self): + """Verify mothership_id is passed through to GQL fetch.""" + fetcher = ManifestFetcher() + + # Spy on _fetch_from_gql to capture arguments + with patch.object(fetcher, "_fetch_from_gql") as mock_fetch: + mock_fetch.side_effect = NotImplementedError() + + await fetcher.get_manifest(mothership_id="test-123") + + # Verify mothership_id was passed to fetch + mock_fetch.assert_called_once_with("test-123") From 42edb23f039e4ebabf4e793e2b0ba1978b6328e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 08:38:00 -0800 Subject: [PATCH 04/12] refactor: rename directory terminology to manifest throughout codebase - Rename _directory to _endpoint_registry in ServiceRegistry - Rename directory_client parameter to manifest_client - Change API endpoint from /directory to /manifest - Change JSON response key from "directory" to "manifest" - Update _ensure_directory_loaded() to _ensure_manifest_loaded() - Update refresh_directory() to refresh_manifest() - Update all tests and documentation to reflect new terminology --- docs/Cross_Endpoint_Routing.md | 137 +++++++++--------- docs/Load_Balancer_Endpoints.md | 6 +- src/tetra_rp/runtime/config.py | 2 +- src/tetra_rp/runtime/exceptions.py | 2 +- src/tetra_rp/runtime/manifest_client.py | 33 ++--- src/tetra_rp/runtime/production_wrapper.py | 9 +- src/tetra_rp/runtime/service_registry.py | 64 ++++---- .../test_cross_endpoint_routing.py | 57 ++++---- tests/unit/runtime/test_manifest_client.py | 30 ++-- tests/unit/runtime/test_production_wrapper.py | 10 +- tests/unit/runtime/test_service_registry.py | 114 ++++++++------- 11 files changed, 234 insertions(+), 230 deletions(-) diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index 1a4330c6..341d4b09 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -59,7 +59,7 @@ The manifest structure: #### 2. Set Environment Variables -Configure the mothership directory URL (required for remote routing): +Configure the mothership manifest URL (required for remote routing): ```bash # Required for cross-endpoint routing to work @@ -149,7 +149,7 @@ The manifest file (`flash_manifest.json`) defines function routing and resource | Variable | Required | Purpose | |----------|----------|---------| -| `FLASH_MOTHERSHIP_URL` | Yes* | URL of mothership directory service | +| `FLASH_MOTHERSHIP_URL` | Yes* | URL of mothership manifest service | | `RUNPOD_ENDPOINT_ID` | No | Current endpoint ID (for tracing) | | `FLASH_MANIFEST_PATH` | No | Explicit path to manifest file | @@ -255,7 +255,7 @@ Functions gracefully fall back to local execution if routing fails: async def critical_service(request: dict) -> dict: # Routes to critical-endpoint if: # - In function_registry - # - Directory available + # - Manifest available # Otherwise executes locally return handle_critical(request) @@ -269,11 +269,11 @@ async def helper_function(x: int) -> int: #### Common Issues -**Directory Unavailable** +**Manifest Service Unavailable** If `FLASH_MOTHERSHIP_URL` is not set or unreachable: ``` -WARNING: FLASH_MOTHERSHIP_URL not set, directory unavailable +WARNING: FLASH_MOTHERSHIP_URL not set, manifest service unavailable ``` Functions default to local execution. Set the environment variable to enable routing. @@ -342,8 +342,8 @@ graph TD A["Function Call"] -->|"intercepts stub layer"| B["ProductionWrapper"] B -->|"load service configuration"| C["ServiceRegistry"] - C -->|"if not cached"| D["DirectoryClient"] - D -->|"query mothership API"| E["Directory
Endpoint URLs"] + C -->|"if not cached"| D["ManifestClient"] + D -->|"query mothership API"| E["Manifest
Endpoint URLs"] E -->|"cache result
TTL 300s"| C C -->|"lookup in manifest
flash_manifest.json"| F{"Routing
Decision"} @@ -358,7 +358,7 @@ graph TD K --> L["Return Response
base64 → cloudpickle"] L --> M["Deserialized Result"] - N["Error Handling:
- RemoteExecutionError
- SerializationError
- DirectoryUnavailableError"] -.-> H + N["Error Handling:
- RemoteExecutionError
- SerializationError
- ManifestServiceUnavailableError"] -.-> H N -.-> I N -.-> J @@ -405,8 +405,8 @@ class ProductionWrapper: **kwargs: Any, ) -> Any: """Route function execution to local or remote endpoint.""" - # 1. Load directory (if needed) - await self.service_registry._ensure_directory_loaded() + # 1. Load manifest (if needed) + await self.service_registry._ensure_manifest_loaded() # 2. Look up function in manifest resource = self.service_registry.get_resource_for_function(func.__name__) @@ -450,30 +450,29 @@ class ServiceRegistry: """Service discovery and routing for cross-endpoint function calls.""" def __init__(self, manifest_path: Optional[Path] = None): - """Initialize with manifest and optional directory client.""" + """Initialize with manifest and optional manifest client.""" self._load_manifest(manifest_path) - self._directory_client = DirectoryClient(...) - self._directory = {} # Cached endpoint URLs - self._directory_lock = asyncio.Lock() + self._manifest_client = ManifestClient(...) + self._endpoint_registry = {} # Cached endpoint URLs + self._endpoint_registry_lock = asyncio.Lock() def get_resource_for_function(self, func_name: str) -> Optional[ServerlessResource]: """Get resource config for function from manifest.""" - # Returns None if: - # - Function not in manifest - # - Explicitly set to null in manifest - - # Returns ServerlessResource if mapped in manifest - config = self._manifest["functions"].get(func_name) + # Returns the ServerlessResource if function is mapped in manifest + # Returns None if function maps to current endpoint + # Raises ValueError if function not found in manifest + config = self._manifest.function_registry.get(func_name) return self._resolve_resource(config) - async def _ensure_directory_loaded(self) -> None: - """Load directory from mothership with caching (TTL 300s).""" - if self._is_directory_fresh(): - return + async def _ensure_manifest_loaded(self) -> None: + """Load manifest from mothership if cache expired or not loaded.""" + async with self._endpoint_registry_lock: + now = time.time() + cache_age = now - self._endpoint_registry_loaded_at - async with self._directory_lock: - self._directory = await self._directory_client.get_directory() - self._directory_loaded_at = time.time() + if cache_age > self.cache_ttl: + self._endpoint_registry = await self._manifest_client.get_manifest() + self._endpoint_registry_loaded_at = now ``` **Manifest Format**: @@ -499,36 +498,36 @@ class ServiceRegistry: - `function_registry`: Maps function names to resource config names (null = local) - `resources`: Defines resource configurations and their handler details -**Directory Cache**: +**Manifest Cache**: - TTL: 300 seconds (configurable via `DEFAULT_CACHE_TTL`) - Thread-safe with `asyncio.Lock()` -- Graceful fallback if directory unavailable +- Graceful fallback if manifest service unavailable -#### 3. DirectoryClient +#### 3. ManifestClient -**Location**: `src/tetra_rp/runtime/directory_client.py` +**Location**: `src/tetra_rp/runtime/manifest_client.py` -HTTP client for mothership directory service: +HTTP client for mothership manifest service: ```python -class DirectoryClient: - """HTTP client for querying mothership directory. +class ManifestClient: + """HTTP client for querying mothership manifest. - The directory maps resource_config names to their endpoint URLs. + The manifest maps resource_config names to their endpoint URLs. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} """ - async def get_directory(self) -> Dict[str, str]: - """Fetch endpoint directory from mothership. + async def get_manifest(self) -> Dict[str, str]: + """Fetch endpoint manifest from mothership. Returns: Dictionary mapping resource_config_name → endpoint_url. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} Raises: - DirectoryUnavailableError: If directory service unavailable after retries. + ManifestServiceUnavailableError: If manifest service unavailable after retries. """ - # Queries {mothership_url}/directory endpoint with retry logic + # Queries {mothership_url}/manifest endpoint with retry logic ``` **Configuration**: @@ -561,8 +560,8 @@ class ManifestError(FlashRuntimeError): """Raised when manifest is invalid, missing, or has unexpected structure.""" pass -class DirectoryUnavailableError(FlashRuntimeError): - """Raised when directory service is unavailable.""" +class ManifestServiceUnavailableError(FlashRuntimeError): + """Raised when manifest service is unavailable.""" pass ``` @@ -576,8 +575,8 @@ except SerializationError as e: logger.error(f"Serialization failed: {e}") except ManifestError as e: logger.error(f"Manifest configuration error: {e}") -except DirectoryUnavailableError as e: - logger.warning(f"Directory unavailable, using fallback") +except ManifestServiceUnavailableError as e: + logger.warning(f"Manifest unavailable, using fallback") ``` ### Integration Points @@ -613,7 +612,7 @@ Functions retrieve remote endpoint info from ResourceManager: # ServiceRegistry uses ResourceManager to find endpoint URLs resource_manager = ResourceManager() endpoint = resource_manager.get_resource_for_function("function_name") -endpoint_url = endpoint.url # e.g., "https://api.runpod.io/v1/abc123" +endpoint_url = endpoint.url # e.g., "https://api.runpod.io/v2/abc123" ``` ### Configuration @@ -671,8 +670,8 @@ flowchart TD B["ProductionWrapper.wrap_function_execution()"] C["ServiceRegistry.get_resource_for_function()"] D["Manifest Lookup
resource found"] - E["Ensure Directory Loaded"] - F["DirectoryClient.get_endpoints()"] + E["Ensure Manifest Loaded"] + F["ManifestClient.get_manifest()"] G["Get Remote Endpoint URL"] H["Serialize Arguments
cloudpickle → base64"] I["HTTP POST to Remote Endpoint"] @@ -720,11 +719,11 @@ flowchart TD #### 2. Thread-Safe Async Caching -**Decision**: Use `asyncio.Lock()` for directory cache synchronization +**Decision**: Use `asyncio.Lock()` for manifest cache synchronization **Rationale**: - Prevents thundering herd on cache expiry -- Efficient - only one coroutine loads directory +- Efficient - only one coroutine loads manifest - Simple to understand and maintain - Follows async/await patterns @@ -740,12 +739,12 @@ flowchart TD #### 4. Graceful Fallback -**Decision**: Default to local execution if directory unavailable +**Decision**: Default to local execution if manifest service unavailable **Rationale**: - Maintains application resilience - Doesn't fail if mothership unreachable -- Allows local testing without directory +- Allows local testing without manifest service - Gradual degradation vs catastrophic failure #### 5. Transparent Routing @@ -779,15 +778,15 @@ class JsonSerializer: 2. Update ProductionWrapper to select serializer based on config 3. Add tests for new format -#### Adding New Directory Backends +#### Adding New Manifest Backends To support directories other than mothership: -1. Create client class with `get_directory()` method: +1. Create client class with `get_manifest()` method: ```python -class CustomDirectoryClient: - async def get_directory(self) -> Dict[str, str]: - """Fetch directory mapping resource_config_name → endpoint_url.""" +class CustomManifestClient: + async def get_manifest(self) -> Dict[str, str]: + """Fetch manifest mapping resource_config_name → endpoint_url.""" # Implementation specific to backend return {"resource_name": "https://endpoint.url"} ``` @@ -796,11 +795,11 @@ class CustomDirectoryClient: ```python registry = ServiceRegistry( manifest_path=Path("manifest.json"), - directory_client=CustomDirectoryClient(...) + manifest_client=CustomManifestClient(...) ) ``` -3. Update environment variable handling if needed (CustomDirectoryClient can read from env vars) +3. Update environment variable handling if needed (CustomManifestClient can read from env vars) #### Adding Routing Policies @@ -830,11 +829,11 @@ class RoutingPolicy: **ServiceRegistry Tests** (`tests/unit/runtime/test_service_registry.py`): - Manifest loading - Resource lookup -- Directory caching +- Manifest caching - TTL expiry - Lock behavior under concurrency -**DirectoryClient Tests** (`tests/unit/runtime/test_directory_client.py`): +**ManifestClient Tests** (`tests/unit/runtime/test_manifest_client.py`): - Successful HTTP requests - Error handling - Retry logic @@ -855,7 +854,7 @@ class RoutingPolicy: - End-to-end remote execution - Function call across endpoints - Error handling in real scenarios -- Directory caching behavior +- Manifest caching behavior - Serialization of complex objects #### Test Patterns @@ -904,7 +903,7 @@ logging.basicConfig(level=logging.DEBUG) # ProductionWrapper logs # ServiceRegistry logs -# DirectoryClient logs +# ManifestClient logs ``` #### Common Debug Scenarios @@ -914,8 +913,8 @@ logging.basicConfig(level=logging.DEBUG) # Check manifest print(registry._manifest) -# Check directory -print(registry._directory) +# Check cached endpoint URLs +print(registry._endpoint_registry) # Check resource lookup resource = registry.get_resource_for_function("function_name") @@ -932,16 +931,16 @@ except Exception as e: print(f"Not serializable: {e}") ``` -**Directory unavailable**: +**Manifest unavailable**: ```python # Check environment variables import os print(f"FLASH_MOTHERSHIP_URL: {os.getenv('FLASH_MOTHERSHIP_URL')}") print(f"RUNPOD_ENDPOINT_ID: {os.getenv('RUNPOD_ENDPOINT_ID')}") -# Check directory client directly -client = DirectoryClient(mothership_url=...) -endpoints = await client.get_endpoints() +# Check manifest client directly +client = ManifestClient(mothership_url=...) +endpoints = await client.get_manifest() ``` ## Manifest Synchronization with RunPod GraphQL API @@ -1044,7 +1043,7 @@ prepares for GQL-based architecture with improved caching and error handling. ### Design Focus 1. **Transparent Routing**: Functions route automatically without code changes -2. **Graceful Degradation**: Defaults to local execution if directory unavailable +2. **Graceful Degradation**: Defaults to local execution if manifest service unavailable 3. **Type Safety**: Full type hints throughout for IDE support and static analysis 4. **Thread-Safe Async**: Proper `asyncio.Lock()` usage for concurrent operations 5. **Clear Error Hierarchy**: Custom exceptions provide actionable error context @@ -1055,7 +1054,7 @@ Cross-endpoint routing provides: - **Transparency**: Functions route automatically without manual HTTP calls - **Flexibility**: Manifest-based routing enables environment-specific configurations -- **Resilience**: Graceful fallback to local execution if directory unavailable +- **Resilience**: Graceful fallback to local execution if manifest service unavailable - **Simplicity**: No changes to function code or signatures - **Debuggability**: Clear error messages and logging for troubleshooting diff --git a/docs/Load_Balancer_Endpoints.md b/docs/Load_Balancer_Endpoints.md index ea551884..62db7c7a 100644 --- a/docs/Load_Balancer_Endpoints.md +++ b/docs/Load_Balancer_Endpoints.md @@ -35,9 +35,9 @@ Load-balanced endpoints require different provisioning and health check logic th ### Why This Matters -The Mothership needs to serve as a directory server for child endpoints. This requires: +The Mothership needs to serve as a manifest server for child endpoints. This requires: - HTTP-based service discovery (not queue-based) -- Ability to expose custom endpoints (`/directory`, `/ping`) +- Ability to expose custom endpoints (`/manifest`, `/ping`) - Health checking to verify children are ready before routing traffic ## Architecture @@ -401,6 +401,6 @@ endpoint = LoadBalancerSlsResource( ## Next Steps - **Mothership integration**: Use LoadBalancerSlsResource for Mothership endpoints -- **Service discovery**: Implement `/directory` endpoint for child endpoint discovery +- **Service discovery**: Implement `/manifest` endpoint for child endpoint discovery - **Auto-provisioning**: Automatic child endpoint deployment on Mothership startup - **Cross-endpoint routing**: Route requests between endpoints using service discovery diff --git a/src/tetra_rp/runtime/config.py b/src/tetra_rp/runtime/config.py index c0efc11f..974bb5d5 100644 --- a/src/tetra_rp/runtime/config.py +++ b/src/tetra_rp/runtime/config.py @@ -5,7 +5,7 @@ DEFAULT_MAX_RETRIES = 3 DEFAULT_BACKOFF_BASE = 2 -# Directory cache configuration +# Manifest cache configuration DEFAULT_CACHE_TTL = 300 # seconds # Serialization limits diff --git a/src/tetra_rp/runtime/exceptions.py b/src/tetra_rp/runtime/exceptions.py index fec800fd..e072a6ea 100644 --- a/src/tetra_rp/runtime/exceptions.py +++ b/src/tetra_rp/runtime/exceptions.py @@ -26,6 +26,6 @@ class ManifestError(FlashRuntimeError): class ManifestServiceUnavailableError(FlashRuntimeError): - """Raised when manifest directory service is unavailable.""" + """Raised when manifest service is unavailable.""" pass diff --git a/src/tetra_rp/runtime/manifest_client.py b/src/tetra_rp/runtime/manifest_client.py index bfe69ca8..db845a63 100644 --- a/src/tetra_rp/runtime/manifest_client.py +++ b/src/tetra_rp/runtime/manifest_client.py @@ -1,4 +1,4 @@ -"""HTTP client for mothership manifest directory API.""" +"""HTTP client for mothership manifest API.""" import asyncio import logging @@ -17,13 +17,12 @@ class ManifestClient: - """HTTP client for querying mothership manifest directory service. + """HTTP client for querying mothership manifest service. - Fetches the endpoint registry that maps resource_config names to their - deployment URLs. This is the "manifest directory service" - an endpoint - registry showing where resources are deployed. + Fetches the manifest (endpoint registry) that maps resource_config names to + their deployment URLs. The manifest provides service discovery for remote + resource endpoints. - The directory maps resource_config names to their endpoint URLs. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} """ @@ -55,15 +54,15 @@ def __init__( self.max_retries = max_retries self._client: Optional[httpx.AsyncClient] = None - async def get_directory(self) -> Dict[str, str]: - """Fetch endpoint directory from mothership. + async def get_manifest(self) -> Dict[str, str]: + """Fetch endpoint manifest from mothership. Returns: Dictionary mapping resource_config_name → endpoint_url. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} Raises: - ManifestServiceUnavailableError: If manifest directory service unavailable after retries. + ManifestServiceUnavailableError: If manifest service unavailable after retries. """ if httpx is None: raise ImportError( @@ -76,25 +75,25 @@ async def get_directory(self) -> Dict[str, str]: try: client = await self._get_client() response = await client.get( - f"{self.mothership_url}/directory", + f"{self.mothership_url}/manifest", timeout=self.timeout, ) if response.status_code >= 400: raise ManifestServiceUnavailableError( - f"Directory API returned {response.status_code}: " + f"Manifest API returned {response.status_code}: " f"{response.text[:200]}" ) data = response.json() - if "directory" not in data: + if "manifest" not in data: raise ManifestServiceUnavailableError( - "Invalid directory response: missing 'directory' key" + "Invalid manifest response: missing 'manifest' key" ) - directory = data["directory"] - logger.debug(f"Directory loaded: {len(directory)} endpoints") - return directory + manifest = data["manifest"] + logger.debug(f"Manifest loaded: {len(manifest)} endpoints") + return manifest except ( asyncio.TimeoutError, @@ -112,7 +111,7 @@ async def get_directory(self) -> Dict[str, str]: continue raise ManifestServiceUnavailableError( - f"Failed to fetch manifest directory after {self.max_retries} attempts: {last_exception}" + f"Failed to fetch manifest after {self.max_retries} attempts: {last_exception}" ) async def _get_client(self) -> httpx.AsyncClient: diff --git a/src/tetra_rp/runtime/production_wrapper.py b/src/tetra_rp/runtime/production_wrapper.py index 65ce815d..22a48f9e 100644 --- a/src/tetra_rp/runtime/production_wrapper.py +++ b/src/tetra_rp/runtime/production_wrapper.py @@ -26,7 +26,6 @@ def __init__(self, service_registry: ServiceRegistry): service_registry: Service registry for routing decisions. """ self.service_registry = service_registry - self._directory_loaded = False async def wrap_function_execution( self, @@ -57,8 +56,8 @@ async def wrap_function_execution( """ function_name = func.__name__ - # Ensure directory is loaded - await self.service_registry._ensure_directory_loaded() + # Ensure manifest is loaded + await self.service_registry._ensure_manifest_loaded() # Determine routing try: @@ -116,8 +115,8 @@ async def wrap_class_method_execution( Raises: Exception: If execution fails. """ - # Ensure directory is loaded - await self.service_registry._ensure_directory_loaded() + # Ensure manifest is loaded + await self.service_registry._ensure_manifest_loaded() class_name = getattr(request, "class_name", None) diff --git a/src/tetra_rp/runtime/service_registry.py b/src/tetra_rp/runtime/service_registry.py index ddcbcd84..2a2fb865 100644 --- a/src/tetra_rp/runtime/service_registry.py +++ b/src/tetra_rp/runtime/service_registry.py @@ -22,14 +22,14 @@ class ServiceRegistry: """Service discovery and routing for cross-endpoint function calls. Loads manifest to map functions to resource configs, queries mothership - directory for endpoint URLs, and determines if function calls are local + manifest for endpoint URLs, and determines if function calls are local or remote. """ def __init__( self, manifest_path: Optional[Path] = None, - directory_client: Optional[ManifestClient] = None, + manifest_client: Optional[ManifestClient] = None, cache_ttl: int = DEFAULT_CACHE_TTL, ): """Initialize service registry. @@ -37,17 +37,17 @@ def __init__( Args: manifest_path: Path to flash_manifest.json. Defaults to FLASH_MANIFEST_PATH env var or auto-detection. - directory_client: Manifest service client for mothership API. If None, creates one + manifest_client: Manifest service client for mothership API. If None, creates one from FLASH_MOTHERSHIP_URL env var. - cache_ttl: Directory cache lifetime in seconds (default: 300). + cache_ttl: Manifest cache lifetime in seconds (default: 300). Raises: FileNotFoundError: If manifest_path doesn't exist. - ValueError: If required env vars missing for directory_client. + ValueError: If required env vars missing for manifest_client. """ self.cache_ttl = cache_ttl - self._directory: Dict[str, str] = {} - self._directory_loaded_at = 0.0 + self._endpoint_registry: Dict[str, str] = {} + self._endpoint_registry_loaded_at = 0.0 self._manifest: Manifest = Manifest( version="1.0", generated_at="", @@ -55,21 +55,23 @@ def __init__( function_registry={}, resources={}, ) - self._directory_lock = asyncio.Lock() + self._endpoint_registry_lock = asyncio.Lock() # Load manifest self._load_manifest(manifest_path) # Initialize manifest client - if directory_client is None: + if manifest_client is None: mothership_url = os.getenv("FLASH_MOTHERSHIP_URL") if mothership_url: - directory_client = ManifestClient(mothership_url=mothership_url) + manifest_client = ManifestClient(mothership_url=mothership_url) else: - logger.warning("FLASH_MOTHERSHIP_URL not set, directory unavailable") - directory_client = None + logger.warning( + "FLASH_MOTHERSHIP_URL not set, manifest service unavailable" + ) + manifest_client = None - self._directory_client = directory_client + self._manifest_client = manifest_client self._current_endpoint = os.getenv("RUNPOD_ENDPOINT_ID") def _load_manifest(self, manifest_path: Optional[Path]) -> None: @@ -127,30 +129,30 @@ def _load_manifest(self, manifest_path: Optional[Path]) -> None: resources={}, ) - async def _ensure_directory_loaded(self) -> None: - """Load directory from mothership if cache expired or not loaded.""" - async with self._directory_lock: + async def _ensure_manifest_loaded(self) -> None: + """Load manifest from mothership if cache expired or not loaded.""" + async with self._endpoint_registry_lock: now = time.time() - cache_age = now - self._directory_loaded_at + cache_age = now - self._endpoint_registry_loaded_at if cache_age > self.cache_ttl: - if self._directory_client is None: - logger.debug("Directory client not available, skipping refresh") + if self._manifest_client is None: + logger.debug("Manifest client not available, skipping refresh") return try: - self._directory = await self._directory_client.get_directory() - self._directory_loaded_at = now + self._endpoint_registry = await self._manifest_client.get_manifest() + self._endpoint_registry_loaded_at = now logger.debug( - f"Directory loaded: {len(self._directory)} endpoints, " + f"Manifest loaded: {len(self._endpoint_registry)} endpoints, " f"cache TTL {self.cache_ttl}s" ) except ManifestServiceUnavailableError as e: logger.warning( - f"Failed to load manifest directory: {e}. " + f"Failed to load manifest: {e}. " f"Cross-endpoint routing unavailable." ) - self._directory = {} + self._endpoint_registry = {} def get_endpoint_for_function(self, function_name: str) -> Optional[str]: """Get endpoint URL for a function. @@ -181,12 +183,12 @@ def get_endpoint_for_function(self, function_name: str) -> Optional[str]: if resource_config_name == self._current_endpoint: return None - # Check directory for remote endpoint URL - endpoint_url = self._directory.get(resource_config_name) + # Check manifest for remote endpoint URL + endpoint_url = self._endpoint_registry.get(resource_config_name) if not endpoint_url: logger.debug( - f"Endpoint URL for '{resource_config_name}' not in directory. " - f"Directory has: {list(self._directory.keys())}" + f"Endpoint URL for '{resource_config_name}' not in manifest. " + f"Manifest has: {list(self._endpoint_registry.keys())}" ) return endpoint_url @@ -260,9 +262,9 @@ def get_current_endpoint_id(self) -> Optional[str]: """ return self._current_endpoint - def refresh_directory(self) -> None: - """Force refresh directory from mothership on next access.""" - self._directory_loaded_at = 0 + def refresh_manifest(self) -> None: + """Force refresh manifest from mothership on next access.""" + self._endpoint_registry_loaded_at = 0 def get_manifest(self) -> Manifest: """Get loaded manifest. diff --git a/tests/integration/test_cross_endpoint_routing.py b/tests/integration/test_cross_endpoint_routing.py index 1b67967e..aab993d1 100644 --- a/tests/integration/test_cross_endpoint_routing.py +++ b/tests/integration/test_cross_endpoint_routing.py @@ -74,7 +74,7 @@ async def test_local_function_execution(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -88,12 +88,12 @@ async def test_local_function_execution(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry - registry._directory_client = mock_dir_client - registry._directory = directory - registry._directory_loaded_at = float("inf") + registry._manifest_client = mock_manifest_client + registry._endpoint_registry = endpoint_registry + registry._endpoint_registry_loaded_at = float("inf") wrapper = ProductionWrapper(registry) @@ -128,7 +128,7 @@ async def test_remote_function_execution_routing(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -141,11 +141,11 @@ async def test_remote_function_execution_routing(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory - registry._directory_client = mock_dir_client - registry._directory = directory - registry._directory_loaded_at = float("inf") + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry + registry._manifest_client = mock_manifest_client + registry._endpoint_registry = endpoint_registry + registry._endpoint_registry_loaded_at = float("inf") # Mock ServerlessResource mock_resource = AsyncMock() @@ -183,8 +183,8 @@ async def cpu_task(x): manifest_path.unlink() @pytest.mark.asyncio - async def test_directory_loading_on_demand(self, manifest): - """Test that directory is loaded on-demand before routing decision.""" + async def test_manifest_loading_on_demand(self, manifest): + """Test that manifest is loaded on-demand before routing decision.""" with patch.dict( "os.environ", { @@ -192,7 +192,7 @@ async def test_directory_loading_on_demand(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -205,11 +205,11 @@ async def test_directory_loading_on_demand(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory - registry._directory_client = mock_dir_client + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry + registry._manifest_client = mock_manifest_client - assert registry._directory == {} + assert registry._endpoint_registry == {} wrapper = ProductionWrapper(registry) @@ -230,8 +230,11 @@ async def cpu_task(x): original_stub, cpu_task, None, None, True ) - assert len(registry._directory) > 0 - assert registry._directory["gpu_config"] == "https://gpu.example.com" + assert len(registry._endpoint_registry) > 0 + assert ( + registry._endpoint_registry["gpu_config"] + == "https://gpu.example.com" + ) finally: manifest_path.unlink() @@ -246,7 +249,7 @@ async def test_error_handling_in_remote_execution(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -259,11 +262,11 @@ async def test_error_handling_in_remote_execution(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory - registry._directory_client = mock_dir_client - registry._directory = directory - registry._directory_loaded_at = float("inf") + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry + registry._manifest_client = mock_manifest_client + registry._endpoint_registry = endpoint_registry + registry._endpoint_registry_loaded_at = float("inf") # Mock ServerlessResource that returns error mock_resource = AsyncMock() diff --git a/tests/unit/runtime/test_manifest_client.py b/tests/unit/runtime/test_manifest_client.py index 27bb12cc..be48a38c 100644 --- a/tests/unit/runtime/test_manifest_client.py +++ b/tests/unit/runtime/test_manifest_client.py @@ -21,7 +21,7 @@ def mock_response(self): response = MagicMock() response.status_code = 200 response.json.return_value = { - "directory": { + "manifest": { "gpu_config": "https://api.runpod.io/v2/gpu123", "cpu_config": "https://api.runpod.io/v2/cpu456", }, @@ -53,8 +53,8 @@ def test_init_explicit_over_env(self): assert client.mothership_url == "https://explicit.com" @pytest.mark.asyncio - async def test_get_directory_success(self, mock_response): - """Test successful directory fetch.""" + async def test_get_manifest_success(self, mock_response): + """Test successful manifest fetch.""" client = ManifestClient(mothership_url="https://mothership.example.com") with patch("tetra_rp.runtime.manifest_client.httpx"): @@ -63,15 +63,15 @@ async def test_get_directory_success(self, mock_response): mock_client.get.return_value = mock_response with patch.object(client, "_get_client", return_value=mock_client): - directory = await client.get_directory() + manifest = await client.get_manifest() - assert directory == { + assert manifest == { "gpu_config": "https://api.runpod.io/v2/gpu123", "cpu_config": "https://api.runpod.io/v2/cpu456", } @pytest.mark.asyncio - async def test_get_directory_http_error(self): + async def test_get_manifest_http_error(self): """Test handling of HTTP errors.""" client = ManifestClient(mothership_url="https://mothership.example.com") @@ -86,10 +86,10 @@ async def test_get_directory_http_error(self): mock_get_client.return_value = mock_http_client with pytest.raises(ManifestServiceUnavailableError, match="500"): - await client.get_directory() + await client.get_manifest() @pytest.mark.asyncio - async def test_get_directory_timeout(self): + async def test_get_manifest_timeout(self): """Test handling of request timeout.""" client = ManifestClient( mothership_url="https://mothership.example.com", timeout=0.1 @@ -104,10 +104,10 @@ async def test_get_directory_timeout(self): with pytest.raises( ManifestServiceUnavailableError, match="after \\d+ attempts" ): - await client.get_directory() + await client.get_manifest() @pytest.mark.asyncio - async def test_get_directory_retry(self): + async def test_get_manifest_retry(self): """Test retry logic on transient failure.""" client = ManifestClient( mothership_url="https://mothership.example.com", max_retries=3 @@ -115,7 +115,7 @@ async def test_get_directory_retry(self): response = MagicMock() response.status_code = 200 - response.json.return_value = {"directory": {"gpu": "https://gpu.example.com"}} + response.json.return_value = {"manifest": {"gpu": "https://gpu.example.com"}} with patch.object(client, "_get_client") as mock_get_client: mock_http_client = AsyncMock() @@ -133,12 +133,12 @@ async def test_get_directory_retry(self): "tetra_rp.runtime.manifest_client.asyncio.sleep", new_callable=AsyncMock, ): - directory = await client.get_directory() - assert directory == {"gpu": "https://gpu.example.com"} + manifest = await client.get_manifest() + assert manifest == {"gpu": "https://gpu.example.com"} assert mock_http_client.get.call_count == 3 @pytest.mark.asyncio - async def test_get_directory_exhaust_retries(self): + async def test_get_manifest_exhaust_retries(self): """Test failure after exhausting retries.""" client = ManifestClient( mothership_url="https://mothership.example.com", max_retries=2 @@ -157,7 +157,7 @@ async def test_get_directory_exhaust_retries(self): with pytest.raises( ManifestServiceUnavailableError, match="after 2 attempts" ): - await client.get_directory() + await client.get_manifest() @pytest.mark.asyncio async def test_context_manager(self): diff --git a/tests/unit/runtime/test_production_wrapper.py b/tests/unit/runtime/test_production_wrapper.py index cc628047..bda5c31d 100644 --- a/tests/unit/runtime/test_production_wrapper.py +++ b/tests/unit/runtime/test_production_wrapper.py @@ -19,7 +19,7 @@ class TestProductionWrapper: def mock_registry(self): """Mock service registry.""" registry = AsyncMock(spec=ServiceRegistry) - registry._ensure_directory_loaded = AsyncMock() + registry._ensure_manifest_loaded = AsyncMock() return registry @pytest.fixture @@ -135,8 +135,8 @@ async def test_wrap_function_remote_error( ) @pytest.mark.asyncio - async def test_wrap_function_loads_directory(self, wrapper, mock_registry): - """Test that directory is loaded before routing decision.""" + async def test_wrap_function_loads_manifest(self, wrapper, mock_registry): + """Test that manifest is loaded before routing decision.""" mock_registry.get_resource_for_function.return_value = None async def sample_func(): @@ -147,8 +147,8 @@ async def sample_func(): original_stub, sample_func, None, None, True ) - # Should ensure directory is loaded - mock_registry._ensure_directory_loaded.assert_called_once() + # Should ensure manifest is loaded + mock_registry._ensure_manifest_loaded.assert_called_once() @pytest.mark.asyncio async def test_wrap_class_method_local(self, wrapper, mock_registry, original_stub): diff --git a/tests/unit/runtime/test_service_registry.py b/tests/unit/runtime/test_service_registry.py index 8dc88aa1..c7c83aaf 100644 --- a/tests/unit/runtime/test_service_registry.py +++ b/tests/unit/runtime/test_service_registry.py @@ -103,19 +103,19 @@ def test_is_local_function_local(self, manifest_file): assert registry.is_local_function("inference") is True def test_is_local_function_remote(self, manifest_file): - """Test determining remote function (with directory loaded).""" + """Test determining remote function (with manifest loaded).""" with patch.dict(os.environ, {"RUNPOD_ENDPOINT_ID": "gpu_config"}): mock_client = AsyncMock() - mock_client.get_directory.return_value = { + mock_client.get_manifest.return_value = { "cpu_config": "https://cpu.example.com" } registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client + manifest_path=manifest_file, manifest_client=mock_client ) - # After directory is loaded, CPU tasks should be recognized as remote + # After manifest is loaded, CPU tasks should be recognized as remote # (but is_local_function doesn't async load, so returns True for now) - # This is actually expected behavior - sync method can't load async directory + # This is actually expected behavior - sync method can't load async manifest assert registry.is_local_function("preprocess") is True def test_is_local_function_not_in_manifest(self, manifest_file): @@ -131,11 +131,11 @@ def test_get_endpoint_for_function_local(self, manifest_file): endpoint = registry.get_endpoint_for_function("gpu_task") assert endpoint is None # Local returns None - def test_get_endpoint_for_function_remote_no_directory(self, manifest_file): - """Test getting endpoint for remote function without directory.""" + def test_get_endpoint_for_function_remote_no_manifest(self, manifest_file): + """Test getting endpoint for remote function without manifest.""" with patch.dict(os.environ, {"RUNPOD_ENDPOINT_ID": "gpu_config"}): registry = ServiceRegistry(manifest_path=manifest_file) - # CPU function is remote, but no directory loaded + # CPU function is remote, but no manifest loaded endpoint = registry.get_endpoint_for_function("preprocess") assert endpoint is None @@ -157,15 +157,17 @@ def test_get_resource_for_function_remote(self, manifest_file): """Test getting ServerlessResource for remote function.""" with patch.dict(os.environ, {"RUNPOD_ENDPOINT_ID": "gpu_config"}): mock_client = AsyncMock() - mock_client.get_directory.return_value = { + mock_client.get_manifest.return_value = { "cpu_config": "https://api.runpod.io/v2/abc123" } registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client + manifest_path=manifest_file, manifest_client=mock_client ) - # Manually set directory to simulate loaded state - registry._directory = {"cpu_config": "https://api.runpod.io/v2/abc123"} + # Manually set endpoint registry to simulate loaded state + registry._endpoint_registry = { + "cpu_config": "https://api.runpod.io/v2/abc123" + } resource = registry.get_resource_for_function("preprocess") @@ -182,77 +184,77 @@ def test_get_resource_for_function_not_in_manifest(self, manifest_file): registry.get_resource_for_function("unknown_function") @pytest.mark.asyncio - async def test_ensure_directory_loaded(self, manifest_file): - """Test lazy loading of directory from client.""" - mock_directory = { + async def test_ensure_manifest_loaded(self, manifest_file): + """Test lazy loading of manifest from client.""" + mock_endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } mock_client = AsyncMock() - mock_client.get_directory.return_value = mock_directory + mock_client.get_manifest.return_value = mock_endpoint_registry registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client, cache_ttl=10 + manifest_path=manifest_file, manifest_client=mock_client, cache_ttl=10 ) - # Directory not loaded yet - assert registry._directory == {} + # Endpoint registry not loaded yet + assert registry._endpoint_registry == {} - # Load directory - await registry._ensure_directory_loaded() + # Load manifest + await registry._ensure_manifest_loaded() - # Should now have loaded directory - assert registry._directory == mock_directory - mock_client.get_directory.assert_called_once() + # Should now have loaded endpoint registry + assert registry._endpoint_registry == mock_endpoint_registry + mock_client.get_manifest.assert_called_once() @pytest.mark.asyncio - async def test_ensure_directory_cache_respects_ttl(self, manifest_file): - """Test that directory cache respects TTL.""" - mock_directory = {"gpu_config": "https://gpu.example.com"} + async def test_ensure_manifest_cache_respects_ttl(self, manifest_file): + """Test that manifest cache respects TTL.""" + mock_endpoint_registry = {"gpu_config": "https://gpu.example.com"} mock_client = AsyncMock() - mock_client.get_directory.return_value = mock_directory + mock_client.get_manifest.return_value = mock_endpoint_registry registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client, cache_ttl=1 + manifest_path=manifest_file, manifest_client=mock_client, cache_ttl=1 ) - # Load directory - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 1 + # Load manifest + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 1 # Immediate reload should use cache - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 1 + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 1 # After TTL, should reload - registry._directory_loaded_at = time.time() - 2 # 2 seconds ago - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 2 + registry._endpoint_registry_loaded_at = time.time() - 2 # 2 seconds ago + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 2 @pytest.mark.asyncio - async def test_refresh_directory(self, manifest_file): - """Test forcing directory refresh.""" - mock_directory = {"gpu_config": "https://gpu.example.com"} + async def test_refresh_manifest(self, manifest_file): + """Test forcing manifest refresh.""" + mock_endpoint_registry = {"gpu_config": "https://gpu.example.com"} mock_client = AsyncMock() - mock_client.get_directory.return_value = mock_directory + mock_client.get_manifest.return_value = mock_endpoint_registry registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client, cache_ttl=3600 + manifest_path=manifest_file, manifest_client=mock_client, cache_ttl=3600 ) - # Load directory - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 1 + # Load manifest + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 1 # Force refresh - registry.refresh_directory() + registry.refresh_manifest() # Next load should fetch again - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 2 + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 2 def test_get_manifest(self, manifest_file): """Test getting manifest.""" @@ -282,16 +284,16 @@ def test_get_resource_functions_not_found(self, manifest_file): functions = registry.get_resource_functions("nonexistent") assert functions == [] - def test_init_no_directory_client_no_mothership_url(self, manifest_file): - """Test initialization without directory client or URL.""" + def test_init_no_manifest_client_no_mothership_url(self, manifest_file): + """Test initialization without manifest client or URL.""" with patch.dict(os.environ, {}, clear=True): registry = ServiceRegistry(manifest_path=manifest_file) - assert registry._directory_client is None + assert registry._manifest_client is None @pytest.mark.asyncio - async def test_ensure_directory_loaded_unavailable_client(self, manifest_file): - """Test directory loading when client is None.""" - registry = ServiceRegistry(manifest_path=manifest_file, directory_client=None) + async def test_ensure_manifest_loaded_unavailable_client(self, manifest_file): + """Test manifest loading when client is None.""" + registry = ServiceRegistry(manifest_path=manifest_file, manifest_client=None) # Should not fail, just log warning - await registry._ensure_directory_loaded() - assert registry._directory == {} + await registry._ensure_manifest_loaded() + assert registry._endpoint_registry == {} From 9bc59b4903d61f04a96bb9b0f36ebd243ff6f0ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 13:24:41 -0800 Subject: [PATCH 05/12] fix: align GET /manifest response format to Deployment_Architecture spec Remove {"manifest": ...} wrapper and return manifest directly per spec (Deployment_Architecture.md:235-273). Update ManifestClient parser to expect manifest directly without unwrap logic. Changes: - Remove wrapper from GET /manifest endpoint (lb_handler.py:215) - Update ManifestClient to validate manifest has "resources" key directly - Replace global _manifest_fetcher with @lru_cache(maxsize=1) for thread safety - Update all test assertions to expect unwrapped manifest format All 636 tests pass, coverage: 66.48% --- src/tetra_rp/runtime/lb_handler.py | 17 +++--- src/tetra_rp/runtime/manifest_client.py | 11 ++-- tests/integration/test_lb_remote_execution.py | 57 +++++++++++++++++++ tests/unit/runtime/test_lb_handler.py | 32 +++++------ tests/unit/runtime/test_manifest_client.py | 37 +++++++++--- 5 files changed, 115 insertions(+), 39 deletions(-) diff --git a/src/tetra_rp/runtime/lb_handler.py b/src/tetra_rp/runtime/lb_handler.py index 495261d2..3647aa52 100644 --- a/src/tetra_rp/runtime/lb_handler.py +++ b/src/tetra_rp/runtime/lb_handler.py @@ -23,7 +23,8 @@ import inspect import logging import os -from typing import Any, Callable, Dict, Optional +from functools import lru_cache +from typing import Any, Callable, Dict from fastapi import FastAPI, Request from fastapi.responses import JSONResponse @@ -37,16 +38,14 @@ logger = logging.getLogger(__name__) -# Module-level manifest fetcher (singleton, reused across requests) -_manifest_fetcher: Optional[ManifestFetcher] = None - +@lru_cache(maxsize=1) def _get_manifest_fetcher() -> ManifestFetcher: - """Get or create the manifest fetcher singleton.""" - global _manifest_fetcher - if _manifest_fetcher is None: - _manifest_fetcher = ManifestFetcher() - return _manifest_fetcher + """Get or create the manifest fetcher singleton. + + Uses @lru_cache for thread-safe lazy initialization. + """ + return ManifestFetcher() def create_lb_handler( diff --git a/src/tetra_rp/runtime/manifest_client.py b/src/tetra_rp/runtime/manifest_client.py index db845a63..eb234cbd 100644 --- a/src/tetra_rp/runtime/manifest_client.py +++ b/src/tetra_rp/runtime/manifest_client.py @@ -85,14 +85,15 @@ async def get_manifest(self) -> Dict[str, str]: f"{response.text[:200]}" ) - data = response.json() - if "manifest" not in data: + manifest = response.json() + if not isinstance(manifest, dict) or "resources" not in manifest: raise ManifestServiceUnavailableError( - "Invalid manifest response: missing 'manifest' key" + "Invalid manifest response: missing 'resources'" ) - manifest = data["manifest"] - logger.debug(f"Manifest loaded: {len(manifest)} endpoints") + logger.debug( + f"Manifest loaded: {len(manifest.get('resources', {}))} resources" + ) return manifest except ( diff --git a/tests/integration/test_lb_remote_execution.py b/tests/integration/test_lb_remote_execution.py index 8c45022f..d1413a93 100644 --- a/tests/integration/test_lb_remote_execution.py +++ b/tests/integration/test_lb_remote_execution.py @@ -393,8 +393,10 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): """Test manifest endpoint with LoadBalancerSlsResource.""" from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient + from tetra_rp.runtime.lb_handler import _get_manifest_fetcher monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + _get_manifest_fetcher.cache_clear() # Create test manifest for deployed endpoint test_manifest = { @@ -436,12 +438,16 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): assert response.status_code == 200 assert response.json() == test_manifest + _get_manifest_fetcher.cache_clear() + def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): """Test that /manifest endpoint coexists with /ping health check.""" from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient + from tetra_rp.runtime.lb_handler import _get_manifest_fetcher monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + _get_manifest_fetcher.cache_clear() test_manifest = { "version": "1.0", @@ -465,3 +471,54 @@ def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): ping_response = client.get("/ping") assert ping_response.status_code == 404 # Ping not auto-added by factory + + _get_manifest_fetcher.cache_clear() + + +class TestManifestClientToEndpointIntegration: + """Integration tests for ManifestClient calling GET /manifest endpoint.""" + + def test_manifest_client_can_parse_response(self): + """Test ManifestClient can parse manifest response directly.""" + import asyncio + from unittest.mock import patch, AsyncMock, MagicMock + from tetra_rp.runtime.manifest_client import ManifestClient + + # Create a manifest to simulate + test_manifest = { + "version": "1.0", + "generated_at": "2024-01-15T10:30:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": { + "resource_type": "LoadBalancerSlsResource", + "handler_file": "handler_gpu.py", + "endpoint_url": "https://api.runpod.io/v2/gpu123", + } + }, + "function_registry": {"process_gpu": "gpu_config"}, + } + + async def test_client_parsing(): + # Create a mock httpx client that returns the manifest directly + mock_http_client = AsyncMock() + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = test_manifest + mock_http_client.get = AsyncMock(return_value=mock_response) + + # Create ManifestClient + client = ManifestClient(mothership_url="http://localhost:8000") + + # Mock the _get_client to return our mock + with patch.object(client, "_get_client", return_value=mock_http_client): + # Call get_manifest - should parse the response + result = await client.get_manifest() + + # Verify it successfully parsed the manifest + assert result == test_manifest + assert "gpu_config" in result["resources"] + assert result["function_registry"]["process_gpu"] == "gpu_config" + + # Run the async test + asyncio.run(test_client_parsing()) diff --git a/tests/unit/runtime/test_lb_handler.py b/tests/unit/runtime/test_lb_handler.py index 966a2ab6..1da78a4f 100644 --- a/tests/unit/runtime/test_lb_handler.py +++ b/tests/unit/runtime/test_lb_handler.py @@ -5,7 +5,7 @@ import pytest from fastapi.testclient import TestClient -from tetra_rp.runtime.lb_handler import create_lb_handler +from tetra_rp.runtime.lb_handler import create_lb_handler, _get_manifest_fetcher class TestManifestEndpoint: @@ -13,12 +13,10 @@ class TestManifestEndpoint: @pytest.fixture(autouse=True) def reset_manifest_fetcher(self): - """Reset the global manifest fetcher before each test.""" - import tetra_rp.runtime.lb_handler as lb_handler_module - - lb_handler_module._manifest_fetcher = None + """Reset the manifest fetcher cache before each test.""" + _get_manifest_fetcher.cache_clear() yield - lb_handler_module._manifest_fetcher = None + _get_manifest_fetcher.cache_clear() @pytest.fixture def sample_manifest(self): @@ -172,14 +170,14 @@ def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch client = TestClient(app) response = client.get("/manifest") - data = response.json() + manifest = response.json() - # Verify structure - assert "version" in data - assert "generated_at" in data - assert "project_name" in data - assert "resources" in data - assert "function_registry" in data + # Verify manifest structure + assert "version" in manifest + assert "generated_at" in manifest + assert "project_name" in manifest + assert "resources" in manifest + assert "function_registry" in manifest def test_manifest_endpoint_with_empty_resources(self, monkeypatch): """Test endpoint behavior when manifest has no resources.""" @@ -339,10 +337,10 @@ def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): response = client.get("/manifest") assert response.status_code == 200 - data = response.json() - assert len(data["resources"]) == 2 - assert "gpu_config" in data["resources"] - assert "cpu_config" in data["resources"] + manifest = response.json() + assert len(manifest["resources"]) == 2 + assert "gpu_config" in manifest["resources"] + assert "cpu_config" in manifest["resources"] def test_manifest_endpoint_uses_fetcher_with_caching( self, sample_manifest, monkeypatch diff --git a/tests/unit/runtime/test_manifest_client.py b/tests/unit/runtime/test_manifest_client.py index be48a38c..0578613e 100644 --- a/tests/unit/runtime/test_manifest_client.py +++ b/tests/unit/runtime/test_manifest_client.py @@ -21,11 +21,14 @@ def mock_response(self): response = MagicMock() response.status_code = 200 response.json.return_value = { - "manifest": { - "gpu_config": "https://api.runpod.io/v2/gpu123", - "cpu_config": "https://api.runpod.io/v2/cpu456", + "version": "1.0", + "generated_at": "2025-01-03T12:00:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": {"endpoint_url": "https://api.runpod.io/v2/gpu123"}, + "cpu_config": {"endpoint_url": "https://api.runpod.io/v2/cpu456"}, }, - "updated_at": "2025-01-03T12:00:00Z", + "function_registry": {}, } return response @@ -66,8 +69,18 @@ async def test_get_manifest_success(self, mock_response): manifest = await client.get_manifest() assert manifest == { - "gpu_config": "https://api.runpod.io/v2/gpu123", - "cpu_config": "https://api.runpod.io/v2/cpu456", + "version": "1.0", + "generated_at": "2025-01-03T12:00:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": { + "endpoint_url": "https://api.runpod.io/v2/gpu123" + }, + "cpu_config": { + "endpoint_url": "https://api.runpod.io/v2/cpu456" + }, + }, + "function_registry": {}, } @pytest.mark.asyncio @@ -115,7 +128,11 @@ async def test_get_manifest_retry(self): response = MagicMock() response.status_code = 200 - response.json.return_value = {"manifest": {"gpu": "https://gpu.example.com"}} + response.json.return_value = { + "version": "1.0", + "resources": {"gpu": {"endpoint_url": "https://gpu.example.com"}}, + "function_registry": {}, + } with patch.object(client, "_get_client") as mock_get_client: mock_http_client = AsyncMock() @@ -134,7 +151,11 @@ async def test_get_manifest_retry(self): new_callable=AsyncMock, ): manifest = await client.get_manifest() - assert manifest == {"gpu": "https://gpu.example.com"} + assert manifest == { + "version": "1.0", + "resources": {"gpu": {"endpoint_url": "https://gpu.example.com"}}, + "function_registry": {}, + } assert mock_http_client.get.call_count == 3 @pytest.mark.asyncio From 632fd9ebf6b132cfffde106af71f6629c7b68b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 02:30:09 -0800 Subject: [PATCH 06/12] docs: convert ASCII diagrams to MermaidJS - Local Execution Flow: Shows synchronous path for functions in manifest - Remote Execution Flow: Shows serialization, HTTP, and deserialization steps - Manifest Synchronization: Shows cache-first approach with GQL fallback Uses high-contrast MermaidJS styling with saturated colors and white text for maximum readability as per project guidelines. --- docs/Cross_Endpoint_Routing.md | 191 ++++++++++++++++++++++++++------- 1 file changed, 155 insertions(+), 36 deletions(-) diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index 6d059636..1a4330c6 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -640,46 +640,70 @@ Add new configuration by: #### Local Execution Flow -``` -Function Call - ↓ -ProductionWrapper.wrap_function_execution() - ↓ -ServiceRegistry.get_resource_for_function() - ↓ -Manifest Lookup (resource = None) - ↓ -Local Execution (original_stub_func) - ↓ -Result +```mermaid +flowchart TD + A["Function Call"] + B["ProductionWrapper.wrap_function_execution()"] + C["ServiceRegistry.get_resource_for_function()"] + D["Manifest Lookup
resource = None"] + E["Local Execution
original_stub_func"] + F["Result"] + + A --> B + B --> C + C --> D + D --> E + E --> F + + style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style B fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style C fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style D fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style E fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style F fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff ``` #### Remote Execution Flow -``` -Function Call - ↓ -ProductionWrapper.wrap_function_execution() - ↓ -ServiceRegistry.get_resource_for_function() - ↓ -Manifest Lookup (resource found) - ↓ -Ensure Directory Loaded - ↓ -DirectoryClient.get_endpoints() - ↓ -Get Remote Endpoint URL - ↓ -Serialize Arguments (cloudpickle → base64) - ↓ -HTTP POST to Remote Endpoint - ↓ -Remote Function Execution - ↓ -Deserialize Result (base64 → cloudpickle) - ↓ -Result +```mermaid +flowchart TD + A["Function Call"] + B["ProductionWrapper.wrap_function_execution()"] + C["ServiceRegistry.get_resource_for_function()"] + D["Manifest Lookup
resource found"] + E["Ensure Directory Loaded"] + F["DirectoryClient.get_endpoints()"] + G["Get Remote Endpoint URL"] + H["Serialize Arguments
cloudpickle → base64"] + I["HTTP POST to Remote Endpoint"] + J["Remote Function Execution"] + K["Deserialize Result
base64 → cloudpickle"] + L["Result"] + + A --> B + B --> C + C --> D + D --> E + E --> F + F --> G + G --> H + H --> I + I --> J + J --> K + K --> L + + style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style B fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style C fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style D fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style E fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style F fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style G fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style H fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff + style I fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style J fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style K fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff + style L fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff ``` ### Design Decisions @@ -920,6 +944,101 @@ client = DirectoryClient(mothership_url=...) endpoints = await client.get_endpoints() ``` +## Manifest Synchronization with RunPod GraphQL API + +### Overview + +The Mothership's GET /manifest endpoint pulls configuration from RunPod's GraphQL API, +which serves as the single source of truth for manifest data. This enables centralized +configuration management and ensures all child endpoints receive consistent routing +information. + +### Architecture + +```mermaid +flowchart TD + A["Child Endpoint
GET /manifest"] + B["Mothership"] + C["ManifestFetcher"] + D{Cache Valid?} + E["Serve Cached
Manifest"] + F["Fetch from RunPod
GraphQL API"] + G["Update
flash_manifest.json"] + H["Cache Result
TTL: 300s"] + I["Serve Manifest"] + J["Fallback:
Load Local File"] + + A -->|Request| B + B --> C + C --> D + D -->|Yes| E + D -->|No| F + E --> I + F --> G + G --> H + H --> I + F -->|Fails| J + J --> I + + style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style B fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style C fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style D fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff + style E fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style F fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style G fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff + style H fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style I fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style J fill:#d32f2f,stroke:#b71c1c,stroke-width:3px,color:#fff +``` + +### How It Works + +1. **Source of Truth**: RunPod GraphQL API holds the authoritative manifest configuration +2. **Caching Proxy**: Mothership fetches from RunPod GQL, caches locally (5 min TTL) +3. **Local Persistence**: Fetched manifest written to `flash_manifest.json` +4. **Graceful Fallback**: If RunPod GQL unavailable, serves local file +5. **Cache Invalidation**: Automatic expiry after TTL, manual invalidation supported + +### Implementation Status + +**Current (Placeholder)**: +- `ManifestFetcher` class with caching infrastructure +- Uses existing `RunpodGraphQLClient` for API communication +- Falls back to local `flash_manifest.json` (GQL fetch raises `NotImplementedError`) +- Cache TTL: 300 seconds (configurable) + +**Future (Full Implementation)**: +- Implement `getManifest` query in `ManifestFetcher._fetch_from_gql()` +- Add `saveManifest` mutation for updating manifest in RunPod +- Real-time cache invalidation via webhooks +- Health checks and retry logic + +### Configuration + +```bash +# Enable Mothership mode (required for /manifest endpoint) +export FLASH_IS_MOTHERSHIP=true + +# Optional: Identify this mothership instance +export RUNPOD_ENDPOINT_ID=mothership-prod-1 + +# Required for RunPod GraphQL API access +export RUNPOD_API_KEY=your-api-key-here +``` + +### Cache Behavior + +- **Default TTL**: 300 seconds (5 minutes) +- **Cache Key**: Per-mothership instance (no cross-instance cache) +- **Thread-Safe**: Uses `asyncio.Lock` for concurrent request handling +- **Manual Invalidation**: `fetcher.invalidate_cache()` for testing + +### Historical Context + +A previous `StateManagerClient` (commit b19bf7c) used REST API. Current placeholder +prepares for GQL-based architecture with improved caching and error handling. + ## Key Implementation Highlights ### Design Focus From 375f1caa9add3408a755c44a08bbb10536dbc45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 04:03:44 -0800 Subject: [PATCH 07/12] feat: add ManifestFetcher for caching manifest from RunPod GraphQL - Add ManifestFetcher class with caching infrastructure (TTL: 300s) - Integrate ManifestFetcher into lb_handler /manifest endpoint - Use RunpodGraphQLClient for API communication - Fall back to local flash_manifest.json when API unavailable - Add comprehensive tests for ManifestFetcher and lb_handler --- src/tetra_rp/runtime/lb_handler.py | 28 ++- src/tetra_rp/runtime/manifest_fetcher.py | 192 ++++++++++++++++++ tests/integration/test_lb_remote_execution.py | 41 ++-- tests/unit/runtime/test_lb_handler.py | 148 +++++++++++--- tests/unit/runtime/test_manifest_fetcher.py | 164 +++++++++++++++ 5 files changed, 523 insertions(+), 50 deletions(-) create mode 100644 src/tetra_rp/runtime/manifest_fetcher.py create mode 100644 tests/unit/runtime/test_manifest_fetcher.py diff --git a/src/tetra_rp/runtime/lb_handler.py b/src/tetra_rp/runtime/lb_handler.py index 4193dd02..495261d2 100644 --- a/src/tetra_rp/runtime/lb_handler.py +++ b/src/tetra_rp/runtime/lb_handler.py @@ -23,12 +23,12 @@ import inspect import logging import os -from typing import Any, Callable, Dict +from typing import Any, Callable, Dict, Optional from fastapi import FastAPI, Request from fastapi.responses import JSONResponse -from .generic_handler import load_manifest +from .manifest_fetcher import ManifestFetcher from .serialization import ( deserialize_args, deserialize_kwargs, @@ -37,6 +37,17 @@ logger = logging.getLogger(__name__) +# Module-level manifest fetcher (singleton, reused across requests) +_manifest_fetcher: Optional[ManifestFetcher] = None + + +def _get_manifest_fetcher() -> ManifestFetcher: + """Get or create the manifest fetcher singleton.""" + global _manifest_fetcher + if _manifest_fetcher is None: + _manifest_fetcher = ManifestFetcher() + return _manifest_fetcher + def create_lb_handler( route_registry: Dict[tuple[str, str], Callable], include_execute: bool = False @@ -178,20 +189,27 @@ async def execute_remote_function(request: Request) -> Dict[str, Any]: async def get_manifest() -> JSONResponse: """Mothership discovery endpoint. - Returns the flash_manifest.json content for service discovery. + Fetches manifest from RunPod GraphQL API (source of truth), caches it + locally, and serves to child endpoints. Falls back to local file if + RunPod API is unavailable. + Only available when FLASH_IS_MOTHERSHIP=true environment variable is set. Returns: JSONResponse with manifest content or 404 if not found """ - manifest_dict = load_manifest() + fetcher = _get_manifest_fetcher() + mothership_id = os.getenv("RUNPOD_ENDPOINT_ID") + + # Fetch manifest (from cache, RunPod GQL, or local file) + manifest_dict = await fetcher.get_manifest(mothership_id) if not manifest_dict or not manifest_dict.get("resources"): return JSONResponse( status_code=404, content={ "error": "Manifest not found", - "detail": "flash_manifest.json could not be loaded", + "detail": "Could not load manifest from RunPod or local file", }, ) diff --git a/src/tetra_rp/runtime/manifest_fetcher.py b/src/tetra_rp/runtime/manifest_fetcher.py new file mode 100644 index 00000000..8815add7 --- /dev/null +++ b/src/tetra_rp/runtime/manifest_fetcher.py @@ -0,0 +1,192 @@ +"""Manifest fetcher with RunPod GQL integration and caching. + +This module provides manifest fetching from RunPod GraphQL API (source of truth) +with local file caching and fallback. +""" + +import asyncio +import json +import logging +import time +from pathlib import Path +from typing import Any, Dict, Optional + +from .config import DEFAULT_CACHE_TTL +from .generic_handler import load_manifest + +logger = logging.getLogger(__name__) + + +class ManifestFetcher: + """Fetches and caches manifest from RunPod GraphQL API. + + RunPod's GraphQL API is the source of truth for manifest data. This + fetcher pulls from it using RunpodGraphQLClient, caches locally, and + falls back to local file if RunPod API is unavailable. + """ + + def __init__( + self, + cache_ttl: int = DEFAULT_CACHE_TTL, + manifest_path: Optional[Path] = None, + ): + """Initialize manifest fetcher. + + Args: + cache_ttl: Cache time-to-live in seconds (default: 300) + manifest_path: Optional path to local manifest file + """ + self.cache_ttl = cache_ttl + self.manifest_path = manifest_path + + # Cache state + self._cached_manifest: Optional[Dict[str, Any]] = None + self._cache_loaded_at: float = 0 + self._cache_lock = asyncio.Lock() + + async def get_manifest( + self, + mothership_id: Optional[str] = None, + ) -> Optional[Dict[str, Any]]: + """Get manifest from cache or fetch from RunPod GraphQL API. + + Flow: + 1. Check if cached and not expired → return cached + 2. If expired/not cached → fetch from RunPod GraphQL API + 3. Update local flash_manifest.json with fetched data + 4. Cache the result + 5. Return manifest + + If RunPod GQL fetch fails, falls back to local file. + + Args: + mothership_id: Optional mothership endpoint ID for tracking + + Returns: + Manifest dictionary or None if unavailable + """ + async with self._cache_lock: + now = time.time() + cache_age = now - self._cache_loaded_at + + # Return cached if still valid + if self._cached_manifest and cache_age < self.cache_ttl: + logger.debug( + f"Serving cached manifest (age: {cache_age:.1f}s, " + f"TTL: {self.cache_ttl}s)" + ) + return self._cached_manifest + + # Cache expired or not loaded - fetch from RunPod GQL + logger.debug("Cache expired or empty, fetching from RunPod GraphQL API") + + try: + # Fetch from RunPod GraphQL API (placeholder) + manifest = await self._fetch_from_gql(mothership_id) + + # Update local flash_manifest.json + if manifest: + self._update_local_file(manifest) + + # Update cache + self._cached_manifest = manifest + self._cache_loaded_at = now + + logger.info( + f"Manifest fetched from RunPod GQL and cached " + f"({len(manifest.get('resources', {}))} resources)" + ) + return manifest + + except NotImplementedError: + logger.debug( + "RunPod GQL fetch not implemented, falling back to local file" + ) + except Exception as e: + logger.warning( + f"RunPod GQL fetch failed: {e}, falling back to local file" + ) + + # Fallback: load from local file + manifest = load_manifest(self.manifest_path) + if manifest: + # Cache the fallback manifest + self._cached_manifest = manifest + self._cache_loaded_at = now + logger.debug("Loaded and cached manifest from local file") + + return manifest + + async def _fetch_from_gql( + self, + mothership_id: Optional[str] = None, + ) -> Dict[str, Any]: + """Fetch manifest from RunPod GraphQL API. + + TBD: Future implementation will query RunPod's GraphQL API + to retrieve the manifest configuration. + + Args: + mothership_id: Optional mothership endpoint ID + + Returns: + Manifest dictionary from RunPod GQL + + Raises: + NotImplementedError: Placeholder for future implementation + + Note: + Future implementation will use RunpodGraphQLClient: + + ```python + async with RunpodGraphQLClient() as client: + query = ''' + query GetManifest($mothershipId: ID!) { + getManifest(mothershipId: $mothershipId) { + version + projectName + generatedAt + resources + functionRegistry + } + } + ''' + result = await client.execute(query, {"mothershipId": mothership_id}) + return result["data"]["getManifest"] + ``` + """ + raise NotImplementedError( + "RunPod manifest query not yet implemented. " + "Falling back to local flash_manifest.json file." + ) + + def _update_local_file(self, manifest: Dict[str, Any]) -> None: + """Update local flash_manifest.json with fetched data. + + Args: + manifest: Manifest dictionary to write + """ + try: + # Determine file path + if self.manifest_path: + file_path = self.manifest_path + else: + file_path = Path.cwd() / "flash_manifest.json" + + # Write manifest to file + with open(file_path, "w") as f: + json.dump(manifest, f, indent=2) + + logger.debug(f"Updated local manifest file: {file_path}") + + except Exception as e: + logger.warning(f"Failed to update local manifest file: {e}") + # Non-critical error - cached manifest still valid + + def invalidate_cache(self) -> None: + """Manually invalidate the cache. + + Next get_manifest() call will fetch from GQL. + """ + self._cache_loaded_at = 0 + logger.debug("Manifest cache invalidated") diff --git a/tests/integration/test_lb_remote_execution.py b/tests/integration/test_lb_remote_execution.py index 2aca85d0..8c45022f 100644 --- a/tests/integration/test_lb_remote_execution.py +++ b/tests/integration/test_lb_remote_execution.py @@ -309,9 +309,18 @@ def get_status(): class TestManifestEndpointIntegration: """Integration tests for GET /manifest endpoint.""" + @pytest.fixture(autouse=True) + def reset_manifest_fetcher(self): + """Reset the global manifest fetcher before each test.""" + import tetra_rp.runtime.lb_handler as lb_handler_module + + lb_handler_module._manifest_fetcher = None + yield + lb_handler_module._manifest_fetcher = None + def test_manifest_endpoint_in_live_load_balancer(self, monkeypatch): """Test manifest endpoint in LiveLoadBalancer with FLASH_IS_MOTHERSHIP=true.""" - from unittest.mock import patch + from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") @@ -349,10 +358,12 @@ async def hello(): "routes": {"test-mothership": {"GET /api/hello": "hello"}}, } - # Mock load_manifest to return test manifest - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest - ): + # Mock ManifestFetcher to return test manifest + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=test_manifest) + MockFetcher.return_value = mock_fetcher + from tetra_rp.runtime.lb_handler import create_lb_handler # Create handler with manifest endpoint enabled @@ -380,7 +391,7 @@ def test_manifest_endpoint_excluded_when_env_not_set(self): def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): """Test manifest endpoint with LoadBalancerSlsResource.""" - from unittest.mock import patch + from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") @@ -409,9 +420,11 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): "function_registry": {"process_image": "gpu-worker"}, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=test_manifest) + MockFetcher.return_value = mock_fetcher + from tetra_rp.runtime.lb_handler import create_lb_handler # Create deployed handler (not LiveLoadBalancer) @@ -425,7 +438,7 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): """Test that /manifest endpoint coexists with /ping health check.""" - from unittest.mock import patch + from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") @@ -436,9 +449,11 @@ def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): "function_registry": {}, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=test_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=test_manifest) + MockFetcher.return_value = mock_fetcher + from tetra_rp.runtime.lb_handler import create_lb_handler app = create_lb_handler({}, include_execute=False) diff --git a/tests/unit/runtime/test_lb_handler.py b/tests/unit/runtime/test_lb_handler.py index e02c6aa0..966a2ab6 100644 --- a/tests/unit/runtime/test_lb_handler.py +++ b/tests/unit/runtime/test_lb_handler.py @@ -11,6 +11,15 @@ class TestManifestEndpoint: """Tests for GET /manifest endpoint.""" + @pytest.fixture(autouse=True) + def reset_manifest_fetcher(self): + """Reset the global manifest fetcher before each test.""" + import tetra_rp.runtime.lb_handler as lb_handler_module + + lb_handler_module._manifest_fetcher = None + yield + lb_handler_module._manifest_fetcher = None + @pytest.fixture def sample_manifest(self): """Sample manifest for testing.""" @@ -42,11 +51,15 @@ def test_manifest_endpoint_registered_when_env_var_true( self, sample_manifest, monkeypatch ): """Verify /manifest endpoint exists when FLASH_IS_MOTHERSHIP=true.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) routes = [route.path for route in app.routes] @@ -77,11 +90,15 @@ def test_manifest_endpoint_returns_200_with_valid_manifest( self, sample_manifest, monkeypatch ): """Test happy path - endpoint returns 200 with valid manifest.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -92,9 +109,15 @@ def test_manifest_endpoint_returns_200_with_valid_manifest( def test_manifest_endpoint_returns_404_when_manifest_missing(self, monkeypatch): """Test endpoint returns 404 when manifest file not found.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch("tetra_rp.runtime.lb_handler.load_manifest", return_value={}): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value={}) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -103,19 +126,22 @@ def test_manifest_endpoint_returns_404_when_manifest_missing(self, monkeypatch): assert response.status_code == 404 data = response.json() assert data["error"] == "Manifest not found" - assert "could not be loaded" in data["detail"] + assert "Could not load" in data["detail"] def test_manifest_endpoint_case_insensitive_env_var_true( self, sample_manifest, monkeypatch ): """Test endpoint registration with different case variations of 'true'.""" + from unittest.mock import AsyncMock + for env_value in ["True", "TRUE", "TrUe"]: monkeypatch.setenv("FLASH_IS_MOTHERSHIP", env_value) - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", - return_value=sample_manifest, - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) routes = [route.path for route in app.routes] @@ -133,11 +159,15 @@ def test_manifest_endpoint_case_insensitive_env_var_false(self, monkeypatch): def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch): """Test that manifest response has correct structure.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -153,6 +183,8 @@ def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch def test_manifest_endpoint_with_empty_resources(self, monkeypatch): """Test endpoint behavior when manifest has no resources.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") empty_manifest = { @@ -162,9 +194,11 @@ def test_manifest_endpoint_with_empty_resources(self, monkeypatch): "function_registry": {}, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=empty_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=empty_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -174,10 +208,16 @@ def test_manifest_endpoint_with_empty_resources(self, monkeypatch): assert response.status_code == 404 def test_manifest_endpoint_with_none_manifest(self, monkeypatch): - """Test endpoint behavior when load_manifest returns None.""" + """Test endpoint behavior when get_manifest returns None.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch("tetra_rp.runtime.lb_handler.load_manifest", return_value=None): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=None) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -189,11 +229,15 @@ def test_manifest_endpoint_coexists_with_execute( self, sample_manifest, monkeypatch ): """Test that /manifest endpoint coexists with /execute endpoint.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=True) routes = [route.path for route in app.routes] @@ -204,6 +248,8 @@ def test_manifest_endpoint_coexists_with_user_routes( self, sample_manifest, monkeypatch ): """Test that /manifest endpoint coexists with user-defined routes.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") async def dummy_handler(): @@ -211,9 +257,11 @@ async def dummy_handler(): route_registry = {("GET", "/api/health"): dummy_handler} - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler(route_registry, include_execute=False) routes = [route.path for route in app.routes] @@ -222,11 +270,15 @@ async def dummy_handler(): def test_manifest_endpoint_content_type(self, sample_manifest, monkeypatch): """Test that /manifest endpoint returns proper JSON content-type.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=sample_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -236,6 +288,8 @@ def test_manifest_endpoint_content_type(self, sample_manifest, monkeypatch): def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): """Test endpoint with complex multi-resource manifest.""" + from unittest.mock import AsyncMock + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") complex_manifest = { @@ -274,9 +328,11 @@ def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): }, } - with patch( - "tetra_rp.runtime.lb_handler.load_manifest", return_value=complex_manifest - ): + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=complex_manifest) + MockFetcher.return_value = mock_fetcher + app = create_lb_handler({}, include_execute=False) client = TestClient(app) @@ -288,6 +344,34 @@ def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): assert "gpu_config" in data["resources"] assert "cpu_config" in data["resources"] + def test_manifest_endpoint_uses_fetcher_with_caching( + self, sample_manifest, monkeypatch + ): + """Verify GET /manifest uses ManifestFetcher with caching.""" + from unittest.mock import AsyncMock + + monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + + with patch("tetra_rp.runtime.lb_handler.ManifestFetcher") as MockFetcher: + mock_fetcher = AsyncMock() + mock_fetcher.get_manifest = AsyncMock(return_value=sample_manifest) + MockFetcher.return_value = mock_fetcher + + app = create_lb_handler({}, include_execute=False) + client = TestClient(app) + + # First request + response1 = client.get("/manifest") + assert response1.status_code == 200 + assert response1.json() == sample_manifest + + # Second request - should reuse fetcher + response2 = client.get("/manifest") + assert response2.status_code == 200 + + # Verify fetcher was called (once per request) + assert mock_fetcher.get_manifest.call_count == 2 + class TestExecuteEndpointStillWorks: """Tests to ensure /execute endpoint still works after manifest changes.""" diff --git a/tests/unit/runtime/test_manifest_fetcher.py b/tests/unit/runtime/test_manifest_fetcher.py new file mode 100644 index 00000000..f7ae27a1 --- /dev/null +++ b/tests/unit/runtime/test_manifest_fetcher.py @@ -0,0 +1,164 @@ +"""Unit tests for ManifestFetcher.""" + +import asyncio +import json +from pathlib import Path +from unittest.mock import patch + +import pytest + +from tetra_rp.runtime.manifest_fetcher import ManifestFetcher + + +class TestManifestFetcher: + """Test ManifestFetcher caching and GQL integration.""" + + @pytest.fixture + def sample_manifest(self): + """Sample manifest for testing.""" + return { + "version": "1.0", + "project_name": "test-app", + "resources": {"gpu_config": {"resource_type": "ServerlessEndpoint"}}, + "function_registry": {"process_gpu": "gpu_config"}, + } + + @pytest.mark.asyncio + async def test_fetch_falls_back_to_local_file_when_gql_not_implemented( + self, sample_manifest, tmp_path + ): + """Verify fetcher falls back to local file when GQL raises NotImplementedError.""" + # Write sample manifest to temp file + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(manifest_path=manifest_file) + result = await fetcher.get_manifest() + + assert result == sample_manifest + + @pytest.mark.asyncio + async def test_caching_prevents_multiple_fetches(self, sample_manifest, tmp_path): + """Verify cached manifest is reused within TTL.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(cache_ttl=300, manifest_path=manifest_file) + + # First call - loads from file + result1 = await fetcher.get_manifest() + assert result1 == sample_manifest + + # Second call immediately - should use cache + result2 = await fetcher.get_manifest() + assert result2 == sample_manifest + assert result2 is result1 # Same object reference (cached) + + @pytest.mark.asyncio + async def test_cache_expiration_triggers_refetch(self, sample_manifest, tmp_path): + """Verify expired cache triggers new fetch.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + # Very short TTL + fetcher = ManifestFetcher(cache_ttl=0.1, manifest_path=manifest_file) + + # First call + result1 = await fetcher.get_manifest() + assert result1 == sample_manifest + + # Wait for cache to expire + await asyncio.sleep(0.2) + + # Second call - cache expired, should refetch + result2 = await fetcher.get_manifest() + assert result2 == sample_manifest + + @pytest.mark.asyncio + async def test_fetch_from_gql_raises_not_implemented(self): + """Verify GQL fetch placeholder raises NotImplementedError.""" + fetcher = ManifestFetcher() + + with pytest.raises(NotImplementedError, match="not yet implemented"): + await fetcher._fetch_from_gql() + + @pytest.mark.asyncio + async def test_update_local_file_writes_manifest(self, sample_manifest, tmp_path): + """Verify manifest is written to local file.""" + manifest_file = tmp_path / "flash_manifest.json" + fetcher = ManifestFetcher(manifest_path=manifest_file) + + fetcher._update_local_file(sample_manifest) + + # Verify file was written + assert manifest_file.exists() + with open(manifest_file) as f: + written = json.load(f) + assert written == sample_manifest + + @pytest.mark.asyncio + async def test_cache_invalidation(self, sample_manifest, tmp_path): + """Verify manual cache invalidation works.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(cache_ttl=300, manifest_path=manifest_file) + + # Load and cache + await fetcher.get_manifest() + assert fetcher._cached_manifest is not None + + # Invalidate + fetcher.invalidate_cache() + + # Next call should refetch (cache_loaded_at is 0) + assert fetcher._cache_loaded_at == 0 + + @pytest.mark.asyncio + async def test_concurrent_requests_use_lock(self, sample_manifest, tmp_path): + """Verify concurrent requests are properly synchronized.""" + manifest_file = tmp_path / "flash_manifest.json" + with open(manifest_file, "w") as f: + json.dump(sample_manifest, f) + + fetcher = ManifestFetcher(manifest_path=manifest_file) + + # Make multiple concurrent requests + results = await asyncio.gather( + fetcher.get_manifest(), + fetcher.get_manifest(), + fetcher.get_manifest(), + ) + + # All should return the same manifest + assert all(r == sample_manifest for r in results) + + @pytest.mark.asyncio + async def test_handles_missing_local_file_gracefully(self): + """Verify fetcher handles missing local file gracefully.""" + # Point to non-existent file + fetcher = ManifestFetcher(manifest_path=Path("/nonexistent/manifest.json")) + + # Should fall back to loading from cwd (which also won't exist in test) + result = await fetcher.get_manifest() + + # load_manifest returns empty dict when no file is found + assert result == {"resources": {}, "function_registry": {}} + + @pytest.mark.asyncio + async def test_mothership_id_passed_to_gql(self): + """Verify mothership_id is passed through to GQL fetch.""" + fetcher = ManifestFetcher() + + # Spy on _fetch_from_gql to capture arguments + with patch.object(fetcher, "_fetch_from_gql") as mock_fetch: + mock_fetch.side_effect = NotImplementedError() + + await fetcher.get_manifest(mothership_id="test-123") + + # Verify mothership_id was passed to fetch + mock_fetch.assert_called_once_with("test-123") From 1da8ee2c095e9b06853f19a7b2a23d2a541d5f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 08:38:00 -0800 Subject: [PATCH 08/12] refactor: rename directory terminology to manifest throughout codebase - Rename _directory to _endpoint_registry in ServiceRegistry - Rename directory_client parameter to manifest_client - Change API endpoint from /directory to /manifest - Change JSON response key from "directory" to "manifest" - Update _ensure_directory_loaded() to _ensure_manifest_loaded() - Update refresh_directory() to refresh_manifest() - Update all tests and documentation to reflect new terminology --- docs/Cross_Endpoint_Routing.md | 137 +++++++++--------- docs/Load_Balancer_Endpoints.md | 6 +- src/tetra_rp/runtime/config.py | 2 +- src/tetra_rp/runtime/exceptions.py | 2 +- src/tetra_rp/runtime/manifest_client.py | 33 ++--- src/tetra_rp/runtime/production_wrapper.py | 9 +- src/tetra_rp/runtime/service_registry.py | 64 ++++---- .../test_cross_endpoint_routing.py | 57 ++++---- tests/unit/runtime/test_manifest_client.py | 30 ++-- tests/unit/runtime/test_production_wrapper.py | 10 +- tests/unit/runtime/test_service_registry.py | 114 ++++++++------- 11 files changed, 234 insertions(+), 230 deletions(-) diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index 1a4330c6..341d4b09 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -59,7 +59,7 @@ The manifest structure: #### 2. Set Environment Variables -Configure the mothership directory URL (required for remote routing): +Configure the mothership manifest URL (required for remote routing): ```bash # Required for cross-endpoint routing to work @@ -149,7 +149,7 @@ The manifest file (`flash_manifest.json`) defines function routing and resource | Variable | Required | Purpose | |----------|----------|---------| -| `FLASH_MOTHERSHIP_URL` | Yes* | URL of mothership directory service | +| `FLASH_MOTHERSHIP_URL` | Yes* | URL of mothership manifest service | | `RUNPOD_ENDPOINT_ID` | No | Current endpoint ID (for tracing) | | `FLASH_MANIFEST_PATH` | No | Explicit path to manifest file | @@ -255,7 +255,7 @@ Functions gracefully fall back to local execution if routing fails: async def critical_service(request: dict) -> dict: # Routes to critical-endpoint if: # - In function_registry - # - Directory available + # - Manifest available # Otherwise executes locally return handle_critical(request) @@ -269,11 +269,11 @@ async def helper_function(x: int) -> int: #### Common Issues -**Directory Unavailable** +**Manifest Service Unavailable** If `FLASH_MOTHERSHIP_URL` is not set or unreachable: ``` -WARNING: FLASH_MOTHERSHIP_URL not set, directory unavailable +WARNING: FLASH_MOTHERSHIP_URL not set, manifest service unavailable ``` Functions default to local execution. Set the environment variable to enable routing. @@ -342,8 +342,8 @@ graph TD A["Function Call"] -->|"intercepts stub layer"| B["ProductionWrapper"] B -->|"load service configuration"| C["ServiceRegistry"] - C -->|"if not cached"| D["DirectoryClient"] - D -->|"query mothership API"| E["Directory
Endpoint URLs"] + C -->|"if not cached"| D["ManifestClient"] + D -->|"query mothership API"| E["Manifest
Endpoint URLs"] E -->|"cache result
TTL 300s"| C C -->|"lookup in manifest
flash_manifest.json"| F{"Routing
Decision"} @@ -358,7 +358,7 @@ graph TD K --> L["Return Response
base64 → cloudpickle"] L --> M["Deserialized Result"] - N["Error Handling:
- RemoteExecutionError
- SerializationError
- DirectoryUnavailableError"] -.-> H + N["Error Handling:
- RemoteExecutionError
- SerializationError
- ManifestServiceUnavailableError"] -.-> H N -.-> I N -.-> J @@ -405,8 +405,8 @@ class ProductionWrapper: **kwargs: Any, ) -> Any: """Route function execution to local or remote endpoint.""" - # 1. Load directory (if needed) - await self.service_registry._ensure_directory_loaded() + # 1. Load manifest (if needed) + await self.service_registry._ensure_manifest_loaded() # 2. Look up function in manifest resource = self.service_registry.get_resource_for_function(func.__name__) @@ -450,30 +450,29 @@ class ServiceRegistry: """Service discovery and routing for cross-endpoint function calls.""" def __init__(self, manifest_path: Optional[Path] = None): - """Initialize with manifest and optional directory client.""" + """Initialize with manifest and optional manifest client.""" self._load_manifest(manifest_path) - self._directory_client = DirectoryClient(...) - self._directory = {} # Cached endpoint URLs - self._directory_lock = asyncio.Lock() + self._manifest_client = ManifestClient(...) + self._endpoint_registry = {} # Cached endpoint URLs + self._endpoint_registry_lock = asyncio.Lock() def get_resource_for_function(self, func_name: str) -> Optional[ServerlessResource]: """Get resource config for function from manifest.""" - # Returns None if: - # - Function not in manifest - # - Explicitly set to null in manifest - - # Returns ServerlessResource if mapped in manifest - config = self._manifest["functions"].get(func_name) + # Returns the ServerlessResource if function is mapped in manifest + # Returns None if function maps to current endpoint + # Raises ValueError if function not found in manifest + config = self._manifest.function_registry.get(func_name) return self._resolve_resource(config) - async def _ensure_directory_loaded(self) -> None: - """Load directory from mothership with caching (TTL 300s).""" - if self._is_directory_fresh(): - return + async def _ensure_manifest_loaded(self) -> None: + """Load manifest from mothership if cache expired or not loaded.""" + async with self._endpoint_registry_lock: + now = time.time() + cache_age = now - self._endpoint_registry_loaded_at - async with self._directory_lock: - self._directory = await self._directory_client.get_directory() - self._directory_loaded_at = time.time() + if cache_age > self.cache_ttl: + self._endpoint_registry = await self._manifest_client.get_manifest() + self._endpoint_registry_loaded_at = now ``` **Manifest Format**: @@ -499,36 +498,36 @@ class ServiceRegistry: - `function_registry`: Maps function names to resource config names (null = local) - `resources`: Defines resource configurations and their handler details -**Directory Cache**: +**Manifest Cache**: - TTL: 300 seconds (configurable via `DEFAULT_CACHE_TTL`) - Thread-safe with `asyncio.Lock()` -- Graceful fallback if directory unavailable +- Graceful fallback if manifest service unavailable -#### 3. DirectoryClient +#### 3. ManifestClient -**Location**: `src/tetra_rp/runtime/directory_client.py` +**Location**: `src/tetra_rp/runtime/manifest_client.py` -HTTP client for mothership directory service: +HTTP client for mothership manifest service: ```python -class DirectoryClient: - """HTTP client for querying mothership directory. +class ManifestClient: + """HTTP client for querying mothership manifest. - The directory maps resource_config names to their endpoint URLs. + The manifest maps resource_config names to their endpoint URLs. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} """ - async def get_directory(self) -> Dict[str, str]: - """Fetch endpoint directory from mothership. + async def get_manifest(self) -> Dict[str, str]: + """Fetch endpoint manifest from mothership. Returns: Dictionary mapping resource_config_name → endpoint_url. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} Raises: - DirectoryUnavailableError: If directory service unavailable after retries. + ManifestServiceUnavailableError: If manifest service unavailable after retries. """ - # Queries {mothership_url}/directory endpoint with retry logic + # Queries {mothership_url}/manifest endpoint with retry logic ``` **Configuration**: @@ -561,8 +560,8 @@ class ManifestError(FlashRuntimeError): """Raised when manifest is invalid, missing, or has unexpected structure.""" pass -class DirectoryUnavailableError(FlashRuntimeError): - """Raised when directory service is unavailable.""" +class ManifestServiceUnavailableError(FlashRuntimeError): + """Raised when manifest service is unavailable.""" pass ``` @@ -576,8 +575,8 @@ except SerializationError as e: logger.error(f"Serialization failed: {e}") except ManifestError as e: logger.error(f"Manifest configuration error: {e}") -except DirectoryUnavailableError as e: - logger.warning(f"Directory unavailable, using fallback") +except ManifestServiceUnavailableError as e: + logger.warning(f"Manifest unavailable, using fallback") ``` ### Integration Points @@ -613,7 +612,7 @@ Functions retrieve remote endpoint info from ResourceManager: # ServiceRegistry uses ResourceManager to find endpoint URLs resource_manager = ResourceManager() endpoint = resource_manager.get_resource_for_function("function_name") -endpoint_url = endpoint.url # e.g., "https://api.runpod.io/v1/abc123" +endpoint_url = endpoint.url # e.g., "https://api.runpod.io/v2/abc123" ``` ### Configuration @@ -671,8 +670,8 @@ flowchart TD B["ProductionWrapper.wrap_function_execution()"] C["ServiceRegistry.get_resource_for_function()"] D["Manifest Lookup
resource found"] - E["Ensure Directory Loaded"] - F["DirectoryClient.get_endpoints()"] + E["Ensure Manifest Loaded"] + F["ManifestClient.get_manifest()"] G["Get Remote Endpoint URL"] H["Serialize Arguments
cloudpickle → base64"] I["HTTP POST to Remote Endpoint"] @@ -720,11 +719,11 @@ flowchart TD #### 2. Thread-Safe Async Caching -**Decision**: Use `asyncio.Lock()` for directory cache synchronization +**Decision**: Use `asyncio.Lock()` for manifest cache synchronization **Rationale**: - Prevents thundering herd on cache expiry -- Efficient - only one coroutine loads directory +- Efficient - only one coroutine loads manifest - Simple to understand and maintain - Follows async/await patterns @@ -740,12 +739,12 @@ flowchart TD #### 4. Graceful Fallback -**Decision**: Default to local execution if directory unavailable +**Decision**: Default to local execution if manifest service unavailable **Rationale**: - Maintains application resilience - Doesn't fail if mothership unreachable -- Allows local testing without directory +- Allows local testing without manifest service - Gradual degradation vs catastrophic failure #### 5. Transparent Routing @@ -779,15 +778,15 @@ class JsonSerializer: 2. Update ProductionWrapper to select serializer based on config 3. Add tests for new format -#### Adding New Directory Backends +#### Adding New Manifest Backends To support directories other than mothership: -1. Create client class with `get_directory()` method: +1. Create client class with `get_manifest()` method: ```python -class CustomDirectoryClient: - async def get_directory(self) -> Dict[str, str]: - """Fetch directory mapping resource_config_name → endpoint_url.""" +class CustomManifestClient: + async def get_manifest(self) -> Dict[str, str]: + """Fetch manifest mapping resource_config_name → endpoint_url.""" # Implementation specific to backend return {"resource_name": "https://endpoint.url"} ``` @@ -796,11 +795,11 @@ class CustomDirectoryClient: ```python registry = ServiceRegistry( manifest_path=Path("manifest.json"), - directory_client=CustomDirectoryClient(...) + manifest_client=CustomManifestClient(...) ) ``` -3. Update environment variable handling if needed (CustomDirectoryClient can read from env vars) +3. Update environment variable handling if needed (CustomManifestClient can read from env vars) #### Adding Routing Policies @@ -830,11 +829,11 @@ class RoutingPolicy: **ServiceRegistry Tests** (`tests/unit/runtime/test_service_registry.py`): - Manifest loading - Resource lookup -- Directory caching +- Manifest caching - TTL expiry - Lock behavior under concurrency -**DirectoryClient Tests** (`tests/unit/runtime/test_directory_client.py`): +**ManifestClient Tests** (`tests/unit/runtime/test_manifest_client.py`): - Successful HTTP requests - Error handling - Retry logic @@ -855,7 +854,7 @@ class RoutingPolicy: - End-to-end remote execution - Function call across endpoints - Error handling in real scenarios -- Directory caching behavior +- Manifest caching behavior - Serialization of complex objects #### Test Patterns @@ -904,7 +903,7 @@ logging.basicConfig(level=logging.DEBUG) # ProductionWrapper logs # ServiceRegistry logs -# DirectoryClient logs +# ManifestClient logs ``` #### Common Debug Scenarios @@ -914,8 +913,8 @@ logging.basicConfig(level=logging.DEBUG) # Check manifest print(registry._manifest) -# Check directory -print(registry._directory) +# Check cached endpoint URLs +print(registry._endpoint_registry) # Check resource lookup resource = registry.get_resource_for_function("function_name") @@ -932,16 +931,16 @@ except Exception as e: print(f"Not serializable: {e}") ``` -**Directory unavailable**: +**Manifest unavailable**: ```python # Check environment variables import os print(f"FLASH_MOTHERSHIP_URL: {os.getenv('FLASH_MOTHERSHIP_URL')}") print(f"RUNPOD_ENDPOINT_ID: {os.getenv('RUNPOD_ENDPOINT_ID')}") -# Check directory client directly -client = DirectoryClient(mothership_url=...) -endpoints = await client.get_endpoints() +# Check manifest client directly +client = ManifestClient(mothership_url=...) +endpoints = await client.get_manifest() ``` ## Manifest Synchronization with RunPod GraphQL API @@ -1044,7 +1043,7 @@ prepares for GQL-based architecture with improved caching and error handling. ### Design Focus 1. **Transparent Routing**: Functions route automatically without code changes -2. **Graceful Degradation**: Defaults to local execution if directory unavailable +2. **Graceful Degradation**: Defaults to local execution if manifest service unavailable 3. **Type Safety**: Full type hints throughout for IDE support and static analysis 4. **Thread-Safe Async**: Proper `asyncio.Lock()` usage for concurrent operations 5. **Clear Error Hierarchy**: Custom exceptions provide actionable error context @@ -1055,7 +1054,7 @@ Cross-endpoint routing provides: - **Transparency**: Functions route automatically without manual HTTP calls - **Flexibility**: Manifest-based routing enables environment-specific configurations -- **Resilience**: Graceful fallback to local execution if directory unavailable +- **Resilience**: Graceful fallback to local execution if manifest service unavailable - **Simplicity**: No changes to function code or signatures - **Debuggability**: Clear error messages and logging for troubleshooting diff --git a/docs/Load_Balancer_Endpoints.md b/docs/Load_Balancer_Endpoints.md index ea551884..62db7c7a 100644 --- a/docs/Load_Balancer_Endpoints.md +++ b/docs/Load_Balancer_Endpoints.md @@ -35,9 +35,9 @@ Load-balanced endpoints require different provisioning and health check logic th ### Why This Matters -The Mothership needs to serve as a directory server for child endpoints. This requires: +The Mothership needs to serve as a manifest server for child endpoints. This requires: - HTTP-based service discovery (not queue-based) -- Ability to expose custom endpoints (`/directory`, `/ping`) +- Ability to expose custom endpoints (`/manifest`, `/ping`) - Health checking to verify children are ready before routing traffic ## Architecture @@ -401,6 +401,6 @@ endpoint = LoadBalancerSlsResource( ## Next Steps - **Mothership integration**: Use LoadBalancerSlsResource for Mothership endpoints -- **Service discovery**: Implement `/directory` endpoint for child endpoint discovery +- **Service discovery**: Implement `/manifest` endpoint for child endpoint discovery - **Auto-provisioning**: Automatic child endpoint deployment on Mothership startup - **Cross-endpoint routing**: Route requests between endpoints using service discovery diff --git a/src/tetra_rp/runtime/config.py b/src/tetra_rp/runtime/config.py index c0efc11f..974bb5d5 100644 --- a/src/tetra_rp/runtime/config.py +++ b/src/tetra_rp/runtime/config.py @@ -5,7 +5,7 @@ DEFAULT_MAX_RETRIES = 3 DEFAULT_BACKOFF_BASE = 2 -# Directory cache configuration +# Manifest cache configuration DEFAULT_CACHE_TTL = 300 # seconds # Serialization limits diff --git a/src/tetra_rp/runtime/exceptions.py b/src/tetra_rp/runtime/exceptions.py index fec800fd..e072a6ea 100644 --- a/src/tetra_rp/runtime/exceptions.py +++ b/src/tetra_rp/runtime/exceptions.py @@ -26,6 +26,6 @@ class ManifestError(FlashRuntimeError): class ManifestServiceUnavailableError(FlashRuntimeError): - """Raised when manifest directory service is unavailable.""" + """Raised when manifest service is unavailable.""" pass diff --git a/src/tetra_rp/runtime/manifest_client.py b/src/tetra_rp/runtime/manifest_client.py index bfe69ca8..db845a63 100644 --- a/src/tetra_rp/runtime/manifest_client.py +++ b/src/tetra_rp/runtime/manifest_client.py @@ -1,4 +1,4 @@ -"""HTTP client for mothership manifest directory API.""" +"""HTTP client for mothership manifest API.""" import asyncio import logging @@ -17,13 +17,12 @@ class ManifestClient: - """HTTP client for querying mothership manifest directory service. + """HTTP client for querying mothership manifest service. - Fetches the endpoint registry that maps resource_config names to their - deployment URLs. This is the "manifest directory service" - an endpoint - registry showing where resources are deployed. + Fetches the manifest (endpoint registry) that maps resource_config names to + their deployment URLs. The manifest provides service discovery for remote + resource endpoints. - The directory maps resource_config names to their endpoint URLs. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} """ @@ -55,15 +54,15 @@ def __init__( self.max_retries = max_retries self._client: Optional[httpx.AsyncClient] = None - async def get_directory(self) -> Dict[str, str]: - """Fetch endpoint directory from mothership. + async def get_manifest(self) -> Dict[str, str]: + """Fetch endpoint manifest from mothership. Returns: Dictionary mapping resource_config_name → endpoint_url. Example: {"gpu_config": "https://api.runpod.io/v2/abc123"} Raises: - ManifestServiceUnavailableError: If manifest directory service unavailable after retries. + ManifestServiceUnavailableError: If manifest service unavailable after retries. """ if httpx is None: raise ImportError( @@ -76,25 +75,25 @@ async def get_directory(self) -> Dict[str, str]: try: client = await self._get_client() response = await client.get( - f"{self.mothership_url}/directory", + f"{self.mothership_url}/manifest", timeout=self.timeout, ) if response.status_code >= 400: raise ManifestServiceUnavailableError( - f"Directory API returned {response.status_code}: " + f"Manifest API returned {response.status_code}: " f"{response.text[:200]}" ) data = response.json() - if "directory" not in data: + if "manifest" not in data: raise ManifestServiceUnavailableError( - "Invalid directory response: missing 'directory' key" + "Invalid manifest response: missing 'manifest' key" ) - directory = data["directory"] - logger.debug(f"Directory loaded: {len(directory)} endpoints") - return directory + manifest = data["manifest"] + logger.debug(f"Manifest loaded: {len(manifest)} endpoints") + return manifest except ( asyncio.TimeoutError, @@ -112,7 +111,7 @@ async def get_directory(self) -> Dict[str, str]: continue raise ManifestServiceUnavailableError( - f"Failed to fetch manifest directory after {self.max_retries} attempts: {last_exception}" + f"Failed to fetch manifest after {self.max_retries} attempts: {last_exception}" ) async def _get_client(self) -> httpx.AsyncClient: diff --git a/src/tetra_rp/runtime/production_wrapper.py b/src/tetra_rp/runtime/production_wrapper.py index 65ce815d..22a48f9e 100644 --- a/src/tetra_rp/runtime/production_wrapper.py +++ b/src/tetra_rp/runtime/production_wrapper.py @@ -26,7 +26,6 @@ def __init__(self, service_registry: ServiceRegistry): service_registry: Service registry for routing decisions. """ self.service_registry = service_registry - self._directory_loaded = False async def wrap_function_execution( self, @@ -57,8 +56,8 @@ async def wrap_function_execution( """ function_name = func.__name__ - # Ensure directory is loaded - await self.service_registry._ensure_directory_loaded() + # Ensure manifest is loaded + await self.service_registry._ensure_manifest_loaded() # Determine routing try: @@ -116,8 +115,8 @@ async def wrap_class_method_execution( Raises: Exception: If execution fails. """ - # Ensure directory is loaded - await self.service_registry._ensure_directory_loaded() + # Ensure manifest is loaded + await self.service_registry._ensure_manifest_loaded() class_name = getattr(request, "class_name", None) diff --git a/src/tetra_rp/runtime/service_registry.py b/src/tetra_rp/runtime/service_registry.py index ddcbcd84..2a2fb865 100644 --- a/src/tetra_rp/runtime/service_registry.py +++ b/src/tetra_rp/runtime/service_registry.py @@ -22,14 +22,14 @@ class ServiceRegistry: """Service discovery and routing for cross-endpoint function calls. Loads manifest to map functions to resource configs, queries mothership - directory for endpoint URLs, and determines if function calls are local + manifest for endpoint URLs, and determines if function calls are local or remote. """ def __init__( self, manifest_path: Optional[Path] = None, - directory_client: Optional[ManifestClient] = None, + manifest_client: Optional[ManifestClient] = None, cache_ttl: int = DEFAULT_CACHE_TTL, ): """Initialize service registry. @@ -37,17 +37,17 @@ def __init__( Args: manifest_path: Path to flash_manifest.json. Defaults to FLASH_MANIFEST_PATH env var or auto-detection. - directory_client: Manifest service client for mothership API. If None, creates one + manifest_client: Manifest service client for mothership API. If None, creates one from FLASH_MOTHERSHIP_URL env var. - cache_ttl: Directory cache lifetime in seconds (default: 300). + cache_ttl: Manifest cache lifetime in seconds (default: 300). Raises: FileNotFoundError: If manifest_path doesn't exist. - ValueError: If required env vars missing for directory_client. + ValueError: If required env vars missing for manifest_client. """ self.cache_ttl = cache_ttl - self._directory: Dict[str, str] = {} - self._directory_loaded_at = 0.0 + self._endpoint_registry: Dict[str, str] = {} + self._endpoint_registry_loaded_at = 0.0 self._manifest: Manifest = Manifest( version="1.0", generated_at="", @@ -55,21 +55,23 @@ def __init__( function_registry={}, resources={}, ) - self._directory_lock = asyncio.Lock() + self._endpoint_registry_lock = asyncio.Lock() # Load manifest self._load_manifest(manifest_path) # Initialize manifest client - if directory_client is None: + if manifest_client is None: mothership_url = os.getenv("FLASH_MOTHERSHIP_URL") if mothership_url: - directory_client = ManifestClient(mothership_url=mothership_url) + manifest_client = ManifestClient(mothership_url=mothership_url) else: - logger.warning("FLASH_MOTHERSHIP_URL not set, directory unavailable") - directory_client = None + logger.warning( + "FLASH_MOTHERSHIP_URL not set, manifest service unavailable" + ) + manifest_client = None - self._directory_client = directory_client + self._manifest_client = manifest_client self._current_endpoint = os.getenv("RUNPOD_ENDPOINT_ID") def _load_manifest(self, manifest_path: Optional[Path]) -> None: @@ -127,30 +129,30 @@ def _load_manifest(self, manifest_path: Optional[Path]) -> None: resources={}, ) - async def _ensure_directory_loaded(self) -> None: - """Load directory from mothership if cache expired or not loaded.""" - async with self._directory_lock: + async def _ensure_manifest_loaded(self) -> None: + """Load manifest from mothership if cache expired or not loaded.""" + async with self._endpoint_registry_lock: now = time.time() - cache_age = now - self._directory_loaded_at + cache_age = now - self._endpoint_registry_loaded_at if cache_age > self.cache_ttl: - if self._directory_client is None: - logger.debug("Directory client not available, skipping refresh") + if self._manifest_client is None: + logger.debug("Manifest client not available, skipping refresh") return try: - self._directory = await self._directory_client.get_directory() - self._directory_loaded_at = now + self._endpoint_registry = await self._manifest_client.get_manifest() + self._endpoint_registry_loaded_at = now logger.debug( - f"Directory loaded: {len(self._directory)} endpoints, " + f"Manifest loaded: {len(self._endpoint_registry)} endpoints, " f"cache TTL {self.cache_ttl}s" ) except ManifestServiceUnavailableError as e: logger.warning( - f"Failed to load manifest directory: {e}. " + f"Failed to load manifest: {e}. " f"Cross-endpoint routing unavailable." ) - self._directory = {} + self._endpoint_registry = {} def get_endpoint_for_function(self, function_name: str) -> Optional[str]: """Get endpoint URL for a function. @@ -181,12 +183,12 @@ def get_endpoint_for_function(self, function_name: str) -> Optional[str]: if resource_config_name == self._current_endpoint: return None - # Check directory for remote endpoint URL - endpoint_url = self._directory.get(resource_config_name) + # Check manifest for remote endpoint URL + endpoint_url = self._endpoint_registry.get(resource_config_name) if not endpoint_url: logger.debug( - f"Endpoint URL for '{resource_config_name}' not in directory. " - f"Directory has: {list(self._directory.keys())}" + f"Endpoint URL for '{resource_config_name}' not in manifest. " + f"Manifest has: {list(self._endpoint_registry.keys())}" ) return endpoint_url @@ -260,9 +262,9 @@ def get_current_endpoint_id(self) -> Optional[str]: """ return self._current_endpoint - def refresh_directory(self) -> None: - """Force refresh directory from mothership on next access.""" - self._directory_loaded_at = 0 + def refresh_manifest(self) -> None: + """Force refresh manifest from mothership on next access.""" + self._endpoint_registry_loaded_at = 0 def get_manifest(self) -> Manifest: """Get loaded manifest. diff --git a/tests/integration/test_cross_endpoint_routing.py b/tests/integration/test_cross_endpoint_routing.py index 1b67967e..aab993d1 100644 --- a/tests/integration/test_cross_endpoint_routing.py +++ b/tests/integration/test_cross_endpoint_routing.py @@ -74,7 +74,7 @@ async def test_local_function_execution(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -88,12 +88,12 @@ async def test_local_function_execution(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry - registry._directory_client = mock_dir_client - registry._directory = directory - registry._directory_loaded_at = float("inf") + registry._manifest_client = mock_manifest_client + registry._endpoint_registry = endpoint_registry + registry._endpoint_registry_loaded_at = float("inf") wrapper = ProductionWrapper(registry) @@ -128,7 +128,7 @@ async def test_remote_function_execution_routing(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -141,11 +141,11 @@ async def test_remote_function_execution_routing(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory - registry._directory_client = mock_dir_client - registry._directory = directory - registry._directory_loaded_at = float("inf") + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry + registry._manifest_client = mock_manifest_client + registry._endpoint_registry = endpoint_registry + registry._endpoint_registry_loaded_at = float("inf") # Mock ServerlessResource mock_resource = AsyncMock() @@ -183,8 +183,8 @@ async def cpu_task(x): manifest_path.unlink() @pytest.mark.asyncio - async def test_directory_loading_on_demand(self, manifest): - """Test that directory is loaded on-demand before routing decision.""" + async def test_manifest_loading_on_demand(self, manifest): + """Test that manifest is loaded on-demand before routing decision.""" with patch.dict( "os.environ", { @@ -192,7 +192,7 @@ async def test_directory_loading_on_demand(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -205,11 +205,11 @@ async def test_directory_loading_on_demand(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory - registry._directory_client = mock_dir_client + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry + registry._manifest_client = mock_manifest_client - assert registry._directory == {} + assert registry._endpoint_registry == {} wrapper = ProductionWrapper(registry) @@ -230,8 +230,11 @@ async def cpu_task(x): original_stub, cpu_task, None, None, True ) - assert len(registry._directory) > 0 - assert registry._directory["gpu_config"] == "https://gpu.example.com" + assert len(registry._endpoint_registry) > 0 + assert ( + registry._endpoint_registry["gpu_config"] + == "https://gpu.example.com" + ) finally: manifest_path.unlink() @@ -246,7 +249,7 @@ async def test_error_handling_in_remote_execution(self, manifest): "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", }, ): - directory = { + endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } @@ -259,11 +262,11 @@ async def test_error_handling_in_remote_execution(self, manifest): try: registry = ServiceRegistry(manifest_path=manifest_path) - mock_dir_client = AsyncMock(spec=ManifestClient) - mock_dir_client.get_directory.return_value = directory - registry._directory_client = mock_dir_client - registry._directory = directory - registry._directory_loaded_at = float("inf") + mock_manifest_client = AsyncMock(spec=ManifestClient) + mock_manifest_client.get_manifest.return_value = endpoint_registry + registry._manifest_client = mock_manifest_client + registry._endpoint_registry = endpoint_registry + registry._endpoint_registry_loaded_at = float("inf") # Mock ServerlessResource that returns error mock_resource = AsyncMock() diff --git a/tests/unit/runtime/test_manifest_client.py b/tests/unit/runtime/test_manifest_client.py index 27bb12cc..be48a38c 100644 --- a/tests/unit/runtime/test_manifest_client.py +++ b/tests/unit/runtime/test_manifest_client.py @@ -21,7 +21,7 @@ def mock_response(self): response = MagicMock() response.status_code = 200 response.json.return_value = { - "directory": { + "manifest": { "gpu_config": "https://api.runpod.io/v2/gpu123", "cpu_config": "https://api.runpod.io/v2/cpu456", }, @@ -53,8 +53,8 @@ def test_init_explicit_over_env(self): assert client.mothership_url == "https://explicit.com" @pytest.mark.asyncio - async def test_get_directory_success(self, mock_response): - """Test successful directory fetch.""" + async def test_get_manifest_success(self, mock_response): + """Test successful manifest fetch.""" client = ManifestClient(mothership_url="https://mothership.example.com") with patch("tetra_rp.runtime.manifest_client.httpx"): @@ -63,15 +63,15 @@ async def test_get_directory_success(self, mock_response): mock_client.get.return_value = mock_response with patch.object(client, "_get_client", return_value=mock_client): - directory = await client.get_directory() + manifest = await client.get_manifest() - assert directory == { + assert manifest == { "gpu_config": "https://api.runpod.io/v2/gpu123", "cpu_config": "https://api.runpod.io/v2/cpu456", } @pytest.mark.asyncio - async def test_get_directory_http_error(self): + async def test_get_manifest_http_error(self): """Test handling of HTTP errors.""" client = ManifestClient(mothership_url="https://mothership.example.com") @@ -86,10 +86,10 @@ async def test_get_directory_http_error(self): mock_get_client.return_value = mock_http_client with pytest.raises(ManifestServiceUnavailableError, match="500"): - await client.get_directory() + await client.get_manifest() @pytest.mark.asyncio - async def test_get_directory_timeout(self): + async def test_get_manifest_timeout(self): """Test handling of request timeout.""" client = ManifestClient( mothership_url="https://mothership.example.com", timeout=0.1 @@ -104,10 +104,10 @@ async def test_get_directory_timeout(self): with pytest.raises( ManifestServiceUnavailableError, match="after \\d+ attempts" ): - await client.get_directory() + await client.get_manifest() @pytest.mark.asyncio - async def test_get_directory_retry(self): + async def test_get_manifest_retry(self): """Test retry logic on transient failure.""" client = ManifestClient( mothership_url="https://mothership.example.com", max_retries=3 @@ -115,7 +115,7 @@ async def test_get_directory_retry(self): response = MagicMock() response.status_code = 200 - response.json.return_value = {"directory": {"gpu": "https://gpu.example.com"}} + response.json.return_value = {"manifest": {"gpu": "https://gpu.example.com"}} with patch.object(client, "_get_client") as mock_get_client: mock_http_client = AsyncMock() @@ -133,12 +133,12 @@ async def test_get_directory_retry(self): "tetra_rp.runtime.manifest_client.asyncio.sleep", new_callable=AsyncMock, ): - directory = await client.get_directory() - assert directory == {"gpu": "https://gpu.example.com"} + manifest = await client.get_manifest() + assert manifest == {"gpu": "https://gpu.example.com"} assert mock_http_client.get.call_count == 3 @pytest.mark.asyncio - async def test_get_directory_exhaust_retries(self): + async def test_get_manifest_exhaust_retries(self): """Test failure after exhausting retries.""" client = ManifestClient( mothership_url="https://mothership.example.com", max_retries=2 @@ -157,7 +157,7 @@ async def test_get_directory_exhaust_retries(self): with pytest.raises( ManifestServiceUnavailableError, match="after 2 attempts" ): - await client.get_directory() + await client.get_manifest() @pytest.mark.asyncio async def test_context_manager(self): diff --git a/tests/unit/runtime/test_production_wrapper.py b/tests/unit/runtime/test_production_wrapper.py index cc628047..bda5c31d 100644 --- a/tests/unit/runtime/test_production_wrapper.py +++ b/tests/unit/runtime/test_production_wrapper.py @@ -19,7 +19,7 @@ class TestProductionWrapper: def mock_registry(self): """Mock service registry.""" registry = AsyncMock(spec=ServiceRegistry) - registry._ensure_directory_loaded = AsyncMock() + registry._ensure_manifest_loaded = AsyncMock() return registry @pytest.fixture @@ -135,8 +135,8 @@ async def test_wrap_function_remote_error( ) @pytest.mark.asyncio - async def test_wrap_function_loads_directory(self, wrapper, mock_registry): - """Test that directory is loaded before routing decision.""" + async def test_wrap_function_loads_manifest(self, wrapper, mock_registry): + """Test that manifest is loaded before routing decision.""" mock_registry.get_resource_for_function.return_value = None async def sample_func(): @@ -147,8 +147,8 @@ async def sample_func(): original_stub, sample_func, None, None, True ) - # Should ensure directory is loaded - mock_registry._ensure_directory_loaded.assert_called_once() + # Should ensure manifest is loaded + mock_registry._ensure_manifest_loaded.assert_called_once() @pytest.mark.asyncio async def test_wrap_class_method_local(self, wrapper, mock_registry, original_stub): diff --git a/tests/unit/runtime/test_service_registry.py b/tests/unit/runtime/test_service_registry.py index 8dc88aa1..c7c83aaf 100644 --- a/tests/unit/runtime/test_service_registry.py +++ b/tests/unit/runtime/test_service_registry.py @@ -103,19 +103,19 @@ def test_is_local_function_local(self, manifest_file): assert registry.is_local_function("inference") is True def test_is_local_function_remote(self, manifest_file): - """Test determining remote function (with directory loaded).""" + """Test determining remote function (with manifest loaded).""" with patch.dict(os.environ, {"RUNPOD_ENDPOINT_ID": "gpu_config"}): mock_client = AsyncMock() - mock_client.get_directory.return_value = { + mock_client.get_manifest.return_value = { "cpu_config": "https://cpu.example.com" } registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client + manifest_path=manifest_file, manifest_client=mock_client ) - # After directory is loaded, CPU tasks should be recognized as remote + # After manifest is loaded, CPU tasks should be recognized as remote # (but is_local_function doesn't async load, so returns True for now) - # This is actually expected behavior - sync method can't load async directory + # This is actually expected behavior - sync method can't load async manifest assert registry.is_local_function("preprocess") is True def test_is_local_function_not_in_manifest(self, manifest_file): @@ -131,11 +131,11 @@ def test_get_endpoint_for_function_local(self, manifest_file): endpoint = registry.get_endpoint_for_function("gpu_task") assert endpoint is None # Local returns None - def test_get_endpoint_for_function_remote_no_directory(self, manifest_file): - """Test getting endpoint for remote function without directory.""" + def test_get_endpoint_for_function_remote_no_manifest(self, manifest_file): + """Test getting endpoint for remote function without manifest.""" with patch.dict(os.environ, {"RUNPOD_ENDPOINT_ID": "gpu_config"}): registry = ServiceRegistry(manifest_path=manifest_file) - # CPU function is remote, but no directory loaded + # CPU function is remote, but no manifest loaded endpoint = registry.get_endpoint_for_function("preprocess") assert endpoint is None @@ -157,15 +157,17 @@ def test_get_resource_for_function_remote(self, manifest_file): """Test getting ServerlessResource for remote function.""" with patch.dict(os.environ, {"RUNPOD_ENDPOINT_ID": "gpu_config"}): mock_client = AsyncMock() - mock_client.get_directory.return_value = { + mock_client.get_manifest.return_value = { "cpu_config": "https://api.runpod.io/v2/abc123" } registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client + manifest_path=manifest_file, manifest_client=mock_client ) - # Manually set directory to simulate loaded state - registry._directory = {"cpu_config": "https://api.runpod.io/v2/abc123"} + # Manually set endpoint registry to simulate loaded state + registry._endpoint_registry = { + "cpu_config": "https://api.runpod.io/v2/abc123" + } resource = registry.get_resource_for_function("preprocess") @@ -182,77 +184,77 @@ def test_get_resource_for_function_not_in_manifest(self, manifest_file): registry.get_resource_for_function("unknown_function") @pytest.mark.asyncio - async def test_ensure_directory_loaded(self, manifest_file): - """Test lazy loading of directory from client.""" - mock_directory = { + async def test_ensure_manifest_loaded(self, manifest_file): + """Test lazy loading of manifest from client.""" + mock_endpoint_registry = { "gpu_config": "https://gpu.example.com", "cpu_config": "https://cpu.example.com", } mock_client = AsyncMock() - mock_client.get_directory.return_value = mock_directory + mock_client.get_manifest.return_value = mock_endpoint_registry registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client, cache_ttl=10 + manifest_path=manifest_file, manifest_client=mock_client, cache_ttl=10 ) - # Directory not loaded yet - assert registry._directory == {} + # Endpoint registry not loaded yet + assert registry._endpoint_registry == {} - # Load directory - await registry._ensure_directory_loaded() + # Load manifest + await registry._ensure_manifest_loaded() - # Should now have loaded directory - assert registry._directory == mock_directory - mock_client.get_directory.assert_called_once() + # Should now have loaded endpoint registry + assert registry._endpoint_registry == mock_endpoint_registry + mock_client.get_manifest.assert_called_once() @pytest.mark.asyncio - async def test_ensure_directory_cache_respects_ttl(self, manifest_file): - """Test that directory cache respects TTL.""" - mock_directory = {"gpu_config": "https://gpu.example.com"} + async def test_ensure_manifest_cache_respects_ttl(self, manifest_file): + """Test that manifest cache respects TTL.""" + mock_endpoint_registry = {"gpu_config": "https://gpu.example.com"} mock_client = AsyncMock() - mock_client.get_directory.return_value = mock_directory + mock_client.get_manifest.return_value = mock_endpoint_registry registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client, cache_ttl=1 + manifest_path=manifest_file, manifest_client=mock_client, cache_ttl=1 ) - # Load directory - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 1 + # Load manifest + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 1 # Immediate reload should use cache - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 1 + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 1 # After TTL, should reload - registry._directory_loaded_at = time.time() - 2 # 2 seconds ago - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 2 + registry._endpoint_registry_loaded_at = time.time() - 2 # 2 seconds ago + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 2 @pytest.mark.asyncio - async def test_refresh_directory(self, manifest_file): - """Test forcing directory refresh.""" - mock_directory = {"gpu_config": "https://gpu.example.com"} + async def test_refresh_manifest(self, manifest_file): + """Test forcing manifest refresh.""" + mock_endpoint_registry = {"gpu_config": "https://gpu.example.com"} mock_client = AsyncMock() - mock_client.get_directory.return_value = mock_directory + mock_client.get_manifest.return_value = mock_endpoint_registry registry = ServiceRegistry( - manifest_path=manifest_file, directory_client=mock_client, cache_ttl=3600 + manifest_path=manifest_file, manifest_client=mock_client, cache_ttl=3600 ) - # Load directory - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 1 + # Load manifest + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 1 # Force refresh - registry.refresh_directory() + registry.refresh_manifest() # Next load should fetch again - await registry._ensure_directory_loaded() - assert mock_client.get_directory.call_count == 2 + await registry._ensure_manifest_loaded() + assert mock_client.get_manifest.call_count == 2 def test_get_manifest(self, manifest_file): """Test getting manifest.""" @@ -282,16 +284,16 @@ def test_get_resource_functions_not_found(self, manifest_file): functions = registry.get_resource_functions("nonexistent") assert functions == [] - def test_init_no_directory_client_no_mothership_url(self, manifest_file): - """Test initialization without directory client or URL.""" + def test_init_no_manifest_client_no_mothership_url(self, manifest_file): + """Test initialization without manifest client or URL.""" with patch.dict(os.environ, {}, clear=True): registry = ServiceRegistry(manifest_path=manifest_file) - assert registry._directory_client is None + assert registry._manifest_client is None @pytest.mark.asyncio - async def test_ensure_directory_loaded_unavailable_client(self, manifest_file): - """Test directory loading when client is None.""" - registry = ServiceRegistry(manifest_path=manifest_file, directory_client=None) + async def test_ensure_manifest_loaded_unavailable_client(self, manifest_file): + """Test manifest loading when client is None.""" + registry = ServiceRegistry(manifest_path=manifest_file, manifest_client=None) # Should not fail, just log warning - await registry._ensure_directory_loaded() - assert registry._directory == {} + await registry._ensure_manifest_loaded() + assert registry._endpoint_registry == {} From 9bed355b55391d5188b541e935e691134daf9132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 12 Jan 2026 13:24:41 -0800 Subject: [PATCH 09/12] fix: align GET /manifest response format to Deployment_Architecture spec Remove {"manifest": ...} wrapper and return manifest directly per spec (Deployment_Architecture.md:235-273). Update ManifestClient parser to expect manifest directly without unwrap logic. Changes: - Remove wrapper from GET /manifest endpoint (lb_handler.py:215) - Update ManifestClient to validate manifest has "resources" key directly - Replace global _manifest_fetcher with @lru_cache(maxsize=1) for thread safety - Update all test assertions to expect unwrapped manifest format All 636 tests pass, coverage: 66.48% --- src/tetra_rp/runtime/lb_handler.py | 17 +++--- src/tetra_rp/runtime/manifest_client.py | 11 ++-- tests/integration/test_lb_remote_execution.py | 57 +++++++++++++++++++ tests/unit/runtime/test_lb_handler.py | 32 +++++------ tests/unit/runtime/test_manifest_client.py | 37 +++++++++--- 5 files changed, 115 insertions(+), 39 deletions(-) diff --git a/src/tetra_rp/runtime/lb_handler.py b/src/tetra_rp/runtime/lb_handler.py index 495261d2..3647aa52 100644 --- a/src/tetra_rp/runtime/lb_handler.py +++ b/src/tetra_rp/runtime/lb_handler.py @@ -23,7 +23,8 @@ import inspect import logging import os -from typing import Any, Callable, Dict, Optional +from functools import lru_cache +from typing import Any, Callable, Dict from fastapi import FastAPI, Request from fastapi.responses import JSONResponse @@ -37,16 +38,14 @@ logger = logging.getLogger(__name__) -# Module-level manifest fetcher (singleton, reused across requests) -_manifest_fetcher: Optional[ManifestFetcher] = None - +@lru_cache(maxsize=1) def _get_manifest_fetcher() -> ManifestFetcher: - """Get or create the manifest fetcher singleton.""" - global _manifest_fetcher - if _manifest_fetcher is None: - _manifest_fetcher = ManifestFetcher() - return _manifest_fetcher + """Get or create the manifest fetcher singleton. + + Uses @lru_cache for thread-safe lazy initialization. + """ + return ManifestFetcher() def create_lb_handler( diff --git a/src/tetra_rp/runtime/manifest_client.py b/src/tetra_rp/runtime/manifest_client.py index db845a63..eb234cbd 100644 --- a/src/tetra_rp/runtime/manifest_client.py +++ b/src/tetra_rp/runtime/manifest_client.py @@ -85,14 +85,15 @@ async def get_manifest(self) -> Dict[str, str]: f"{response.text[:200]}" ) - data = response.json() - if "manifest" not in data: + manifest = response.json() + if not isinstance(manifest, dict) or "resources" not in manifest: raise ManifestServiceUnavailableError( - "Invalid manifest response: missing 'manifest' key" + "Invalid manifest response: missing 'resources'" ) - manifest = data["manifest"] - logger.debug(f"Manifest loaded: {len(manifest)} endpoints") + logger.debug( + f"Manifest loaded: {len(manifest.get('resources', {}))} resources" + ) return manifest except ( diff --git a/tests/integration/test_lb_remote_execution.py b/tests/integration/test_lb_remote_execution.py index 8c45022f..d1413a93 100644 --- a/tests/integration/test_lb_remote_execution.py +++ b/tests/integration/test_lb_remote_execution.py @@ -393,8 +393,10 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): """Test manifest endpoint with LoadBalancerSlsResource.""" from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient + from tetra_rp.runtime.lb_handler import _get_manifest_fetcher monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + _get_manifest_fetcher.cache_clear() # Create test manifest for deployed endpoint test_manifest = { @@ -436,12 +438,16 @@ def test_manifest_endpoint_with_deployed_lb_resource(self, monkeypatch): assert response.status_code == 200 assert response.json() == test_manifest + _get_manifest_fetcher.cache_clear() + def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): """Test that /manifest endpoint coexists with /ping health check.""" from unittest.mock import patch, AsyncMock from fastapi.testclient import TestClient + from tetra_rp.runtime.lb_handler import _get_manifest_fetcher monkeypatch.setenv("FLASH_IS_MOTHERSHIP", "true") + _get_manifest_fetcher.cache_clear() test_manifest = { "version": "1.0", @@ -465,3 +471,54 @@ def test_manifest_endpoint_coexists_with_ping(self, monkeypatch): ping_response = client.get("/ping") assert ping_response.status_code == 404 # Ping not auto-added by factory + + _get_manifest_fetcher.cache_clear() + + +class TestManifestClientToEndpointIntegration: + """Integration tests for ManifestClient calling GET /manifest endpoint.""" + + def test_manifest_client_can_parse_response(self): + """Test ManifestClient can parse manifest response directly.""" + import asyncio + from unittest.mock import patch, AsyncMock, MagicMock + from tetra_rp.runtime.manifest_client import ManifestClient + + # Create a manifest to simulate + test_manifest = { + "version": "1.0", + "generated_at": "2024-01-15T10:30:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": { + "resource_type": "LoadBalancerSlsResource", + "handler_file": "handler_gpu.py", + "endpoint_url": "https://api.runpod.io/v2/gpu123", + } + }, + "function_registry": {"process_gpu": "gpu_config"}, + } + + async def test_client_parsing(): + # Create a mock httpx client that returns the manifest directly + mock_http_client = AsyncMock() + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = test_manifest + mock_http_client.get = AsyncMock(return_value=mock_response) + + # Create ManifestClient + client = ManifestClient(mothership_url="http://localhost:8000") + + # Mock the _get_client to return our mock + with patch.object(client, "_get_client", return_value=mock_http_client): + # Call get_manifest - should parse the response + result = await client.get_manifest() + + # Verify it successfully parsed the manifest + assert result == test_manifest + assert "gpu_config" in result["resources"] + assert result["function_registry"]["process_gpu"] == "gpu_config" + + # Run the async test + asyncio.run(test_client_parsing()) diff --git a/tests/unit/runtime/test_lb_handler.py b/tests/unit/runtime/test_lb_handler.py index 966a2ab6..1da78a4f 100644 --- a/tests/unit/runtime/test_lb_handler.py +++ b/tests/unit/runtime/test_lb_handler.py @@ -5,7 +5,7 @@ import pytest from fastapi.testclient import TestClient -from tetra_rp.runtime.lb_handler import create_lb_handler +from tetra_rp.runtime.lb_handler import create_lb_handler, _get_manifest_fetcher class TestManifestEndpoint: @@ -13,12 +13,10 @@ class TestManifestEndpoint: @pytest.fixture(autouse=True) def reset_manifest_fetcher(self): - """Reset the global manifest fetcher before each test.""" - import tetra_rp.runtime.lb_handler as lb_handler_module - - lb_handler_module._manifest_fetcher = None + """Reset the manifest fetcher cache before each test.""" + _get_manifest_fetcher.cache_clear() yield - lb_handler_module._manifest_fetcher = None + _get_manifest_fetcher.cache_clear() @pytest.fixture def sample_manifest(self): @@ -172,14 +170,14 @@ def test_manifest_endpoint_response_structure(self, sample_manifest, monkeypatch client = TestClient(app) response = client.get("/manifest") - data = response.json() + manifest = response.json() - # Verify structure - assert "version" in data - assert "generated_at" in data - assert "project_name" in data - assert "resources" in data - assert "function_registry" in data + # Verify manifest structure + assert "version" in manifest + assert "generated_at" in manifest + assert "project_name" in manifest + assert "resources" in manifest + assert "function_registry" in manifest def test_manifest_endpoint_with_empty_resources(self, monkeypatch): """Test endpoint behavior when manifest has no resources.""" @@ -339,10 +337,10 @@ def test_manifest_endpoint_with_complex_manifest(self, monkeypatch): response = client.get("/manifest") assert response.status_code == 200 - data = response.json() - assert len(data["resources"]) == 2 - assert "gpu_config" in data["resources"] - assert "cpu_config" in data["resources"] + manifest = response.json() + assert len(manifest["resources"]) == 2 + assert "gpu_config" in manifest["resources"] + assert "cpu_config" in manifest["resources"] def test_manifest_endpoint_uses_fetcher_with_caching( self, sample_manifest, monkeypatch diff --git a/tests/unit/runtime/test_manifest_client.py b/tests/unit/runtime/test_manifest_client.py index be48a38c..0578613e 100644 --- a/tests/unit/runtime/test_manifest_client.py +++ b/tests/unit/runtime/test_manifest_client.py @@ -21,11 +21,14 @@ def mock_response(self): response = MagicMock() response.status_code = 200 response.json.return_value = { - "manifest": { - "gpu_config": "https://api.runpod.io/v2/gpu123", - "cpu_config": "https://api.runpod.io/v2/cpu456", + "version": "1.0", + "generated_at": "2025-01-03T12:00:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": {"endpoint_url": "https://api.runpod.io/v2/gpu123"}, + "cpu_config": {"endpoint_url": "https://api.runpod.io/v2/cpu456"}, }, - "updated_at": "2025-01-03T12:00:00Z", + "function_registry": {}, } return response @@ -66,8 +69,18 @@ async def test_get_manifest_success(self, mock_response): manifest = await client.get_manifest() assert manifest == { - "gpu_config": "https://api.runpod.io/v2/gpu123", - "cpu_config": "https://api.runpod.io/v2/cpu456", + "version": "1.0", + "generated_at": "2025-01-03T12:00:00Z", + "project_name": "test-app", + "resources": { + "gpu_config": { + "endpoint_url": "https://api.runpod.io/v2/gpu123" + }, + "cpu_config": { + "endpoint_url": "https://api.runpod.io/v2/cpu456" + }, + }, + "function_registry": {}, } @pytest.mark.asyncio @@ -115,7 +128,11 @@ async def test_get_manifest_retry(self): response = MagicMock() response.status_code = 200 - response.json.return_value = {"manifest": {"gpu": "https://gpu.example.com"}} + response.json.return_value = { + "version": "1.0", + "resources": {"gpu": {"endpoint_url": "https://gpu.example.com"}}, + "function_registry": {}, + } with patch.object(client, "_get_client") as mock_get_client: mock_http_client = AsyncMock() @@ -134,7 +151,11 @@ async def test_get_manifest_retry(self): new_callable=AsyncMock, ): manifest = await client.get_manifest() - assert manifest == {"gpu": "https://gpu.example.com"} + assert manifest == { + "version": "1.0", + "resources": {"gpu": {"endpoint_url": "https://gpu.example.com"}}, + "function_registry": {}, + } assert mock_http_client.get.call_count == 3 @pytest.mark.asyncio From f34f0469e856ca70365c70bcd9ad5e8e2b2eb80e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 14 Jan 2026 01:40:41 -0800 Subject: [PATCH 10/12] fix: make function_code and class_code optional for Flash deployments Removes validation that requires function_code and class_code to be present, allowing Flash deployment requests where code is pre-deployed in /app. Changes: - Remove function_code requirement for execution_type='function' - Remove class_code requirement for execution_type='class' - Add documentation explaining optional fields for Flash deployments This enables dual-mode runtime where the same handler serves both: - Live Serverless (with code in request) - Flash Deployed Apps (without code in request) --- src/tetra_rp/protos/remote_execution.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/tetra_rp/protos/remote_execution.py b/src/tetra_rp/protos/remote_execution.py index ab10bf11..a5c9f39c 100644 --- a/src/tetra_rp/protos/remote_execution.py +++ b/src/tetra_rp/protos/remote_execution.py @@ -84,26 +84,24 @@ class FunctionRequest(BaseModel): @model_validator(mode="after") def validate_execution_requirements(self) -> "FunctionRequest": - """Validate that required fields are provided based on execution_type""" + """Validate that required fields are provided based on execution_type. + + Note: function_code and class_code are optional to support Flash deployments + where code is pre-deployed and not sent with the request. + """ if self.execution_type == "function": if self.function_name is None: raise ValueError( 'function_name is required when execution_type is "function"' ) - if self.function_code is None: - raise ValueError( - 'function_code is required when execution_type is "function"' - ) + # function_code is optional - absent for Flash deployments elif self.execution_type == "class": if self.class_name is None: raise ValueError( 'class_name is required when execution_type is "class"' ) - if self.class_code is None: - raise ValueError( - 'class_code is required when execution_type is "class"' - ) + # class_code is optional - absent for Flash deployments return self From 041643e715302064895a43c5f0718139a46a0dc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 14 Jan 2026 15:30:34 -0800 Subject: [PATCH 11/12] fix: update environment variables after merge resolution - Replace FLASH_MOTHERSHIP_URL with FLASH_MOTHERSHIP_ID in integration tests - Update tests to use FLASH_RESOURCE_NAME (with RUNPOD_ENDPOINT_ID fallback) - Apply ruff formatting to service_registry.py - All quality checks passing (706 tests, 63.52% coverage) --- src/tetra_rp/runtime/service_registry.py | 4 +++- .../integration/test_cross_endpoint_routing.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/tetra_rp/runtime/service_registry.py b/src/tetra_rp/runtime/service_registry.py index e55176ee..870283bd 100644 --- a/src/tetra_rp/runtime/service_registry.py +++ b/src/tetra_rp/runtime/service_registry.py @@ -75,7 +75,9 @@ def __init__( logger.warning(f"Failed to initialize manifest client: {e}") manifest_client = None else: - logger.debug("FLASH_MOTHERSHIP_ID not set, manifest service unavailable") + logger.debug( + "FLASH_MOTHERSHIP_ID not set, manifest service unavailable" + ) manifest_client = None self._manifest_client = manifest_client diff --git a/tests/integration/test_cross_endpoint_routing.py b/tests/integration/test_cross_endpoint_routing.py index aab993d1..c17e56ab 100644 --- a/tests/integration/test_cross_endpoint_routing.py +++ b/tests/integration/test_cross_endpoint_routing.py @@ -70,8 +70,8 @@ async def test_local_function_execution(self, manifest): with patch.dict( "os.environ", { - "RUNPOD_ENDPOINT_ID": "gpu_config", - "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", + "FLASH_RESOURCE_NAME": "gpu_config", + "FLASH_MOTHERSHIP_ID": "mothership-test", }, ): endpoint_registry = { @@ -124,8 +124,8 @@ async def test_remote_function_execution_routing(self, manifest): with patch.dict( "os.environ", { - "RUNPOD_ENDPOINT_ID": "gpu_config", - "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", + "FLASH_RESOURCE_NAME": "gpu_config", + "FLASH_MOTHERSHIP_ID": "mothership-test", }, ): endpoint_registry = { @@ -188,8 +188,8 @@ async def test_manifest_loading_on_demand(self, manifest): with patch.dict( "os.environ", { - "RUNPOD_ENDPOINT_ID": "gpu_config", - "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", + "FLASH_RESOURCE_NAME": "gpu_config", + "FLASH_MOTHERSHIP_ID": "mothership-test", }, ): endpoint_registry = { @@ -245,8 +245,8 @@ async def test_error_handling_in_remote_execution(self, manifest): with patch.dict( "os.environ", { - "RUNPOD_ENDPOINT_ID": "gpu_config", - "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", + "FLASH_RESOURCE_NAME": "gpu_config", + "FLASH_MOTHERSHIP_ID": "mothership-test", }, ): endpoint_registry = { @@ -316,7 +316,7 @@ def test_factory_creates_complete_system(self): "os.environ", { "RUNPOD_ENDPOINT_ID": "resource1", - "FLASH_MOTHERSHIP_URL": "https://mothership.example.com", + "FLASH_MOTHERSHIP_ID": "mothership-test", }, ): wrapper = create_production_wrapper() From 436562f2caafbda9943f7824ef161b5c9d21c156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 14 Jan 2026 15:36:17 -0800 Subject: [PATCH 12/12] docs: align ServiceRegistry signature with implementation - Add missing manifest_client and cache_ttl parameters to __init__ docs - Document FLASH_RESOURCE_NAME and RUNPOD_ENDPOINT_ID env vars in docstring - Show _current_endpoint initialization logic - Match actual code implementation exactly --- docs/Cross_Endpoint_Routing.md | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index 44afa63f..0af5e7b1 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -453,12 +453,35 @@ Manages service discovery and manifest loading: class ServiceRegistry: """Service discovery and routing for cross-endpoint function calls.""" - def __init__(self, manifest_path: Optional[Path] = None): - """Initialize with manifest and optional manifest client.""" + def __init__( + self, + manifest_path: Optional[Path] = None, + manifest_client: Optional[ManifestClient] = None, + cache_ttl: int = DEFAULT_CACHE_TTL, + ): + """Initialize service registry. + + Args: + manifest_path: Path to flash_manifest.json. Defaults to + FLASH_MANIFEST_PATH env var or auto-detection. + manifest_client: Manifest service client for mothership API. If None, + creates one from FLASH_MOTHERSHIP_ID env var. + cache_ttl: Manifest cache lifetime in seconds (default: 300). + + Environment Variables (for local vs remote detection): + FLASH_RESOURCE_NAME: Resource config name for this endpoint (child endpoints). + Identifies which resource config this endpoint represents in the manifest. + RUNPOD_ENDPOINT_ID: Endpoint ID (used as fallback for mothership identification). + """ self._load_manifest(manifest_path) - self._manifest_client = ManifestClient(...) + self._manifest_client = manifest_client or ManifestClient() self._endpoint_registry = {} # Cached endpoint URLs self._endpoint_registry_lock = asyncio.Lock() + # Child endpoints use FLASH_RESOURCE_NAME to identify which resource they represent + # Mothership doesn't have FLASH_RESOURCE_NAME, so falls back to RUNPOD_ENDPOINT_ID + self._current_endpoint = os.getenv("FLASH_RESOURCE_NAME") or os.getenv( + "RUNPOD_ENDPOINT_ID" + ) def get_resource_for_function(self, func_name: str) -> Optional[ServerlessResource]: """Get resource config for function from manifest."""