Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
c9276a8
feat: add GET /manifest endpoint for mothership service discovery
deanq Jan 12, 2026
11fecb2
docs: convert ASCII diagrams to MermaidJS
deanq Jan 12, 2026
6dd03f1
feat: add ManifestFetcher for caching manifest from RunPod GraphQL
deanq Jan 12, 2026
42edb23
refactor: rename directory terminology to manifest throughout codebase
deanq Jan 12, 2026
4081da3
Merge branch 'main' into deanq/ae-1643-mothership-manifest-sync-n-cache
deanq Jan 12, 2026
9bc59b4
fix: align GET /manifest response format to Deployment_Architecture spec
deanq Jan 12, 2026
97e0e2d
Merge branch 'main' into deanq/ae-1643-mothership-manifest
deanq Jan 13, 2026
632fd9e
docs: convert ASCII diagrams to MermaidJS
deanq Jan 12, 2026
375f1ca
feat: add ManifestFetcher for caching manifest from RunPod GraphQL
deanq Jan 12, 2026
1da8ee2
refactor: rename directory terminology to manifest throughout codebase
deanq Jan 12, 2026
9bed355
fix: align GET /manifest response format to Deployment_Architecture spec
deanq Jan 12, 2026
700cc7c
Merge branch 'deanq/ae-1643-mothership-manifest-sync-n-cache' of http…
deanq Jan 14, 2026
f34f046
fix: make function_code and class_code optional for Flash deployments
deanq Jan 14, 2026
c4313ac
Merge branch 'main' into deanq/ae-1643-mothership-manifest
deanq Jan 14, 2026
1f46663
Merge branch 'deanq/ae-1643-mothership-manifest' into deanq/ae-1643-m…
deanq Jan 14, 2026
cd5d0e6
Merge remote-tracking branch 'origin/main' into deanq/ae-1643-mothers…
deanq Jan 14, 2026
1091005
Merge branch 'main' into deanq/ae-1643-mothership-manifest-sync-n-cache
deanq Jan 14, 2026
041643e
fix: update environment variables after merge resolution
deanq Jan 14, 2026
436562f
docs: align ServiceRegistry signature with implementation
deanq Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
621 changes: 275 additions & 346 deletions docs/Cross_Endpoint_Routing.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/Load_Balancer_Endpoints.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ Load-balanced endpoints require different provisioning and health check logic th

### Why This Matters

The Mothership needs to serve as a directory server for child endpoints. This requires:
The Mothership needs to serve as a manifest server for child endpoints. This requires:
- HTTP-based service discovery (not queue-based)
- Ability to expose custom endpoints (`/directory`, `/ping`)
- Ability to expose custom endpoints (`/manifest`, `/ping`)
- Health checking to verify children are ready before routing traffic

## Architecture
Expand Down Expand Up @@ -401,6 +401,6 @@ endpoint = LoadBalancerSlsResource(
## Next Steps

- **Mothership integration**: Use LoadBalancerSlsResource for Mothership endpoints
- **Service discovery**: Implement `/directory` endpoint for child endpoint discovery
- **Service discovery**: Implement `/manifest` endpoint for child endpoint discovery
- **Auto-provisioning**: Automatic child endpoint deployment on Mothership startup
- **Cross-endpoint routing**: Route requests between endpoints using service discovery
16 changes: 7 additions & 9 deletions src/tetra_rp/protos/remote_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,26 +84,24 @@ class FunctionRequest(BaseModel):

@model_validator(mode="after")
def validate_execution_requirements(self) -> "FunctionRequest":
"""Validate that required fields are provided based on execution_type"""
"""Validate that required fields are provided based on execution_type.

Note: function_code and class_code are optional to support Flash deployments
where code is pre-deployed and not sent with the request.
"""
if self.execution_type == "function":
if self.function_name is None:
raise ValueError(
'function_name is required when execution_type is "function"'
)
if self.function_code is None:
raise ValueError(
'function_code is required when execution_type is "function"'
)
# function_code is optional - absent for Flash deployments

elif self.execution_type == "class":
if self.class_name is None:
raise ValueError(
'class_name is required when execution_type is "class"'
)
if self.class_code is None:
raise ValueError(
'class_code is required when execution_type is "class"'
)
# class_code is optional - absent for Flash deployments

return self

Expand Down
2 changes: 1 addition & 1 deletion src/tetra_rp/runtime/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
DEFAULT_MAX_RETRIES = 3
DEFAULT_BACKOFF_BASE = 2

# Directory cache configuration
# Manifest cache configuration
DEFAULT_CACHE_TTL = 300 # seconds

# Serialization limits
Expand Down
2 changes: 1 addition & 1 deletion src/tetra_rp/runtime/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@ class ManifestError(FlashRuntimeError):


class ManifestServiceUnavailableError(FlashRuntimeError):
"""Raised when manifest directory service is unavailable."""
"""Raised when manifest service is unavailable."""

pass
25 changes: 21 additions & 4 deletions src/tetra_rp/runtime/lb_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@
import inspect
import logging
import os
from functools import lru_cache
from typing import Any, Callable, Dict

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

from .generic_handler import load_manifest
from .manifest_fetcher import ManifestFetcher
from .serialization import (
deserialize_args,
deserialize_kwargs,
Expand All @@ -38,6 +39,15 @@
logger = logging.getLogger(__name__)


@lru_cache(maxsize=1)
def _get_manifest_fetcher() -> ManifestFetcher:
"""Get or create the manifest fetcher singleton.

Uses @lru_cache for thread-safe lazy initialization.
"""
return ManifestFetcher()


def create_lb_handler(
route_registry: Dict[tuple[str, str], Callable],
include_execute: bool = False,
Expand Down Expand Up @@ -181,20 +191,27 @@ async def execute_remote_function(request: Request) -> Dict[str, Any]:
async def get_manifest() -> JSONResponse:
"""Mothership discovery endpoint.

Returns the flash_manifest.json content for service discovery.
Fetches manifest from RunPod GraphQL API (source of truth), caches it
locally, and serves to child endpoints. Falls back to local file if
RunPod API is unavailable.

Only available when FLASH_IS_MOTHERSHIP=true environment variable is set.

Returns:
JSONResponse with manifest content or 404 if not found
"""
manifest_dict = load_manifest()
fetcher = _get_manifest_fetcher()
mothership_id = os.getenv("RUNPOD_ENDPOINT_ID")

# Fetch manifest (from cache, RunPod GQL, or local file)
manifest_dict = await fetcher.get_manifest(mothership_id)

if not manifest_dict or not manifest_dict.get("resources"):
return JSONResponse(
status_code=404,
content={
"error": "Manifest not found",
"detail": "flash_manifest.json could not be loaded",
"detail": "Could not load manifest from RunPod or local file",
},
)

Expand Down
35 changes: 18 additions & 17 deletions src/tetra_rp/runtime/manifest_client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""HTTP client for mothership /manifest endpoint API."""
"""HTTP client for mothership manifest API."""

import asyncio
import logging
Expand All @@ -17,13 +17,13 @@


class ManifestClient:
"""HTTP client for querying mothership /manifest endpoint.
"""HTTP client for querying mothership manifest service.

Fetches the endpoint registry from the mothership's /manifest endpoint,
which maps resource_config names to their deployment URLs.
Fetches the manifest (endpoint registry) that maps resource_config names to
their deployment URLs. The manifest provides service discovery for remote
resource endpoints.

The manifest maps resource_config names to their endpoint URLs.
Example: {"gpu_config": "https://gpu-worker.api.runpod.ai"}
Example: {"gpu_config": "https://api.runpod.io/v2/abc123"}
"""

def __init__(
Expand Down Expand Up @@ -58,15 +58,15 @@ def __init__(
self.max_retries = max_retries
self._client: Optional[httpx.AsyncClient] = None

async def get_directory(self) -> Dict[str, str]:
"""Fetch manifest from mothership /manifest endpoint.
async def get_manifest(self) -> Dict[str, str]:
"""Fetch endpoint manifest from mothership.

Returns:
Dictionary mapping resource_config_name → endpoint_url.
Example: {"gpu_config": "https://gpu-worker.api.runpod.ai"}

Raises:
ManifestServiceUnavailableError: If /manifest endpoint unavailable after retries.
ManifestServiceUnavailableError: If manifest service unavailable after retries.
"""
if httpx is None:
raise ImportError(
Expand All @@ -85,19 +85,20 @@ async def get_directory(self) -> Dict[str, str]:

if response.status_code >= 400:
raise ManifestServiceUnavailableError(
f"Directory API returned {response.status_code}: "
f"Manifest API returned {response.status_code}: "
f"{response.text[:200]}"
)

data = response.json()
if "directory" not in data:
manifest = response.json()
if not isinstance(manifest, dict) or "resources" not in manifest:
raise ManifestServiceUnavailableError(
"Invalid directory response: missing 'directory' key"
"Invalid manifest response: missing 'resources'"
)

directory = data["directory"]
logger.debug(f"Directory loaded: {len(directory)} endpoints")
return directory
logger.debug(
f"Manifest loaded: {len(manifest.get('resources', {}))} resources"
)
return manifest

except (
asyncio.TimeoutError,
Expand All @@ -115,7 +116,7 @@ async def get_directory(self) -> Dict[str, str]:
continue

raise ManifestServiceUnavailableError(
f"Failed to fetch manifest directory after {self.max_retries} attempts: {last_exception}"
f"Failed to fetch manifest after {self.max_retries} attempts: {last_exception}"
)

async def _get_client(self) -> httpx.AsyncClient:
Expand Down
Loading
Loading