diff --git a/api/server.py b/api/server.py index 386fd5f1..00ae2de2 100755 --- a/api/server.py +++ b/api/server.py @@ -9,6 +9,7 @@ import time import os import re +from collections import deque from enum import Enum from typing import Dict, List, Union, Annotated, Optional @@ -35,6 +36,15 @@ allow_headers=["*"], ) +# Solr core name for this application. +SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1' + +# We track the time taken for each Solr query for the last 1000 queries so we can track performance via /status. +DEFAULT_RECENT_TIMES_COUNT = 1000 +RECENT_TIMES_COUNT = int(os.getenv("RECENT_TIMES_COUNT", DEFAULT_RECENT_TIMES_COUNT)) +recent_query_times = deque(maxlen=RECENT_TIMES_COUNT) +recent_solr_times = deque(maxlen=RECENT_TIMES_COUNT) + # ENDPOINT / # If someone tries accessing /, we should redirect them to the Swagger interface. @app.get("/", include_in_schema=False) @@ -50,21 +60,74 @@ async def docs_redirect(): description="
This endpoint will return status information and a list of counts from the underlying Solr database instance for this NameRes instance.
" "You can find out more about this endpoint in the API documentation.
" ) -async def status_get() -> Dict: +async def status_get(metrics: bool = False) -> Dict: """ Return status and count information from the underyling Solr instance. """ - return await status() + return await status(metrics) -async def status() -> Dict: +async def status(include_metrics: bool = False) -> Dict: """ Return a dictionary containing status and count information for the underlying Solr instance. """ query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/cores" + metrics_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/metrics" async with httpx.AsyncClient(timeout=None) as client: response = await client.get(query_url, params={ 'action': 'STATUS' }) - if response.status_code >= 300: - logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text) - response.raise_for_status() + if response.status_code >= 300: + logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text) + response.raise_for_status() + + # Fetch Solr query handler, cache, and JVM metrics for strain detection. + # A single call with group=core&group=jvm retrieves both in one round-trip. + # Only performed when the caller passes ?metrics=true, as it adds latency. + solr_metrics = { + "message": "Use /status?metrics=true to retrieve these metrics." + } + if include_metrics: + try: + metrics_resp = await client.get(metrics_url, params=[ + ('group', 'core'), + ('group', 'jvm'), + ('prefix', 'QUERY./select'), + ('prefix', 'CACHE.core.queryResultCache'), + ('prefix', 'memory.heap'), + ('prefix', 'os.processCpuLoad'), + ('wt', 'json'), + ]) + if metrics_resp.status_code < 300: + all_metrics = metrics_resp.json().get('metrics', {}) + + core_data = all_metrics.get(f'solr.core.{SOLR_CORE_NAME}', {}) + qh = core_data.get('QUERY./select.requestTimes', {}) + cache = core_data.get('CACHE.core.queryResultCache', {}) + heap = all_metrics.get('solr.jvm', {}).get('memory.heap', {}) + cpu = all_metrics.get('solr.jvm', {}).get('os.processCpuLoad', None) + + solr_metrics = { + 'query_handler': { + 'requests': core_data.get('QUERY./select.requests'), + 'errors': core_data.get('QUERY./select.errors'), + 'timeouts': core_data.get('QUERY./select.timeouts'), + 'mean_ms': qh.get('mean_ms'), + 'p75_ms': qh.get('p75_ms'), + 'p95_ms': qh.get('p95_ms'), + 'p99_ms': qh.get('p99_ms'), + }, + 'cache': { + 'hitratio': cache.get('hitratio'), + 'evictions': cache.get('evictions'), + 'size': cache.get('size'), + }, + 'jvm': { + 'heap_used_mb': round(heap.get('used', 0) / 1_048_576, 1) if 'used' in heap else None, + 'heap_max_mb': round(heap.get('max', 0) / 1_048_576, 1) if 'max' in heap else None, + 'heap_used_pct': round(heap.get('used', 0) / heap['max'] * 100, 1) if heap.get('max') else None, + 'cpu_load': cpu, + }, + } + except Exception: + logger.warning("Failed to retrieve Solr metrics for /status", exc_info=True) + result = response.json() # Do we know the Babel version and version URL? It will be stored in an environmental variable if we do. @@ -83,9 +146,17 @@ async def status() -> Dict: if 'version' in app_info and app_info['version']: nameres_version = 'v' + app_info['version'] - # We should have a status for name_lookup_shard1_replica_n1. - if 'status' in result and 'name_lookup_shard1_replica_n1' in result['status']: - core = result['status']['name_lookup_shard1_replica_n1'] + # Prepare recent times for reporting. + recent_queries = { + 'max': RECENT_TIMES_COUNT, + 'count': len(recent_query_times), + 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, + 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, + } + + # We should have a status for SOLR_CORE_NAME. + if 'status' in result and SOLR_CORE_NAME in result['status']: + core = result['status'][SOLR_CORE_NAME] index = {} if 'index' in core: @@ -110,6 +181,8 @@ async def status() -> Dict: 'segmentCount': index.get('segmentCount', ''), 'lastModified': index.get('lastModified', ''), 'size': index.get('size', ''), + 'recent_queries': recent_queries, + 'solr_metrics': solr_metrics, } else: return { @@ -122,7 +195,9 @@ async def status() -> Dict: 'url': biolink_model_url, 'download_url': biolink_model_download_url, }, + 'recent_queries': recent_queries, 'nameres_version': nameres_version, + 'solr_metrics': solr_metrics, } @@ -605,9 +680,13 @@ async def lookup(string: str, debug=debug_for_this_request)) time_end = time.time_ns() + time_taken_ms = (time_end - time_start)/1_000_000 + time_taken_ms_solr = (time_solr_end - time_solr_start)/1_000_000 + recent_query_times.append(time_taken_ms) + recent_solr_times.append(time_taken_ms_solr) logger.info(f"Lookup query to Solr for {json.dumps(string)} " + f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, only_taxa={only_taxa}): " - f"took {(time_end - time_start)/1_000_000:.2f}ms (with {(time_solr_end - time_solr_start)/1_000_000:.2f}ms waiting for Solr)" + f"took {time_taken_ms:.2f}ms (with {time_taken_ms_solr:.2f}ms waiting for Solr)" ) return outputs diff --git a/documentation/API.md b/documentation/API.md index 57bcbdea..453f2c1f 100644 --- a/documentation/API.md +++ b/documentation/API.md @@ -333,6 +333,37 @@ Solr database. "version": 34838, "segmentCount": 57, "lastModified": "2025-09-24T19:09:56.524Z", - "size": "142.17 GB" + "size": "142.17 GB", + "recent_queries": { + "count": 1000, + "mean_time_ms": 42.3, + "mean_solr_time_ms": 38.1 + }, + "solr_metrics": { + "query_handler": { + "requests": 9842301, + "errors": 0, + "timeouts": 0, + "mean_ms": 41.2, + "p75_ms": 55.0, + "p95_ms": 120.3, + "p99_ms": 340.7 + }, + "cache": { + "hitratio": 0.91, + "evictions": 1240, + "size": 512 + }, + "jvm": { + "heap_used_mb": 4096.0, + "heap_max_mb": 8192.0, + "heap_used_pct": 50.0, + "cpu_load": 0.12 + } + } } ``` + +`recent_queries` tracks the last 1000 `/lookup` queries handled by this NameRes instance (configurable via the `RECENT_TIMES_COUNT` environment variable). `mean_time_ms` is the total end-to-end time; `mean_solr_time_ms` isolates the time spent waiting for Solr, which helps distinguish Solr-side strain from NameRes processing overhead. Both fields are `null` if no queries have been handled since startup. + +`solr_metrics` is only included when the `?metrics=true` query parameter is passed, as fetching it requires an additional round-trip to Solr. It is populated directly from Solr's `/admin/metrics` API and provides native Solr health indicators: cumulative query handler statistics (useful for detecting errors or timeouts), queryResultCache hit ratio (a low ratio indicates memory pressure or cache thrashing), and JVM heap/CPU metrics. This field is `null` within the response if the Solr metrics API is unavailable. diff --git a/tests/test_service.py b/tests/test_service.py index 2fa9a242..55c675fc 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -259,4 +259,5 @@ def test_only_taxa_queries(): }) results_ftd_disease_with_only_taxon = response.json() assert len(results_ftd_disease_with_only_taxon) == 1 - assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857' \ No newline at end of file + assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857' + diff --git a/tests/test_status.py b/tests/test_status.py index b48ddc07..f6290a5e 100644 --- a/tests/test_status.py +++ b/tests/test_status.py @@ -27,3 +27,53 @@ def test_status(): assert status['maxDoc'] == 89 assert status['deletedDocs'] == 0 + +def test_status_shape(): + """Verify /status returns expected fields including recent_queries; solr_metrics absent by default.""" + client = TestClient(app) + response = client.get("/status") + assert response.status_code == 200 + data = response.json() + + assert data['status'] == 'ok' + assert 'numDocs' in data + + # recent_queries should always be present; count/means are None before any queries. + rq = data['recent_queries'] + assert 'count' in rq + assert 'mean_time_ms' in rq + assert 'mean_solr_time_ms' in rq + + # solr_metrics should be present but with only a message unless ?metrics=true is passed. + assert 'solr_metrics' in data and 'message' in data['solr_metrics'] + + +def test_status_metrics_param(): + """With ?metrics=true, solr_metrics is included and has the expected structure.""" + client = TestClient(app) + response = client.get("/status", params={'metrics': 'true'}) + assert response.status_code == 200 + data = response.json() + + assert 'solr_metrics' in data + # solr_metrics may be None if Solr's metrics API is unavailable, but if present + # it must contain the expected structure. + if 'message' not in data['solr_metrics']: + sm = data['solr_metrics'] + assert 'query_handler' in sm + assert 'cache' in sm + assert 'jvm' in sm + assert 'requests' in sm['query_handler'] + assert 'hitratio' in sm['cache'] + assert 'heap_used_pct' in sm['jvm'] + + +def test_status_recent_queries_populated(): + """After a lookup, recent_queries should reflect at least one recorded time.""" + client = TestClient(app) + client.get("/lookup", params={'string': 'alzheimer'}) + response = client.get("/status") + data = response.json() + assert data['recent_queries']['count'] >= 1 + assert data['recent_queries']['mean_time_ms'] is not None + assert data['recent_queries']['mean_solr_time_ms'] is not None