From 9227999b20a1a5dd1590679b468005ada24cf4a0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 18 Dec 2025 16:46:31 -0500 Subject: [PATCH 01/12] Added a "recent times" that allows us to track query times. --- api/server.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/api/server.py b/api/server.py index 1a2965c4..65cc17e4 100755 --- a/api/server.py +++ b/api/server.py @@ -13,6 +13,7 @@ import warnings import os import re +from collections import deque from typing import Dict, List, Union, Annotated, Optional from fastapi import Body, FastAPI, Query @@ -38,6 +39,10 @@ allow_headers=["*"], ) +# We track the time taken for each Solr query for the last 1000 queries so we can track performance via /status. +RECENT_TIMES_COUNT = os.getenv("RECENT_TIMES_COUNT", 1000) +recent_query_times = deque(maxlen=RECENT_TIMES_COUNT) + # ENDPOINT / # If someone tries accessing /, we should redirect them to the Swagger interface. @app.get("/", include_in_schema=False) @@ -95,6 +100,11 @@ async def status() -> Dict: 'segmentCount': index.get('segmentCount', ''), 'lastModified': index.get('lastModified', ''), 'size': index.get('size', ''), + 'recent_queries': { + 'count': len(recent_query_times), + 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else -1, + 'recent_time_ms': list(recent_query_times), + } } else: return { @@ -532,9 +542,11 @@ async def lookup(string: str, types=[f"biolink:{d}" for d in doc.get("types", [])])) time_end = time.time_ns() + time_taken_ms = (time_end - time_start)/1_000_000 + recent_query_times.append(time_taken_ms) logger.info(f"Lookup query to Solr for {json.dumps(string)} " + f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, only_taxa={only_taxa}) " - f"took {(time_end - time_start)/1_000_000:.2f}ms (with {(time_solr_end - 
time_solr_start)/1_000_000:.2f}ms waiting for Solr)" + f"took {time_taken_ms:.2f}ms (with {(time_solr_end - time_solr_start)/1_000_000:.2f}ms waiting for Solr)" ) return outputs From ddeb95db45be88e99bcdd37cb8edafdfb17dd7c4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 18 Dec 2025 16:53:48 -0500 Subject: [PATCH 02/12] Improved name. --- api/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/server.py b/api/server.py index 65cc17e4..842eb067 100755 --- a/api/server.py +++ b/api/server.py @@ -103,7 +103,7 @@ async def status() -> Dict: 'recent_queries': { 'count': len(recent_query_times), 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else -1, - 'recent_time_ms': list(recent_query_times), + 'recent_times_ms': list(recent_query_times), } } else: From e17e784801cc788721b77b94ebca5448d74d5840 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 10:40:16 -0600 Subject: [PATCH 03/12] Cleaned up code. --- api/server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/server.py b/api/server.py index 84cf17d3..b672818b 100755 --- a/api/server.py +++ b/api/server.py @@ -616,10 +616,11 @@ async def lookup(string: str, time_end = time.time_ns() time_taken_ms = (time_end - time_start)/1_000_000 + time_taken_ms_solr = (time_solr_end - time_solr_start)/1_000_000 recent_query_times.append(time_taken_ms) logger.info(f"Lookup query to Solr for {json.dumps(string)} " + f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, only_taxa={only_taxa}): " - f"took {time_taken_ms:.2f}ms (with {(time_solr_end - time_solr_start)/1_000_000:.2f}ms waiting for Solr)" + f"took {time_taken_ms:.2f}ms (with {time_taken_ms_solr:.2f}ms waiting for Solr)" ) return outputs From 580d6f8559402b579f0e9fbaf5b4920426abe7f0 Mon Sep 17 00:00:00 2001 From: Gaurav 
Vaidya Date: Tue, 7 Apr 2026 10:56:28 -0600 Subject: [PATCH 04/12] Add Solr native metrics and separate Solr latency tracking to /status - Track Solr-only wait time in a separate deque so mean_solr_time_ms can be distinguished from total API time in /status - Pull query handler (requests/errors/timeouts/p75/p95/p99), cache (hitratio/evictions), and JVM (heap %, CPU load) from Solr's /admin/metrics API; fails gracefully with solr_metrics: null - Fix RECENT_TIMES_COUNT env var type cast (int) to prevent deque crash - Replace -1 sentinel with None for empty mean; remove verbose recent_times_ms list from response Co-Authored-By: Claude Sonnet 4.6 --- api/server.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/api/server.py b/api/server.py index b672818b..db4a2889 100755 --- a/api/server.py +++ b/api/server.py @@ -37,8 +37,9 @@ ) # We track the time taken for each Solr query for the last 1000 queries so we can track performance via /status. -RECENT_TIMES_COUNT = os.getenv("RECENT_TIMES_COUNT", 1000) +RECENT_TIMES_COUNT = int(os.getenv("RECENT_TIMES_COUNT", 1000)) recent_query_times = deque(maxlen=RECENT_TIMES_COUNT) +recent_solr_times = deque(maxlen=RECENT_TIMES_COUNT) # ENDPOINT / # If someone tries accessing /, we should redirect them to the Swagger interface. @@ -63,10 +64,62 @@ async def status_get() -> Dict: async def status() -> Dict: """ Return a dictionary containing status and count information for the underlying Solr instance. """ query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/cores" + metrics_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/metrics" async with httpx.AsyncClient(timeout=None) as client: response = await client.get(query_url, params={ 'action': 'STATUS' }) + + # Fetch Solr query handler, cache, and JVM metrics for strain detection. 
+ solr_metrics = None + try: + core_metrics_resp = await client.get(metrics_url, params={ + 'group': 'core', + 'prefix': 'QUERY./select,CACHE.core.queryResultCache', + 'wt': 'json', + }) + jvm_metrics_resp = await client.get(metrics_url, params={ + 'group': 'jvm', + 'prefix': 'memory.heap,os.processCpuLoad', + 'wt': 'json', + }) + if core_metrics_resp.status_code < 300 and jvm_metrics_resp.status_code < 300: + cm = core_metrics_resp.json().get('metrics', {}) + jm = jvm_metrics_resp.json().get('metrics', {}) + + # Core metrics are keyed by "solr.core.:" + core_key = next((k for k in cm if k.startswith('solr.core.')), None) + core_data = cm.get(core_key, {}) if core_key else {} + + qh = core_data.get('QUERY./select.requestTimes', {}) + cache = core_data.get('CACHE.core.queryResultCache', {}) + heap = jm.get('solr.jvm', {}).get('memory.heap', {}) + cpu = jm.get('solr.jvm', {}).get('os.processCpuLoad', None) + + solr_metrics = { + 'query_handler': { + 'requests': core_data.get('QUERY./select.requests'), + 'errors': core_data.get('QUERY./select.errors'), + 'timeouts': core_data.get('QUERY./select.timeouts'), + 'mean_ms': qh.get('mean_ms'), + 'p75_ms': qh.get('p75_ms'), + 'p95_ms': qh.get('p95_ms'), + 'p99_ms': qh.get('p99_ms'), + }, + 'cache': { + 'hitratio': cache.get('hitratio'), + 'evictions': cache.get('evictions'), + 'size': cache.get('size'), + }, + 'jvm': { + 'heap_used_mb': round(heap.get('used', 0) / 1_048_576, 1) if 'used' in heap else None, + 'heap_max_mb': round(heap.get('max', 0) / 1_048_576, 1) if 'max' in heap else None, + 'heap_used_pct': round(heap.get('used', 0) / heap['max'] * 100, 1) if heap.get('max') else None, + 'cpu_load': cpu, + }, + } + except Exception: + logger.warning("Failed to retrieve Solr metrics for /status", exc_info=True) if response.status_code >= 300: logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text) response.raise_for_status() @@ -117,9 +170,10 @@ async def status() -> Dict: 'size': 
index.get('size', ''), 'recent_queries': { 'count': len(recent_query_times), - 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else -1, - 'recent_times_ms': list(recent_query_times), - } + 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, + 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, + }, + 'solr_metrics': solr_metrics, } else: return { @@ -618,6 +672,7 @@ async def lookup(string: str, time_taken_ms = (time_end - time_start)/1_000_000 time_taken_ms_solr = (time_solr_end - time_solr_start)/1_000_000 recent_query_times.append(time_taken_ms) + recent_solr_times.append(time_taken_ms_solr) logger.info(f"Lookup query to Solr for {json.dumps(string)} " + f"(autocomplete={autocomplete}, highlighting={highlighting}, offset={offset}, limit={limit}, biolink_types={biolink_types}, only_prefixes={only_prefixes}, exclude_prefixes={exclude_prefixes}, only_taxa={only_taxa}): " f"took {time_taken_ms:.2f}ms (with {time_taken_ms_solr:.2f}ms waiting for Solr)" From a2e998ee9037092df84e4751d9573b6678cd14a3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 10:59:54 -0600 Subject: [PATCH 05/12] Address code review feedback on /status metrics - Combine two /admin/metrics calls into one (group=core&group=jvm), halving the round-trip overhead per /status request - Pin core key to name_lookup_shard1_replica_n1 instead of non- deterministic next() iteration over the metrics dict - Move raise_for_status() inside the async with block so all Solr I/O is co-located - Add test_status_shape and test_status_recent_queries_populated tests - Update API.md with recent_queries and solr_metrics response fields Co-Authored-By: Claude Sonnet 4.6 --- api/server.py | 42 +++++++++++++++++++----------------------- documentation/API.md | 33 ++++++++++++++++++++++++++++++++- tests/test_service.py | 42 +++++++++++++++++++++++++++++++++++++++++- 3 files 
changed, 92 insertions(+), 25 deletions(-) diff --git a/api/server.py b/api/server.py index db4a2889..c4d4babf 100755 --- a/api/server.py +++ b/api/server.py @@ -69,32 +69,30 @@ async def status() -> Dict: response = await client.get(query_url, params={ 'action': 'STATUS' }) + if response.status_code >= 300: + logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text) + response.raise_for_status() # Fetch Solr query handler, cache, and JVM metrics for strain detection. + # A single call with group=core&group=jvm retrieves both in one round-trip. + SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1' solr_metrics = None try: - core_metrics_resp = await client.get(metrics_url, params={ - 'group': 'core', - 'prefix': 'QUERY./select,CACHE.core.queryResultCache', - 'wt': 'json', - }) - jvm_metrics_resp = await client.get(metrics_url, params={ - 'group': 'jvm', - 'prefix': 'memory.heap,os.processCpuLoad', - 'wt': 'json', - }) - if core_metrics_resp.status_code < 300 and jvm_metrics_resp.status_code < 300: - cm = core_metrics_resp.json().get('metrics', {}) - jm = jvm_metrics_resp.json().get('metrics', {}) - - # Core metrics are keyed by "solr.core.:" - core_key = next((k for k in cm if k.startswith('solr.core.')), None) - core_data = cm.get(core_key, {}) if core_key else {} - + metrics_resp = await client.get(metrics_url, params=[ + ('group', 'core'), + ('group', 'jvm'), + ('prefix', 'QUERY./select,CACHE.core.queryResultCache'), + ('prefix', 'memory.heap,os.processCpuLoad'), + ('wt', 'json'), + ]) + if metrics_resp.status_code < 300: + all_metrics = metrics_resp.json().get('metrics', {}) + + core_data = all_metrics.get(f'solr.core.{SOLR_CORE_NAME}', {}) qh = core_data.get('QUERY./select.requestTimes', {}) cache = core_data.get('CACHE.core.queryResultCache', {}) - heap = jm.get('solr.jvm', {}).get('memory.heap', {}) - cpu = jm.get('solr.jvm', {}).get('os.processCpuLoad', None) + heap = all_metrics.get('solr.jvm', {}).get('memory.heap', {}) + cpu 
= all_metrics.get('solr.jvm', {}).get('os.processCpuLoad', None) solr_metrics = { 'query_handler': { @@ -120,9 +118,7 @@ async def status() -> Dict: } except Exception: logger.warning("Failed to retrieve Solr metrics for /status", exc_info=True) - if response.status_code >= 300: - logger.error("Solr error on accessing /solr/admin/cores?action=STATUS: %s", response.text) - response.raise_for_status() + result = response.json() # Do we know the Babel version and version URL? It will be stored in an environmental variable if we do. diff --git a/documentation/API.md b/documentation/API.md index 57bcbdea..7524e965 100644 --- a/documentation/API.md +++ b/documentation/API.md @@ -333,6 +333,37 @@ Solr database. "version": 34838, "segmentCount": 57, "lastModified": "2025-09-24T19:09:56.524Z", - "size": "142.17 GB" + "size": "142.17 GB", + "recent_queries": { + "count": 1000, + "mean_time_ms": 42.3, + "mean_solr_time_ms": 38.1 + }, + "solr_metrics": { + "query_handler": { + "requests": 9842301, + "errors": 0, + "timeouts": 0, + "mean_ms": 41.2, + "p75_ms": 55.0, + "p95_ms": 120.3, + "p99_ms": 340.7 + }, + "cache": { + "hitratio": 0.91, + "evictions": 1240, + "size": 512 + }, + "jvm": { + "heap_used_mb": 4096.0, + "heap_max_mb": 8192.0, + "heap_used_pct": 50.0, + "cpu_load": 0.12 + } + } } ``` + +`recent_queries` tracks the last 1000 `/lookup` queries handled by this NameRes instance (configurable via the `RECENT_TIMES_COUNT` environment variable). `mean_time_ms` is the total end-to-end time; `mean_solr_time_ms` isolates the time spent waiting for Solr, which helps distinguish Solr-side strain from NameRes processing overhead. Both fields are `null` if no queries have been handled since startup. 
+ +`solr_metrics` is populated directly from Solr's `/admin/metrics` API and provides native Solr health indicators: cumulative query handler statistics (useful for detecting errors or timeouts), queryResultCache hit ratio (a low ratio indicates memory pressure or cache thrashing), and JVM heap/CPU metrics. This field is `null` if the Solr metrics API is unavailable. diff --git a/tests/test_service.py b/tests/test_service.py index 2fa9a242..ece44c4d 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -259,4 +259,44 @@ def test_only_taxa_queries(): }) results_ftd_disease_with_only_taxon = response.json() assert len(results_ftd_disease_with_only_taxon) == 1 - assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857' \ No newline at end of file + assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857' + + +def test_status_shape(): + """Verify /status returns expected fields including recent_queries and solr_metrics.""" + client = TestClient(app) + response = client.get("/status") + assert response.status_code == 200 + data = response.json() + + assert data['status'] == 'ok' + assert 'numDocs' in data + + # recent_queries should always be present; count/means are None before any queries. + rq = data['recent_queries'] + assert 'count' in rq + assert 'mean_time_ms' in rq + assert 'mean_solr_time_ms' in rq + + # solr_metrics may be None if Solr's metrics API is unavailable, but if present + # it must contain the expected structure. 
+ assert 'solr_metrics' in data + if data['solr_metrics'] is not None: + sm = data['solr_metrics'] + assert 'query_handler' in sm + assert 'cache' in sm + assert 'jvm' in sm + assert 'requests' in sm['query_handler'] + assert 'hitratio' in sm['cache'] + assert 'heap_used_pct' in sm['jvm'] + + +def test_status_recent_queries_populated(): + """After a lookup, recent_queries should reflect at least one recorded time.""" + client = TestClient(app) + client.get("/lookup", params={'string': 'alzheimer'}) + response = client.get("/status") + data = response.json() + assert data['recent_queries']['count'] >= 1 + assert data['recent_queries']['mean_time_ms'] is not None + assert data['recent_queries']['mean_solr_time_ms'] is not None \ No newline at end of file From c8e876e4a387c34394f154b170381cdd714c96b3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 11:03:43 -0600 Subject: [PATCH 06/12] Gate solr_metrics behind ?metrics=true to avoid adding latency to /status The Solr metrics round-trip is skipped unless the caller explicitly passes ?metrics=true. The solr_metrics key is omitted from the response entirely when not requested. Tests and API.md updated accordingly. Co-Authored-By: Claude Sonnet 4.6 --- api/server.py | 92 ++++++++++++++++++++++--------------------- documentation/API.md | 2 +- tests/test_service.py | 15 ++++++- 3 files changed, 61 insertions(+), 48 deletions(-) diff --git a/api/server.py b/api/server.py index c4d4babf..994459e6 100755 --- a/api/server.py +++ b/api/server.py @@ -56,12 +56,12 @@ async def docs_redirect(): description="

This endpoint will return status information and a list of counts from the underlying Solr database instance for this NameRes instance.

" "

You can find out more about this endpoint in the API documentation.

" ) -async def status_get() -> Dict: +async def status_get(metrics: bool = False) -> Dict: """ Return status and count information from the underyling Solr instance. """ - return await status() + return await status(metrics) -async def status() -> Dict: +async def status(include_metrics: bool = False) -> Dict: """ Return a dictionary containing status and count information for the underlying Solr instance. """ query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/cores" metrics_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/admin/metrics" @@ -75,49 +75,51 @@ async def status() -> Dict: # Fetch Solr query handler, cache, and JVM metrics for strain detection. # A single call with group=core&group=jvm retrieves both in one round-trip. + # Only performed when the caller passes ?metrics=true, as it adds latency. SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1' solr_metrics = None - try: - metrics_resp = await client.get(metrics_url, params=[ - ('group', 'core'), - ('group', 'jvm'), - ('prefix', 'QUERY./select,CACHE.core.queryResultCache'), - ('prefix', 'memory.heap,os.processCpuLoad'), - ('wt', 'json'), - ]) - if metrics_resp.status_code < 300: - all_metrics = metrics_resp.json().get('metrics', {}) - - core_data = all_metrics.get(f'solr.core.{SOLR_CORE_NAME}', {}) - qh = core_data.get('QUERY./select.requestTimes', {}) - cache = core_data.get('CACHE.core.queryResultCache', {}) - heap = all_metrics.get('solr.jvm', {}).get('memory.heap', {}) - cpu = all_metrics.get('solr.jvm', {}).get('os.processCpuLoad', None) - - solr_metrics = { - 'query_handler': { - 'requests': core_data.get('QUERY./select.requests'), - 'errors': core_data.get('QUERY./select.errors'), - 'timeouts': core_data.get('QUERY./select.timeouts'), - 'mean_ms': qh.get('mean_ms'), - 'p75_ms': qh.get('p75_ms'), - 'p95_ms': qh.get('p95_ms'), - 'p99_ms': qh.get('p99_ms'), - }, - 'cache': { - 'hitratio': cache.get('hitratio'), - 'evictions': cache.get('evictions'), - 'size': cache.get('size'), - }, - 'jvm': { - 
'heap_used_mb': round(heap.get('used', 0) / 1_048_576, 1) if 'used' in heap else None, - 'heap_max_mb': round(heap.get('max', 0) / 1_048_576, 1) if 'max' in heap else None, - 'heap_used_pct': round(heap.get('used', 0) / heap['max'] * 100, 1) if heap.get('max') else None, - 'cpu_load': cpu, - }, - } - except Exception: - logger.warning("Failed to retrieve Solr metrics for /status", exc_info=True) + if include_metrics: + try: + metrics_resp = await client.get(metrics_url, params=[ + ('group', 'core'), + ('group', 'jvm'), + ('prefix', 'QUERY./select,CACHE.core.queryResultCache'), + ('prefix', 'memory.heap,os.processCpuLoad'), + ('wt', 'json'), + ]) + if metrics_resp.status_code < 300: + all_metrics = metrics_resp.json().get('metrics', {}) + + core_data = all_metrics.get(f'solr.core.{SOLR_CORE_NAME}', {}) + qh = core_data.get('QUERY./select.requestTimes', {}) + cache = core_data.get('CACHE.core.queryResultCache', {}) + heap = all_metrics.get('solr.jvm', {}).get('memory.heap', {}) + cpu = all_metrics.get('solr.jvm', {}).get('os.processCpuLoad', None) + + solr_metrics = { + 'query_handler': { + 'requests': core_data.get('QUERY./select.requests'), + 'errors': core_data.get('QUERY./select.errors'), + 'timeouts': core_data.get('QUERY./select.timeouts'), + 'mean_ms': qh.get('mean_ms'), + 'p75_ms': qh.get('p75_ms'), + 'p95_ms': qh.get('p95_ms'), + 'p99_ms': qh.get('p99_ms'), + }, + 'cache': { + 'hitratio': cache.get('hitratio'), + 'evictions': cache.get('evictions'), + 'size': cache.get('size'), + }, + 'jvm': { + 'heap_used_mb': round(heap.get('used', 0) / 1_048_576, 1) if 'used' in heap else None, + 'heap_max_mb': round(heap.get('max', 0) / 1_048_576, 1) if 'max' in heap else None, + 'heap_used_pct': round(heap.get('used', 0) / heap['max'] * 100, 1) if heap.get('max') else None, + 'cpu_load': cpu, + }, + } + except Exception: + logger.warning("Failed to retrieve Solr metrics for /status", exc_info=True) result = response.json() @@ -169,7 +171,7 @@ async def status() -> Dict: 
'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, }, - 'solr_metrics': solr_metrics, + **(({'solr_metrics': solr_metrics}) if include_metrics else {}), } else: return { diff --git a/documentation/API.md b/documentation/API.md index 7524e965..453f2c1f 100644 --- a/documentation/API.md +++ b/documentation/API.md @@ -366,4 +366,4 @@ Solr database. `recent_queries` tracks the last 1000 `/lookup` queries handled by this NameRes instance (configurable via the `RECENT_TIMES_COUNT` environment variable). `mean_time_ms` is the total end-to-end time; `mean_solr_time_ms` isolates the time spent waiting for Solr, which helps distinguish Solr-side strain from NameRes processing overhead. Both fields are `null` if no queries have been handled since startup. -`solr_metrics` is populated directly from Solr's `/admin/metrics` API and provides native Solr health indicators: cumulative query handler statistics (useful for detecting errors or timeouts), queryResultCache hit ratio (a low ratio indicates memory pressure or cache thrashing), and JVM heap/CPU metrics. This field is `null` if the Solr metrics API is unavailable. +`solr_metrics` is only included when the `?metrics=true` query parameter is passed, as fetching it requires an additional round-trip to Solr. It is populated directly from Solr's `/admin/metrics` API and provides native Solr health indicators: cumulative query handler statistics (useful for detecting errors or timeouts), queryResultCache hit ratio (a low ratio indicates memory pressure or cache thrashing), and JVM heap/CPU metrics. This field is `null` within the response if the Solr metrics API is unavailable. 
diff --git a/tests/test_service.py b/tests/test_service.py index ece44c4d..2654c376 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -263,7 +263,7 @@ def test_only_taxa_queries(): def test_status_shape(): - """Verify /status returns expected fields including recent_queries and solr_metrics.""" + """Verify /status returns expected fields including recent_queries; solr_metrics absent by default.""" client = TestClient(app) response = client.get("/status") assert response.status_code == 200 @@ -278,9 +278,20 @@ def test_status_shape(): assert 'mean_time_ms' in rq assert 'mean_solr_time_ms' in rq + # solr_metrics should not be present unless ?metrics=true is passed. + assert 'solr_metrics' not in data + + +def test_status_metrics_param(): + """With ?metrics=true, solr_metrics is included and has the expected structure.""" + client = TestClient(app) + response = client.get("/status", params={'metrics': 'true'}) + assert response.status_code == 200 + data = response.json() + + assert 'solr_metrics' in data # solr_metrics may be None if Solr's metrics API is unavailable, but if present # it must contain the expected structure. 
- assert 'solr_metrics' in data if data['solr_metrics'] is not None: sm = data['solr_metrics'] assert 'query_handler' in sm From e09415300c126eba09154f8ab0011b504432fa6b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 13:19:12 -0400 Subject: [PATCH 07/12] Update api/server.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- api/server.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/api/server.py b/api/server.py index 994459e6..792e17b7 100755 --- a/api/server.py +++ b/api/server.py @@ -83,8 +83,10 @@ async def status(include_metrics: bool = False) -> Dict: metrics_resp = await client.get(metrics_url, params=[ ('group', 'core'), ('group', 'jvm'), - ('prefix', 'QUERY./select,CACHE.core.queryResultCache'), - ('prefix', 'memory.heap,os.processCpuLoad'), + ('prefix', 'QUERY./select'), + ('prefix', 'CACHE.core.queryResultCache'), + ('prefix', 'memory.heap'), + ('prefix', 'os.processCpuLoad'), ('wt', 'json'), ]) if metrics_resp.status_code < 300: From 2e9bfa99e5f5512fb60259a4f4ee6b1b8768576d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 11:18:23 -0600 Subject: [PATCH 08/12] Cleaned up solr_metrics inclusion. --- api/server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/api/server.py b/api/server.py index 792e17b7..194e7549 100755 --- a/api/server.py +++ b/api/server.py @@ -77,7 +77,9 @@ async def status(include_metrics: bool = False) -> Dict: # A single call with group=core&group=jvm retrieves both in one round-trip. # Only performed when the caller passes ?metrics=true, as it adds latency. SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1' - solr_metrics = None + solr_metrics = { + "message": "Use /status?metrics=true to retrieve these metrics." 
+ } if include_metrics: try: metrics_resp = await client.get(metrics_url, params=[ @@ -173,7 +175,7 @@ async def status(include_metrics: bool = False) -> Dict: 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, }, - **(({'solr_metrics': solr_metrics}) if include_metrics else {}), + solr_metrics: solr_metrics, } else: return { @@ -187,6 +189,7 @@ async def status(include_metrics: bool = False) -> Dict: 'download_url': biolink_model_download_url, }, 'nameres_version': nameres_version, + solr_metrics: solr_metrics, } From a4a1d1d93ea41e593f45b3264f49a8a40c95f1df Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 11:23:02 -0600 Subject: [PATCH 09/12] Dedup SOLR_CORE_NAME. --- api/server.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/api/server.py b/api/server.py index 194e7549..198184c4 100755 --- a/api/server.py +++ b/api/server.py @@ -36,8 +36,12 @@ allow_headers=["*"], ) +# Solr core name for this application. +SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1' + # We track the time taken for each Solr query for the last 1000 queries so we can track performance via /status. -RECENT_TIMES_COUNT = int(os.getenv("RECENT_TIMES_COUNT", 1000)) +DEFAULT_RECENT_TIMES_COUNT = 1000 +RECENT_TIMES_COUNT = int(os.getenv("RECENT_TIMES_COUNT", DEFAULT_RECENT_TIMES_COUNT)) recent_query_times = deque(maxlen=RECENT_TIMES_COUNT) recent_solr_times = deque(maxlen=RECENT_TIMES_COUNT) @@ -76,7 +80,6 @@ async def status(include_metrics: bool = False) -> Dict: # Fetch Solr query handler, cache, and JVM metrics for strain detection. # A single call with group=core&group=jvm retrieves both in one round-trip. # Only performed when the caller passes ?metrics=true, as it adds latency. 
- SOLR_CORE_NAME = 'name_lookup_shard1_replica_n1' solr_metrics = { "message": "Use /status?metrics=true to retrieve these metrics." } @@ -143,9 +146,9 @@ async def status(include_metrics: bool = False) -> Dict: if 'version' in app_info and app_info['version']: nameres_version = 'v' + app_info['version'] - # We should have a status for name_lookup_shard1_replica_n1. - if 'status' in result and 'name_lookup_shard1_replica_n1' in result['status']: - core = result['status']['name_lookup_shard1_replica_n1'] + # We should have a status for SOLR_CORE_NAME. + if 'status' in result and SOLR_CORE_NAME in result['status']: + core = result['status'][SOLR_CORE_NAME] index = {} if 'index' in core: @@ -182,7 +185,7 @@ async def status(include_metrics: bool = False) -> Dict: 'status': 'error', 'message': 'Expected core not found.', 'babel_version': babel_version, - 'babel_version_url': babel_version_url, + 'babel_version_urlg': babel_version_url, 'biolink_model': { 'tag': biolink_model_tag, 'url': biolink_model_url, From e8a4a00a5c807bed2eea6ae2d62571f44d2a5634 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 11:26:20 -0600 Subject: [PATCH 10/12] Fix typo. 
--- api/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/server.py b/api/server.py index 198184c4..5b34170d 100755 --- a/api/server.py +++ b/api/server.py @@ -178,7 +178,7 @@ async def status(include_metrics: bool = False) -> Dict: 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, }, - solr_metrics: solr_metrics, + 'solr_metrics': solr_metrics, } else: return { @@ -192,7 +192,7 @@ async def status(include_metrics: bool = False) -> Dict: 'download_url': biolink_model_download_url, }, 'nameres_version': nameres_version, - solr_metrics: solr_metrics, + 'solr_metrics': solr_metrics, } From 17c4107358d120532632eceba549796308cad141 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 11:30:58 -0600 Subject: [PATCH 11/12] Attempt to reorganize and fix tests. --- tests/test_service.py | 50 ------------------------------------------- tests/test_status.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 50 deletions(-) diff --git a/tests/test_service.py b/tests/test_service.py index 2654c376..55c675fc 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -261,53 +261,3 @@ def test_only_taxa_queries(): assert len(results_ftd_disease_with_only_taxon) == 1 assert results_ftd_disease_with_only_taxon[0]['curie'] == 'MONDO:0010857' - -def test_status_shape(): - """Verify /status returns expected fields including recent_queries; solr_metrics absent by default.""" - client = TestClient(app) - response = client.get("/status") - assert response.status_code == 200 - data = response.json() - - assert data['status'] == 'ok' - assert 'numDocs' in data - - # recent_queries should always be present; count/means are None before any queries. 
- rq = data['recent_queries'] - assert 'count' in rq - assert 'mean_time_ms' in rq - assert 'mean_solr_time_ms' in rq - - # solr_metrics should not be present unless ?metrics=true is passed. - assert 'solr_metrics' not in data - - -def test_status_metrics_param(): - """With ?metrics=true, solr_metrics is included and has the expected structure.""" - client = TestClient(app) - response = client.get("/status", params={'metrics': 'true'}) - assert response.status_code == 200 - data = response.json() - - assert 'solr_metrics' in data - # solr_metrics may be None if Solr's metrics API is unavailable, but if present - # it must contain the expected structure. - if data['solr_metrics'] is not None: - sm = data['solr_metrics'] - assert 'query_handler' in sm - assert 'cache' in sm - assert 'jvm' in sm - assert 'requests' in sm['query_handler'] - assert 'hitratio' in sm['cache'] - assert 'heap_used_pct' in sm['jvm'] - - -def test_status_recent_queries_populated(): - """After a lookup, recent_queries should reflect at least one recorded time.""" - client = TestClient(app) - client.get("/lookup", params={'string': 'alzheimer'}) - response = client.get("/status") - data = response.json() - assert data['recent_queries']['count'] >= 1 - assert data['recent_queries']['mean_time_ms'] is not None - assert data['recent_queries']['mean_solr_time_ms'] is not None \ No newline at end of file diff --git a/tests/test_status.py b/tests/test_status.py index b48ddc07..f6290a5e 100644 --- a/tests/test_status.py +++ b/tests/test_status.py @@ -27,3 +27,53 @@ def test_status(): assert status['maxDoc'] == 89 assert status['deletedDocs'] == 0 + +def test_status_shape(): + """Verify /status returns expected fields including recent_queries; solr_metrics absent by default.""" + client = TestClient(app) + response = client.get("/status") + assert response.status_code == 200 + data = response.json() + + assert data['status'] == 'ok' + assert 'numDocs' in data + + # recent_queries should always be 
present; the means are None (and count is 0) before any queries. + rq = data['recent_queries'] + assert 'count' in rq + assert 'mean_time_ms' in rq + assert 'mean_solr_time_ms' in rq + + # solr_metrics should be present but with only a message unless ?metrics=true is passed. + assert 'solr_metrics' in data and 'message' in data['solr_metrics'] + + +def test_status_metrics_param(): + """With ?metrics=true, solr_metrics is included and has the expected structure.""" + client = TestClient(app) + response = client.get("/status", params={'metrics': 'true'}) + assert response.status_code == 200 + data = response.json() + + assert 'solr_metrics' in data + # solr_metrics may contain only a 'message' entry instead of metrics; when it does not, + # it must contain the expected structure. + if 'message' not in data['solr_metrics']: + sm = data['solr_metrics'] + assert 'query_handler' in sm + assert 'cache' in sm + assert 'jvm' in sm + assert 'requests' in sm['query_handler'] + assert 'hitratio' in sm['cache'] + assert 'heap_used_pct' in sm['jvm'] + + +def test_status_recent_queries_populated(): + """After a lookup, recent_queries should reflect at least one recorded time.""" + client = TestClient(app) + client.get("/lookup", params={'string': 'alzheimer'}) + response = client.get("/status") + data = response.json() + assert data['recent_queries']['count'] >= 1 + assert data['recent_queries']['mean_time_ms'] is not None + assert data['recent_queries']['mean_solr_time_ms'] is not None From 38677c309fb0913707002dccf2e5395ab0e981f7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 7 Apr 2026 11:42:26 -0600 Subject: [PATCH 12/12] Improved recent_queries output.
--- api/server.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/api/server.py b/api/server.py index 5b34170d..00ae2de2 100755 --- a/api/server.py +++ b/api/server.py @@ -146,6 +146,14 @@ async def status(include_metrics: bool = False) -> Dict: if 'version' in app_info and app_info['version']: nameres_version = 'v' + app_info['version'] + # Prepare recent times for reporting. + recent_queries = { + 'max': RECENT_TIMES_COUNT, + 'count': len(recent_query_times), + 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, + 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, + } + # We should have a status for SOLR_CORE_NAME. if 'status' in result and SOLR_CORE_NAME in result['status']: core = result['status'][SOLR_CORE_NAME] @@ -173,11 +181,7 @@ async def status(include_metrics: bool = False) -> Dict: 'segmentCount': index.get('segmentCount', ''), 'lastModified': index.get('lastModified', ''), 'size': index.get('size', ''), - 'recent_queries': { - 'count': len(recent_query_times), - 'mean_time_ms': sum(recent_query_times) / len(recent_query_times) if recent_query_times else None, - 'mean_solr_time_ms': sum(recent_solr_times) / len(recent_solr_times) if recent_solr_times else None, - }, + 'recent_queries': recent_queries, 'solr_metrics': solr_metrics, } else: @@ -185,12 +189,13 @@ async def status(include_metrics: bool = False) -> Dict: 'status': 'error', 'message': 'Expected core not found.', 'babel_version': babel_version, - 'babel_version_urlg': babel_version_url, + 'babel_version_url': babel_version_url, 'biolink_model': { 'tag': biolink_model_tag, 'url': biolink_model_url, 'download_url': biolink_model_download_url, }, + 'recent_queries': recent_queries, 'nameres_version': nameres_version, 'solr_metrics': solr_metrics, }