From d759096f1669447e7e23441587c65735b7826c2c Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 30 Jun 2026 17:23:27 -0500
Subject: [PATCH 1/3] feat(waterdata): add max_rows to the OGC data getters to
 cap total results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

limit= on get_daily, get_continuous, get_monitoring_locations,
get_time_series_metadata, get_combined_metadata, get_latest_continuous,
get_latest_daily, get_field_measurements, get_field_measurements_metadata,
get_peaks, and get_channel has always been a per-page size, not a result
cap — _paginate follows every `next` link regardless, so a broad,
unfiltered call (e.g. get_daily(parameter_code="00060", limit=10)) pages
through the entire multi-year, nationwide result 10 rows at a time. This
was observed against the live service: the call hung for 2+ minutes.

get_ogc_data already threads a max_rows kwarg through to the _row_cap
context var and _finalize_ogc's truncation (get_reference_table has used
it since it was added); these 11 getters just never exposed it. Add
max_rows: int | None = None to each, exclude it from the OGC query args,
and pass it through to get_ogc_data. Each limit docstring now says
explicitly that it's a page size, not a cap, and points to max_rows.

get_cql builds its requests directly rather than through get_ogc_data, so
it isn't covered here.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
---
 NEWS.md                        |   2 +
 dataretrieval/waterdata/api.py | 155 +++++++++++++++++++++++++++------
 tests/waterdata_test.py        |  16 ++++
 3 files changed, 148 insertions(+), 25 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index a86a0314..7c6000b1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+**06/30/2026:** The OGC `waterdata` data getters (`get_daily`, `get_continuous`, `get_monitoring_locations`, `get_time_series_metadata`, `get_combined_metadata`, `get_latest_continuous`, `get_latest_daily`, `get_field_measurements`, `get_field_measurements_metadata`, `get_peaks`, `get_channel`) now accept `max_rows=`, mirroring the cap `get_reference_table` already had. `limit=` has always been a **per-page** size, not a total-result cap — a broad, unfiltered call like `get_daily(parameter_code="00060", limit=10)` would previously page through the *entire* multi-year, nationwide result 10 rows at a time rather than stopping at 10. `max_rows=` now stops pagination once that many rows have accumulated and truncates the combined result to exactly that count; `limit=`'s docstring on every affected getter now says explicitly that it's a page size. `get_cql` (the raw-CQL escape hatch) builds its own requests and isn't covered by this change.
+
 **06/23/2026:** **Breaking change (1.2.0):** the minimum supported Python is now **3.10** (`requires-python = ">=3.10"`). 3.9 support was already effectively broken — the `waterdata` module's dependencies (`anyio`, the test stack) require 3.10+, and the `waterdata` test modules already skipped on <3.10. `anyio` is now declared as a direct dependency (it is imported directly by `waterdata`), and the CI/ruff/mypy targets move to 3.10. Also fully removed the deprecated `variable_info` metadata property: the `NWIS_Metadata` override only warned and returned `None` (it relied on the defunct `get_pmcodes`), and the `BaseMetadata` abstract is gone too since nothing implemented it — accessing `.variable_info` now raises `AttributeError`. `site_info` is unaffected.
 
 **06/23/2026:** **Breaking change (1.2.0):** removed the `nadp` module and the deprecated `samples` module ahead of the 1.2.0 release. `nadp` was deprecated on 05/01/2026 — NADP is not a USGS data source, so retrieve NADP data directly from https://nadp.slh.wisc.edu/. The `samples.get_usgs_samples` shim (a deprecated forward to the modern getter) is gone; use `waterdata.get_samples()` instead. `import dataretrieval.nadp` / `import dataretrieval.samples` now raise `ModuleNotFoundError`.
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index e79d08f7..04c1c69d 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -74,6 +74,7 @@ def get_daily(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Daily data provide one data value to represent water conditions for the
     day.
@@ -199,6 +200,9 @@ def get_daily(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -206,6 +210,11 @@ def get_daily(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -273,9 +282,9 @@ def get_daily(
     service = "daily"
 
     # Build argument dictionary, omitting None values
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_continuous(
@@ -295,6 +304,7 @@ def get_continuous(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """
     Continuous data provide instantaneous water conditions.
@@ -414,6 +424,9 @@ def get_continuous(
         allowable limit is 10000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -421,6 +434,11 @@ def get_continuous(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -466,9 +484,9 @@ def get_continuous(
     service = "continuous"
 
     # Build argument dictionary, omitting None values
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_monitoring_locations(
@@ -520,6 +538,7 @@ def get_monitoring_locations(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Location information is basic information about the monitoring location
     including the name, identifier, agency responsible for data collection, and
@@ -726,6 +745,9 @@ def get_monitoring_locations(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     skip_geometry : boolean, optional
         This option can be used to skip response geometries for each feature.
         The returning object will be a data frame with no spatial information.
@@ -738,6 +760,11 @@ def get_monitoring_locations(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -774,9 +801,11 @@ def get_monitoring_locations(
 
     # Build argument dictionary, omitting None values (resolving the unified
     # `state` argument into the OGC `state_name` queryable).
-    args = _get_args(_with_state(locals(), to="name", into="state_name"))
+    args = _get_args(
+        _with_state(locals(), to="name", into="state_name"), exclude={"max_rows"}
+    )
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_time_series_metadata(
@@ -808,6 +837,7 @@ def get_time_series_metadata(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Daily data and continuous measurements are grouped into time series,
     which represent a collection of observations of a single parameter,
@@ -968,6 +998,9 @@ def get_time_series_metadata(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -975,6 +1008,11 @@ def get_time_series_metadata(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -1011,9 +1049,11 @@ def get_time_series_metadata(
 
     # Build argument dictionary, omitting None values (resolving the unified
     # `state` argument into the OGC `state_name` queryable).
-    args = _get_args(_with_state(locals(), to="name", into="state_name"))
+    args = _get_args(
+        _with_state(locals(), to="name", into="state_name"), exclude={"max_rows"}
+    )
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_combined_metadata(
@@ -1080,6 +1120,7 @@ def get_combined_metadata(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Get combined monitoring-location and time-series metadata.
 
@@ -1174,7 +1215,10 @@ def get_combined_metadata(
         (longitude/latitude, west-south-east-north).
     limit : int, optional
         Page size; the maximum allowable value is 50000. Default
-        (``None``) requests the maximum allowable limit.
+        (``None``) requests the maximum allowable limit. This is a
+        per-page size, not a cap on the total result: a query matching more
+        rows than ``limit`` still returns every matching row across
+        multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -1182,6 +1226,11 @@ def get_combined_metadata(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -1253,9 +1302,11 @@ def get_combined_metadata(
     service = "combined-metadata"
 
     # Resolve the unified `state` argument into the OGC `state_name` queryable.
-    args = _get_args(_with_state(locals(), to="name", into="state_name"))
+    args = _get_args(
+        _with_state(locals(), to="name", into="state_name"), exclude={"max_rows"}
+    )
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_latest_continuous(
@@ -1277,6 +1328,7 @@ def get_latest_continuous(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """This endpoint provides the most recent observation for each time series
     of continuous data. Continuous data are collected via automated sensors
@@ -1399,6 +1451,9 @@ def get_latest_continuous(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -1406,6 +1461,11 @@ def get_latest_continuous(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -1454,9 +1514,9 @@ def get_latest_continuous(
     service = "latest-continuous"
 
     # Build argument dictionary, omitting None values
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_latest_daily(
@@ -1478,6 +1538,7 @@ def get_latest_daily(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Daily data provide one data value to represent water conditions for the
     day.
@@ -1602,6 +1663,9 @@ def get_latest_daily(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -1609,6 +1673,11 @@ def get_latest_daily(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -1656,9 +1725,9 @@ def get_latest_daily(
     service = "latest-daily"
 
     # Build argument dictionary, omitting None values
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_field_measurements(
@@ -1682,6 +1751,7 @@ def get_field_measurements(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Field measurements are physically measured values collected during a
     visit to the monitoring location. Field measurements consist of measurements
@@ -1797,6 +1867,9 @@ def get_field_measurements(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -1804,6 +1877,11 @@ def get_field_measurements(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -1853,9 +1931,9 @@ def get_field_measurements(
     service = "field-measurements"
 
     # Build argument dictionary, omitting None values
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_field_measurements_metadata(
@@ -1873,6 +1951,7 @@ def get_field_measurements_metadata(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Get field-measurement metadata: one row per (location, parameter) series.
 
@@ -1918,7 +1997,10 @@ def get_field_measurements_metadata(
         (longitude / latitude, west-south-east-north).
     limit : int, optional
         Page size; the maximum allowable value is 50000. Default
-        (``None``) requests the maximum allowable limit.
+        (``None``) requests the maximum allowable limit. This is a
+        per-page size, not a cap on the total result: a query matching more
+        rows than ``limit`` still returns every matching row across
+        multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -1926,6 +2008,11 @@ def get_field_measurements_metadata(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -1974,9 +2061,9 @@ def get_field_measurements_metadata(
     """
     service = "field-measurements-metadata"
 
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_peaks(
@@ -1998,6 +2085,7 @@ def get_peaks(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """Get the annual peak streamflow / stage record for a monitoring location.
 
@@ -2048,7 +2136,10 @@ def get_peaks(
         (longitude / latitude, west-south-east-north).
     limit : int, optional
         Page size; the maximum allowable value is 50000. Default
-        (``None``) requests the maximum allowable limit.
+        (``None``) requests the maximum allowable limit. This is a
+        per-page size, not a cap on the total result: a query matching more
+        rows than ``limit`` still returns every matching row across
+        multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -2056,6 +2147,11 @@ def get_peaks(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -2100,9 +2196,9 @@ def get_peaks(
     """
     service = "peaks"
 
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_reference_table(
@@ -2916,6 +3012,7 @@ def get_channel(
     filter: str | None = None,
     filter_lang: FILTER_LANG | None = None,
     convert_type: bool = True,
+    max_rows: int | None = None,
 ) -> tuple[pd.DataFrame, BaseMetadata]:
     """
     Channel measurements taken as part of streamflow field measurements.
@@ -3038,6 +3135,9 @@ def get_channel(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+        This is a per-page size, not a cap on the total result: a query
+        matching more rows than ``limit`` still returns every matching row
+        across multiple pages. Use ``max_rows`` to cap the total instead.
     filter, filter_lang : optional
         Server-side CQL filter passed through as the OGC ``filter`` /
         ``filter-lang`` query parameters. See
@@ -3045,6 +3145,11 @@ def get_channel(
         and the lexicographic-comparison pitfall.
     convert_type : boolean, optional
         If True, converts columns to appropriate types.
+    max_rows : int, optional
+        Cap the total number of rows returned, stopping pagination early
+        instead of downloading the whole result. Unlike ``limit`` (the
+        per-page size), this bounds the total result across every page.
+        The default (None) follows pagination to completion.
 
     Returns
     -------
@@ -3072,9 +3177,9 @@ def get_channel(
     """
     service = "channel-measurements"
 
-    args = _get_args(locals())
+    args = _get_args(locals(), exclude={"max_rows"})
 
-    return get_ogc_data(args, service)
+    return get_ogc_data(args, service, max_rows=max_rows)
 
 
 def get_cql(
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index a183e74c..0fc507cc 100644
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -556,6 +556,22 @@ def test_get_daily():
     assert df["value"].dtype == "float64"
 
 
+def test_get_daily_max_rows_caps_total_across_pages():
+    # ``limit`` is a per-page size, not a result cap — it doesn't stop
+    # pagination from following every ``next`` link until the whole matching
+    # result is exhausted. ``max_rows`` is the actual cap: with a tiny
+    # ``limit`` forcing multiple pages, the combined result is still
+    # truncated to exactly ``max_rows`` instead of paging to completion.
+    df, _ = get_daily(
+        monitoring_location_id="USGS-05427718",
+        parameter_code="00060",
+        time="2025-01-01/..",
+        limit=1,
+        max_rows=3,
+    )
+    assert len(df) == 3
+
+
 def test_get_daily_properties():
     df, _ = get_daily(
         monitoring_location_id="USGS-05427718",

From 4870f19d8e994ebb5f7b9ba0ac0b9a15f969223c Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 30 Jun 2026 17:32:30 -0500
Subject: [PATCH 2/3] test(waterdata): make the max_rows test a deterministic
 wiring check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original test exercised max_rows live with limit=1, forcing ~3 serial
round-trips against the real USGS API (and, under the module's flaky_api
marker, retrying the whole 3-page call on any transient blip) just to
re-prove cap-across-pages behavior that is already covered without a network
hop by the engine's _row_cap / _finalize_ogc tests in
tests/waterdata_utils_test.py.

The only behavior this PR actually adds is the per-getter wiring: max_rows
must be excluded from the request args (it's a client-side pagination cap,
not an OGC query param the server understands) and forwarded to get_ogc_data
as a keyword. Pin exactly that with the file's existing
mock.patch("...api.get_ogc_data") pattern — no network, deterministic.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/waterdata_test.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 0fc507cc..907232d0 100644
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -556,20 +556,25 @@ def test_get_daily():
     assert df["value"].dtype == "float64"
 
 
-def test_get_daily_max_rows_caps_total_across_pages():
-    # ``limit`` is a per-page size, not a result cap — it doesn't stop
-    # pagination from following every ``next`` link until the whole matching
-    # result is exhausted. ``max_rows`` is the actual cap: with a tiny
-    # ``limit`` forcing multiple pages, the combined result is still
-    # truncated to exactly ``max_rows`` instead of paging to completion.
-    df, _ = get_daily(
-        monitoring_location_id="USGS-05427718",
-        parameter_code="00060",
-        time="2025-01-01/..",
-        limit=1,
-        max_rows=3,
-    )
-    assert len(df) == 3
+def test_get_daily_max_rows_is_excluded_from_request_and_forwarded():
+    # ``max_rows`` is a client-side pagination cap, not an OGC query
+    # parameter — the server never sees it. So a getter must keep it out of
+    # the request ``args`` (which become query params) and instead forward it
+    # to ``get_ogc_data`` as the keyword that drives the cap. This pins that
+    # wiring; the cap mechanism itself (stop following ``next`` once the cap is
+    # met, then truncate the combined frame to exactly N) is covered without a
+    # network round-trip by the ``_row_cap`` / ``_finalize_ogc`` tests in
+    # tests/waterdata_utils_test.py.
+    with mock.patch("dataretrieval.waterdata.api.get_ogc_data") as fake:
+        fake.return_value = (pd.DataFrame(), mock.MagicMock(spec=[]))
+        get_daily(
+            monitoring_location_id="USGS-05427718",
+            parameter_code="00060",
+            max_rows=3,
+        )
+    args_dict = fake.call_args[0][0]
+    assert "max_rows" not in args_dict  # not leaked into the query params
+    assert fake.call_args.kwargs["max_rows"] == 3  # forwarded to the cap
 
 
 def test_get_daily_properties():

From 690e07c1602b60772f7ca350bfa2e52d3870b1e9 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 30 Jun 2026 21:02:48 -0500
Subject: [PATCH 3/3] docs: drop the NEWS.md entry from the max_rows PR

Removing the changelog entry per request; the code/test/docstring changes
stand on their own.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 NEWS.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 7c6000b1..a86a0314 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,3 @@
-**06/30/2026:** The OGC `waterdata` data getters (`get_daily`, `get_continuous`, `get_monitoring_locations`, `get_time_series_metadata`, `get_combined_metadata`, `get_latest_continuous`, `get_latest_daily`, `get_field_measurements`, `get_field_measurements_metadata`, `get_peaks`, `get_channel`) now accept `max_rows=`, mirroring the cap `get_reference_table` already had. `limit=` has always been a **per-page** size, not a total-result cap — a broad, unfiltered call like `get_daily(parameter_code="00060", limit=10)` would previously page through the *entire* multi-year, nationwide result 10 rows at a time rather than stopping at 10. `max_rows=` now stops pagination once that many rows have accumulated and truncates the combined result to exactly that count; `limit=`'s docstring on every affected getter now says explicitly that it's a page size. `get_cql` (the raw-CQL escape hatch) builds its own requests and isn't covered by this change.
-
 **06/23/2026:** **Breaking change (1.2.0):** the minimum supported Python is now **3.10** (`requires-python = ">=3.10"`). 3.9 support was already effectively broken — the `waterdata` module's dependencies (`anyio`, the test stack) require 3.10+, and the `waterdata` test modules already skipped on <3.10. `anyio` is now declared as a direct dependency (it is imported directly by `waterdata`), and the CI/ruff/mypy targets move to 3.10. Also fully removed the deprecated `variable_info` metadata property: the `NWIS_Metadata` override only warned and returned `None` (it relied on the defunct `get_pmcodes`), and the `BaseMetadata` abstract is gone too since nothing implemented it — accessing `.variable_info` now raises `AttributeError`. `site_info` is unaffected.
 
 **06/23/2026:** **Breaking change (1.2.0):** removed the `nadp` module and the deprecated `samples` module ahead of the 1.2.0 release. `nadp` was deprecated on 05/01/2026 — NADP is not a USGS data source, so retrieve NADP data directly from https://nadp.slh.wisc.edu/. The `samples.get_usgs_samples` shim (a deprecated forward to the modern getter) is gone; use `waterdata.get_samples()` instead. `import dataretrieval.nadp` / `import dataretrieval.samples` now raise `ModuleNotFoundError`.