diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 815ecc03..54785691 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -75,6 +75,7 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the @@ -201,6 +202,9 @@ def get_daily( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -208,6 +212,11 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -279,9 +288,9 @@ def get_daily( service = "daily" # Build argument dictionary, omitting None values - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_continuous( @@ -301,6 +310,7 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """ @@ -421,6 +431,9 @@ def get_continuous( allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -428,6 +441,11 @@ def get_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -477,9 +495,9 @@ def get_continuous( service = "continuous" # Build argument dictionary, omitting None values - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_monitoring_locations( @@ -531,6 +549,7 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location @@ -738,6 +757,9 @@ def get_monitoring_locations( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. skip_geometry : boolean, optional This option can be used to skip response geometries for each feature. The returning object will be a data frame with no spatial information. @@ -750,6 +772,11 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -790,9 +817,11 @@ def get_monitoring_locations( # Build argument dictionary, omitting None values (resolving the unified # `state` argument into the OGC `state_name` queryable). - args = _get_args(_with_state(locals(), to="name", into="state_name")) + args = _get_args( + _with_state(locals(), to="name", into="state_name"), exclude={"max_rows"} + ) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_time_series_metadata( @@ -824,6 +853,7 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, @@ -985,6 +1015,9 @@ def get_time_series_metadata( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -992,6 +1025,11 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -1032,9 +1070,11 @@ def get_time_series_metadata( # Build argument dictionary, omitting None values (resolving the unified # `state` argument into the OGC `state_name` queryable). - args = _get_args(_with_state(locals(), to="name", into="state_name")) + args = _get_args( + _with_state(locals(), to="name", into="state_name"), exclude={"max_rows"} + ) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_combined_metadata( @@ -1101,6 +1141,7 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1196,7 +1237,10 @@ def get_combined_metadata( (longitude/latitude, west-south-east-north). limit : int, optional Page size; the maximum allowable value is 50000. Default - (``None``) requests the maximum allowable limit. + (``None``) requests the maximum allowable limit. This is a + per-page size, not a cap on the total result: a query matching more + rows than ``limit`` still returns every matching row across + multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -1204,6 +1248,11 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -1279,9 +1328,11 @@ def get_combined_metadata( service = "combined-metadata" # Resolve the unified `state` argument into the OGC `state_name` queryable. - args = _get_args(_with_state(locals(), to="name", into="state_name")) + args = _get_args( + _with_state(locals(), to="name", into="state_name"), exclude={"max_rows"} + ) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_latest_continuous( @@ -1303,6 +1354,7 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series @@ -1426,6 +1478,9 @@ def get_latest_continuous( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -1433,6 +1488,11 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -1485,9 +1545,9 @@ def get_latest_continuous( service = "latest-continuous" # Build argument dictionary, omitting None values - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_latest_daily( @@ -1509,6 +1569,7 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the @@ -1634,6 +1695,9 @@ def get_latest_daily( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -1641,6 +1705,11 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -1692,9 +1761,9 @@ def get_latest_daily( service = "latest-daily" # Build argument dictionary, omitting None values - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_field_measurements( @@ -1718,6 +1787,7 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a @@ -1834,6 +1904,9 @@ def get_field_measurements( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -1841,6 +1914,11 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -1894,9 +1972,9 @@ def get_field_measurements( service = "field-measurements" # Build argument dictionary, omitting None values - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_field_measurements_metadata( @@ -1914,6 +1992,7 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -1960,7 +2039,10 @@ def get_field_measurements_metadata( (longitude / latitude, west-south-east-north). limit : int, optional Page size; the maximum allowable value is 50000. Default - (``None``) requests the maximum allowable limit. + (``None``) requests the maximum allowable limit. This is a + per-page size, not a cap on the total result: a query matching more + rows than ``limit`` still returns every matching row across + multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -1968,6 +2050,11 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -2020,9 +2107,9 @@ def get_field_measurements_metadata( """ service = "field-measurements-metadata" - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_peaks( @@ -2044,6 +2131,7 @@ def get_peaks( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get the annual peak streamflow / stage record for a monitoring location. @@ -2095,7 +2183,10 @@ def get_peaks( (longitude / latitude, west-south-east-north). limit : int, optional Page size; the maximum allowable value is 50000. Default - (``None``) requests the maximum allowable limit. + (``None``) requests the maximum allowable limit. This is a + per-page size, not a cap on the total result: a query matching more + rows than ``limit`` still returns every matching row across + multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -2103,6 +2194,11 @@ def get_peaks( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -2151,9 +2247,9 @@ def get_peaks( """ service = "peaks" - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_reference_table( @@ -3029,6 +3125,7 @@ def get_channel( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + max_rows: int | None = None, **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """ @@ -3152,6 +3249,9 @@ def get_channel( allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (None) will set the limit to the maximum allowable limit for the service. + This is a per-page size, not a cap on the total result: a query + matching more rows than ``limit`` still returns every matching row + across multiple pages. Use ``max_rows`` to cap the total instead. filter, filter_lang : optional Server-side CQL filter passed through as the OGC ``filter`` / ``filter-lang`` query parameters. See @@ -3159,6 +3259,11 @@ def get_channel( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + max_rows : int, optional + Cap the total number of rows returned, stopping pagination early + instead of downloading the whole result. Unlike ``limit`` (the + per-page size), this bounds the total result across every page. + The default (None) follows pagination to completion. **queryables : string or iterable of strings, optional Any other queryable property of this collection, passed through as a server-side filter. Call :func:`get_queryables` to see the queryables a @@ -3190,9 +3295,9 @@ def get_channel( """ service = "channel-measurements" - args = _get_args(locals()) + args = _get_args(locals(), exclude={"max_rows"}) - return get_ogc_data(args, service) + return get_ogc_data(args, service, max_rows=max_rows) def get_cql( diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index a183e74c..907232d0 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -556,6 +556,27 @@ def test_get_daily(): assert df["value"].dtype == "float64" +def test_get_daily_max_rows_is_excluded_from_request_and_forwarded(): + # ``max_rows`` is a client-side pagination cap, not an OGC query + # parameter — the server never sees it. So a getter must keep it out of + # the request ``args`` (which become query params) and instead forward it + # to ``get_ogc_data`` as the keyword that drives the cap. This pins that + # wiring; the cap mechanism itself (stop following ``next`` once the cap is + # met, then truncate the combined frame to exactly N) is covered without a + # network round-trip by the ``_row_cap`` / ``_finalize_ogc`` tests in + # tests/waterdata_utils_test.py. + with mock.patch("dataretrieval.waterdata.api.get_ogc_data") as fake: + fake.return_value = (pd.DataFrame(), mock.MagicMock(spec=[])) + get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + max_rows=3, + ) + args_dict = fake.call_args[0][0] + assert "max_rows" not in args_dict # not leaked into the query params + assert fake.call_args.kwargs["max_rows"] == 3 # forwarded to the cap + + def test_get_daily_properties(): df, _ = get_daily( monitoring_location_id="USGS-05427718",