diff --git a/dataretrieval/ogc/engine.py b/dataretrieval/ogc/engine.py index 2fe89440..b84f2dec 100644 --- a/dataretrieval/ogc/engine.py +++ b/dataretrieval/ogc/engine.py @@ -265,10 +265,14 @@ def _default_headers() -> dict[str, str]: return headers -def _check_ogc_requests(endpoint: str, req_type: str = "queryables") -> dict[str, Any]: +def _check_ogc_requests( + endpoint: str, req_type: str = "queryables" +) -> tuple[dict[str, Any], httpx.Response]: """ Sends an HTTP GET request to the specified OGC endpoint and request type, - returning the JSON response. + returning the parsed JSON body alongside the raw response (so a caller + that needs response-derived metadata, e.g. :class:`BaseMetadata`, doesn't + have to re-issue the request). Parameters ---------- @@ -282,6 +286,9 @@ def _check_ogc_requests(endpoint: str, req_type: str = "queryables") -> dict[str ------- dict The JSON response from the OGC endpoint. + httpx.Response + The raw response, for callers that need it (URL, elapsed time, + headers). Raises ------ @@ -299,7 +306,7 @@ def _check_ogc_requests(endpoint: str, req_type: str = "queryables") -> dict[str _raise_for_non_200(resp) # ``Response.json`` is typed ``Any``; the OGC queryables/schema endpoints # return a JSON object, and callers index it as a dict. - return cast("dict[str, Any]", resp.json()) + return cast("dict[str, Any]", resp.json()), resp def _ogc_query_params( diff --git a/dataretrieval/ogc/shaping.py b/dataretrieval/ogc/shaping.py index 4576eb80..b8e3c8e5 100644 --- a/dataretrieval/ogc/shaping.py +++ b/dataretrieval/ogc/shaping.py @@ -190,7 +190,7 @@ def _deal_with_empty( # call (it goes away once requests move to their own module). 
from dataretrieval.ogc.engine import _check_ogc_requests - schema = _check_ogc_requests(endpoint=service, req_type="schema") + schema, _ = _check_ogc_requests(endpoint=service, req_type="schema") properties = list(schema.get("properties", {}).keys()) return pd.DataFrame(columns=properties) return return_list diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 7d3fce45..99b6e178 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -25,6 +25,7 @@ get_latest_daily, get_monitoring_locations, get_peaks, + get_queryables, get_reference_table, get_samples, get_samples_summary, @@ -62,6 +63,7 @@ "get_monitoring_locations", "get_nearest_continuous", "get_peaks", + "get_queryables", "get_ratings", "get_reference_table", "get_samples", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index e79d08f7..815ecc03 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -38,6 +38,7 @@ SAMPLES_URL, _accept_legacy_kwargs, _as_str_list, + _check_ogc_requests, _check_profiles, _construct_cql_request, _default_headers, @@ -74,6 +75,7 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -206,6 +208,10 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. 
Returns ------- @@ -295,6 +301,7 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Continuous data provide instantaneous water conditions. @@ -421,6 +428,10 @@ def get_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -520,6 +531,7 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and @@ -738,6 +750,10 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -808,6 +824,7 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, @@ -975,6 +992,10 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
+ **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -1080,6 +1101,7 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1182,6 +1204,10 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -1277,6 +1303,7 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors @@ -1406,6 +1433,10 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -1478,6 +1509,7 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. 
@@ -1609,6 +1641,10 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -1682,6 +1718,7 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. Field measurements consist of measurements @@ -1804,6 +1841,10 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. Returns ------- @@ -1873,6 +1914,7 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + **queryables: Any, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -1926,6 +1968,10 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + **queryables : string or iterable of strings, optional + Any other queryable property of this collection, passed through as a + server-side filter. Call :func:`get_queryables` to see the queryables a + collection supports. 
def get_queryables(collection: str) -> tuple[pd.DataFrame, BaseMetadata]:
    """Return the filterable properties a Water Data API collection advertises.

    Each OGC collection publishes a "queryables" document: a JSON Schema whose
    ``properties`` object maps every filterable property name to a definition
    carrying (some of) ``title``, ``type`` and ``description``. This function
    fetches that document and tabulates it, so the filters a collection
    accepts can be discovered programmatically and watched for upstream
    changes.

    Parameters
    ----------
    collection : string
        The collection id, e.g. ``"daily"``, ``"continuous"``,
        ``"monitoring-locations"``, or ``"time-series-metadata"``.

    Returns
    -------
    df : ``pandas.DataFrame``
        One row per queryable, ordered by property name, with the columns
        ``queryable``, ``type``, ``title`` and ``description``. A missing
        ``type`` or ``title`` is left as ``None``; a missing ``description``
        becomes an empty string, and a present one is whitespace-stripped.
        The frame has these four columns even when the document lists no
        properties.
    md : :class:`dataretrieval.utils.BaseMetadata`
        Metadata built from the raw HTTP response of the queryables request.

    Raises
    ------
    DataRetrievalError
        Propagated from the request helper on an HTTP error response (e.g. a
        404 for an unknown ``collection``).

    Examples
    --------
    .. doctest::
        :skipif: True  # network

        >>> from dataretrieval import waterdata
        >>> df, md = waterdata.get_queryables("daily")
        >>> df.set_index("queryable").loc["state_name", "type"]
        'string'
    """
    schema, resp = _check_ogc_requests(endpoint=collection, req_type="queryables")
    definitions: dict[str, Any] = schema.get("properties", {})

    rows = []
    for name, definition in sorted(definitions.items()):
        # ``description`` may be absent or null; normalise to "" before
        # stripping the trailing newline the service tends to include.
        raw_description = definition.get("description") or ""
        rows.append(
            {
                "queryable": name,
                "type": definition.get("type"),
                "title": definition.get("title"),
                "description": raw_description.strip(),
            }
        )

    # Passing ``columns`` explicitly keeps the layout stable for an empty list.
    table = pd.DataFrame(
        rows, columns=["queryable", "type", "title", "description"]
    )
    return table, BaseMetadata(resp)
Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 65f9ea2f..53d5f955 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -157,6 +157,23 @@ } +def _flatten_queryables(local_vars: dict[str, Any]) -> dict[str, Any]: + """Merge a getter's ``**queryables`` passthrough kwargs -- collected by + ``locals()`` under the ``queryables`` key -- up into ``local_vars`` as + top-level entries, so an extra server-side filter such as + ``state_name="Wisconsin"`` is normalized, mutual-exclusion-checked, and sent + exactly like a named param. See + :func:`dataretrieval.waterdata.get_queryables` for each collection's + filterable properties (the service rejects an unknown one with a 400). + + ``**queryables`` always arrives as a dict (empty when unused) and the key is + popped, so this is a no-op on getters without the passthrough and idempotent + if called twice. + """ + local_vars.update(local_vars.pop("queryables", {})) + return local_vars + + def _get_args( local_vars: dict[str, Any], exclude: set[str] | None = None ) -> dict[str, Any]: @@ -165,8 +182,10 @@ def _get_args( Supplies the Water Data API's extended ``no_normalize`` set (numeric params such as ``water_year``, ``thresholds``, ``boundingBox``) so they keep their element types. See :func:`engine._get_args` for the full - normalization contract. + normalization contract. Also flattens any ``**queryables`` passthrough + (see :func:`_flatten_queryables`). """ + _flatten_queryables(local_vars) return _engine_get_args(local_vars, exclude, no_normalize=_NO_NORMALIZE_PARAMS) @@ -182,6 +201,12 @@ def _with_state(local_vars: dict[str, Any], *, to: str, into: str) -> dict[str, raw values (e.g. non-US FIPS); passing ``state`` together with either raises ``ValueError``. """ + # Flatten ``**queryables`` first so a native state param arriving that way + # (e.g. 
``get_time_series_metadata``'s ``state_code``, which isn't an + # explicit parameter) is visible to apply_state's mutual-exclusion guard. + # Otherwise ``state`` plus a passthrough ``state_code`` would slip past the + # check and silently send both. + _flatten_queryables(local_vars) return apply_state( local_vars, to=to, into=into, reject=("state_code", "state_name") ) diff --git a/tests/data/waterdata_queryables.json b/tests/data/waterdata_queryables.json new file mode 100644 index 00000000..a9d9862c --- /dev/null +++ b/tests/data/waterdata_queryables.json @@ -0,0 +1,513 @@ +{ + "channel-measurements": [ + "channel_area", + "channel_area_unit", + "channel_evenness", + "channel_flow", + "channel_flow_unit", + "channel_location_direction", + "channel_location_distance", + "channel_location_distance_unit", + "channel_material", + "channel_measurement_type", + "channel_name", + "channel_stability", + "channel_velocity", + "channel_velocity_unit", + "channel_width", + "channel_width_unit", + "field_visit_id", + "geometry", + "horizontal_velocity_description", + "id", + "last_modified", + "longitudinal_velocity_description", + "measurement_number", + "measurement_type", + "monitoring_location_id", + "time", + "vertical_velocity_description" + ], + "combined-metadata": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "begin", + "computation_identifier", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "data_gap_interval", + "data_type", + "depth_source_code", + "district_code", + "drainage_area", + "end", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + 
"minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "parameter_description", + "parameter_name", + "parent_time_series_id", + "primary", + "reading_type", + "site_type", + "site_type_code", + "state_code", + "state_name", + "statistic_id", + "sublocation_identifier", + "thresholds", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "vertical_datum", + "vertical_datum_name", + "web_description", + "well_constructed_depth" + ], + "continuous": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "approval_status", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "data_gap_interval", + "depth_source_code", + "district_code", + "drainage_area", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + "minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "qualifier", + "site_type", + "site_type_code", + "state_code", + "state_name", + "statistic_id", + "time", + "time_series_id", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "value", + "vertical_datum", + "vertical_datum_name", + "well_constructed_depth" + ], + "daily": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + 
"approval_status", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "data_gap_interval", + "depth_source_code", + "district_code", + "drainage_area", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + "minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "qualifier", + "site_type", + "site_type_code", + "state_code", + "state_name", + "statistic_id", + "time", + "time_series_id", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "value", + "vertical_datum", + "vertical_datum_name", + "well_constructed_depth" + ], + "field-measurements": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "approval_status", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "control_condition", + "country_code", + "country_name", + "county_code", + "county_name", + "day", + "depth_source_code", + "district_code", + "drainage_area", + "field_measurements_series_id", + "field_visit_id", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + "measurement_rated", + "measuring_agency", + "minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "month", + 
"national_aquifer_code", + "observing_procedure", + "observing_procedure_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "qualifier", + "reading_type", + "site_type", + "site_type_code", + "state_code", + "state_name", + "time", + "time_of_day", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "value", + "vertical_datum", + "vertical_datum_name", + "vertical_datum_site", + "well_constructed_depth", + "year" + ], + "field-measurements-metadata": [ + "begin", + "end", + "geometry", + "id", + "last_modified", + "monitoring_location_id", + "parameter_code", + "parameter_description", + "parameter_name", + "reading_type" + ], + "latest-continuous": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "approval_status", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "data_gap_interval", + "depth_source_code", + "district_code", + "drainage_area", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + "minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "qualifier", + "site_type", + "site_type_code", + "state_code", + "state_name", + "statistic_id", + "time", + "time_series_id", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "value", + "vertical_datum", + "vertical_datum_name", + "well_constructed_depth" + ], + "latest-daily": [ + "agency_code", + "agency_name", + "altitude", + 
"altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "approval_status", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "data_gap_interval", + "depth_source_code", + "district_code", + "drainage_area", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + "minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "qualifier", + "site_type", + "site_type_code", + "state_code", + "state_name", + "statistic_id", + "time", + "time_series_id", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "value", + "vertical_datum", + "vertical_datum_name", + "well_constructed_depth" + ], + "monitoring-locations": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "depth_source_code", + "district_code", + "drainage_area", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "minor_civil_division_code", + "monitoring_location_name", + "monitoring_location_number", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "revision_created", + "revision_modified", + 
"revision_note", + "site_type", + "site_type_code", + "state_code", + "state_name", + "time_zone_abbreviation", + "uses_daylight_savings", + "vertical_datum", + "vertical_datum_name", + "well_constructed_depth" + ], + "peaks": [ + "agency_code", + "agency_name", + "altitude", + "altitude_accuracy", + "altitude_method_code", + "altitude_method_name", + "aquifer_code", + "aquifer_type_code", + "basin_code", + "construction_date", + "contributing_drainage_area", + "country_code", + "country_name", + "county_code", + "county_name", + "data_gap_interval", + "day", + "depth_source_code", + "district_code", + "drainage_area", + "geometry", + "hole_constructed_depth", + "horizontal_position_method_code", + "horizontal_position_method_name", + "horizontal_positional_accuracy", + "horizontal_positional_accuracy_code", + "hydrologic_unit_code", + "id", + "last_modified", + "minor_civil_division_code", + "monitoring_location_id", + "monitoring_location_name", + "monitoring_location_number", + "month", + "national_aquifer_code", + "original_horizontal_datum", + "original_horizontal_datum_name", + "parameter_code", + "peak_since", + "qualifier", + "site_type", + "site_type_code", + "state_code", + "state_name", + "time", + "time_of_day", + "time_series_id", + "time_zone_abbreviation", + "unit_of_measure", + "uses_daylight_savings", + "value", + "vertical_datum", + "vertical_datum_name", + "water_year", + "well_constructed_depth", + "year" + ], + "time-series-metadata": [ + "begin", + "begin_utc", + "computation_identifier", + "computation_period_identifier", + "data_gap_interval", + "end", + "end_utc", + "geometry", + "hydrologic_unit_code", + "id", + "last_modified", + "monitoring_location_id", + "parameter_code", + "parameter_description", + "parameter_name", + "parent_time_series_id", + "primary", + "state_name", + "statistic_id", + "sublocation_identifier", + "thresholds", + "unit_of_measure", + "web_description" + ] +} diff --git a/tests/waterdata_queryables_test.py 
b/tests/waterdata_queryables_test.py new file mode 100644 index 00000000..3d26d08d --- /dev/null +++ b/tests/waterdata_queryables_test.py @@ -0,0 +1,180 @@ +"""Tests for :func:`dataretrieval.waterdata.get_queryables`, plus a live monitor +that flags upstream changes to the Water Data API's queryable sets. + +The live monitor (:func:`test_queryables_match_snapshot`) compares the +queryables each collection advertises against a committed snapshot +(``tests/data/waterdata_queryables.json``). When it fails, the upstream API has +added / removed / renamed a queryable: regenerate the snapshot and enable any +new queryables on the matching getter. Regenerate with:: + + python - <<'PY' + import httpx, json + from typing import get_args + from dataretrieval.waterdata.types import WATERDATA_SERVICES + base = "https://api.waterdata.usgs.gov/ogcapi/v0" + snap = {} + for c in get_args(WATERDATA_SERVICES): + r = httpx.get(f"{base}/collections/{c}/queryables", timeout=30) + r.raise_for_status() + snap[c] = sorted(r.json().get("properties", {})) + json.dump(snap, open("tests/data/waterdata_queryables.json", "w"), + indent=2, sort_keys=True) + open("tests/data/waterdata_queryables.json", "a").write("\\n") + PY +""" + +import json +import re +from pathlib import Path +from urllib.parse import parse_qs, urlsplit + +import pytest + +import dataretrieval +from dataretrieval import waterdata +from dataretrieval.utils import BaseMetadata +from tests.conftest import flaky_api + +# The OGC queryables endpoint for any Water Data collection. +QUERYABLES_RE = re.compile( + r"^https://api\.waterdata\.usgs\.gov/ogcapi/v0/collections/[^/]+/queryables$" +) + +# A minimal queryables document (the JSON Schema shape the real endpoint returns). 
+_FAKE_QUERYABLES = { + "type": "object", + "title": "Daily", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "properties": { + "state_name": { + "title": "State name", + "type": "string", + "description": "The name of the state.\n", + }, + "parameter_code": { + "title": "Parameter code", + "type": "string", + "description": "5-digit codes.\n", + }, + }, +} + +_SNAPSHOT_PATH = Path(__file__).parent / "data" / "waterdata_queryables.json" +_SNAPSHOT = json.loads(_SNAPSHOT_PATH.read_text()) + + +# --- get_queryables unit tests (mocked) ------------------------------------ + + +def test_get_queryables_parses_properties(httpx_mock): + """Properties become one tidy row each, sorted by name, with the + description whitespace-stripped; returns ``(DataFrame, BaseMetadata)``.""" + httpx_mock.add_response(method="GET", url=QUERYABLES_RE, json=_FAKE_QUERYABLES) + + df, md = waterdata.get_queryables("daily") + + assert isinstance(md, BaseMetadata) + assert list(df.columns) == ["queryable", "type", "title", "description"] + # Sorted by name (parameter_code before state_name). + assert df["queryable"].tolist() == ["parameter_code", "state_name"] + row = df.set_index("queryable").loc["state_name"] + assert row["type"] == "string" + assert row["title"] == "State name" + assert row["description"] == "The name of the state." # trailing \n stripped + + +def test_get_queryables_unknown_collection_raises(httpx_mock): + """An HTTP error (e.g. 
a 404 for an unknown collection) is surfaced as the + typed ``DataRetrievalError``, not a bare DataFrame.""" + httpx_mock.add_response( + method="GET", + url=QUERYABLES_RE, + status_code=404, + json={"code": "404", "description": "Collection not found"}, + ) + + with pytest.raises(dataretrieval.DataRetrievalError): + waterdata.get_queryables("not-a-collection") + + +# --- passthrough queryables (mocked) --------------------------------------- + +_DAILY_ITEMS_RE = re.compile( + r"^https://api\.waterdata\.usgs\.gov/ogcapi/v0/collections/daily/items" +) +_DAILY_SCHEMA_RE = re.compile( + r"^https://api\.waterdata\.usgs\.gov/ogcapi/v0/collections/daily/schema$" +) +_EMPTY_FEATURES = { + "type": "FeatureCollection", + "features": [], + "numberReturned": 0, + "numberMatched": 0, + "links": [], +} + + +def _mock_daily(httpx_mock): + """Mock the two endpoints a ``get_daily`` call touches: the items query and + the schema fetch (used for output typing).""" + httpx_mock.add_response(method="GET", url=_DAILY_SCHEMA_RE, json={"properties": {}}) + httpx_mock.add_response(method="GET", url=_DAILY_ITEMS_RE, json=_EMPTY_FEATURES) + + +def _items_query(httpx_mock): + """Parsed query string of the ``/items`` request the getter sent.""" + req = next(r for r in httpx_mock.get_requests() if "/items" in str(r.url)) + return parse_qs(urlsplit(str(req.url)).query) + + +def test_passthrough_queryables_sent_as_filters(httpx_mock): + """An OGC getter forwards queryables that aren't in its explicit signature + (e.g. 
``state_name``, ``site_type_code``) to the service as query filters, + alongside the named params.""" + _mock_daily(httpx_mock) + + waterdata.get_daily( + monitoring_location_id="USGS-05427718", + state_name="Wisconsin", + site_type_code="ST", + ) + + qs = _items_query(httpx_mock) + assert qs["state_name"] == ["Wisconsin"] + assert qs["site_type_code"] == ["ST"] + assert qs["monitoring_location_id"] == ["USGS-05427718"] + + +def test_passthrough_list_queryable_is_comma_joined(httpx_mock): + """A list-valued passthrough queryable is normalized and comma-joined like a + named multi-value param.""" + _mock_daily(httpx_mock) + + waterdata.get_daily( + monitoring_location_id="USGS-05427718", + site_type_code=["ST", "LK"], + ) + + assert _items_query(httpx_mock)["site_type_code"] == ["ST,LK"] + + +# --- live queryables monitor ----------------------------------------------- + + +@flaky_api +@pytest.mark.parametrize("collection", sorted(_SNAPSHOT)) +def test_queryables_match_snapshot(collection): + """Each collection's live queryables match the committed snapshot. + + A failure means the upstream API changed a collection's queryables. + Regenerate ``tests/data/waterdata_queryables.json`` (see this module's + docstring) and enable any newly added queryables on the matching getter. + """ + df, _ = waterdata.get_queryables(collection) + live = set(df["queryable"]) + expected = set(_SNAPSHOT[collection]) + assert live == expected, ( + f"{collection} queryables changed upstream: " + f"added={sorted(live - expected)}, removed={sorted(expected - live)}. " + f"Regenerate {_SNAPSHOT_PATH.name} and enable any new queryables." 
+ ) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 4d568d1f..735ea5ce 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -984,6 +984,19 @@ def test_with_state_conflict_raises(): ) +def test_with_state_conflict_via_queryables_raises(): + """A native state param arriving through ``**queryables`` (i.e. not an + explicit getter parameter, as with ``get_time_series_metadata``'s + ``state_code``) is flattened before the mutual-exclusion check, so combining + it with ``state`` still raises rather than silently sending both filters.""" + with pytest.raises(ValueError, match="not both"): + _utils_module._with_state( + {"state": "WI", "queryables": {"state_code": "55"}}, + to="name", + into="state_name", + ) + + def test_ogc_getter_resolves_state_at_getter_layer(monkeypatch): """The OGC getters resolve the unified ``state`` into ``state_name`` themselves (any encoding), so the shared ``get_ogc_data`` wrapper stays