From 162568250c4b807baac1574902dee08747788f12 Mon Sep 17 00:00:00 2001 From: elbeejay Date: Tue, 13 Dec 2022 17:14:17 -0500 Subject: [PATCH 1/6] failing tz testcase --- dataretrieval/nwis.py | 8 ++++++-- tests/nwis_test.py | 9 ++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index c385e28b..76c167b5 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -50,6 +50,9 @@ def format_response(df, service=None, **kwargs): df = df.tz_localize('UTC', level=1) else: + # transform datetime column to datetime values + #df['datetime'] = pd.to_datetime(df.pop('datetime'), errors='coerce') + # define index using datetime values df.set_index(['datetime'], inplace=True) if hasattr(df.index, 'tzinfo') and df.index.tzinfo is None: df = df.tz_localize('UTC') @@ -770,12 +773,13 @@ def _read_json(json): # should be able to avoid this by dumping record_json = str(record_json).replace("'", '"') - # read json, converting all values to float64 and all qaulifiers + # read json, converting all values to float64 and all qualifiers # Lists can't be hashed, thus we cannot df.merge on a list column record_df = pd.read_json(record_json, orient='records', dtype={'value': 'float64', - 'qualifiers': 'unicode'}) + 'qualifiers': 'unicode'}, + convert_dates=True) record_df['qualifiers'] = (record_df['qualifiers'] .str.strip("[]").str.replace("'", "")) diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 44a40a56..0b6a68f2 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -2,6 +2,7 @@ import pandas as pd import pytest from dataretrieval.nwis import get_record, preformat_peaks_response +from dataretrieval.nwis import what_sites, get_iv START_DATE = '2018-01-24' END_DATE = '2018-01-25' @@ -36,7 +37,6 @@ def test_iv_service(): def test_iv_service_answer(): df = test_iv_service() - # check multiindex function assert df.index.names == [SITENO_COL, DATETIME_COL], "iv service returned incorrect index: 
{}".format(df.index.names) @@ -122,3 +122,10 @@ def test_inc_date_03(): assert df.shape == df2.shape # assert that the datetime index is not there assert df2.index.name != 'datetime' + + +def test_multiple_tz_01(): + """Test based on GitHub Issue #60 - error merging different time zones.""" + sites, sites_md = what_sites(stateCd='MD') + iv, iv_md = get_iv(sites=sites.site_no.values[:25].tolist()) + import pdb; pdb.set_trace() \ No newline at end of file From 4b7b0e8538b5cd134768f9ff27d981a6e5967849 Mon Sep 17 00:00:00 2001 From: elbeejay Date: Fri, 16 Dec 2022 08:36:42 -0500 Subject: [PATCH 2/6] pair of tests, one fails, one passes before fix --- tests/nwis_test.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 0b6a68f2..8800a109 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -1,8 +1,9 @@ import numpy as np import pandas as pd import pytest +import datetime from dataretrieval.nwis import get_record, preformat_peaks_response -from dataretrieval.nwis import what_sites, get_iv +from dataretrieval.nwis import what_sites, get_iv, get_dv, get_discharge_peaks START_DATE = '2018-01-24' END_DATE = '2018-01-25' @@ -124,8 +125,24 @@ def test_inc_date_03(): assert df2.index.name != 'datetime' -def test_multiple_tz_01(): - """Test based on GitHub Issue #60 - error merging different time zones.""" - sites, sites_md = what_sites(stateCd='MD') - iv, iv_md = get_iv(sites=sites.site_no.values[:25].tolist()) - import pdb; pdb.set_trace() \ No newline at end of file +class TestTZ: + """Tests relating to GitHub Issue #60.""" + sites, _ = what_sites(stateCd='MD') + + def test_multiple_tz_01(self): + """Test based on GitHub Issue #60 - error merging different time zones.""" + # this test fails before issue #60 is fixed + iv, _ = get_iv(sites=self.sites.site_no.values[:25].tolist()) + # assert that the datetime column exists + assert 'datetime' in iv.index.names + # assert that it 
is a datetime type + assert isinstance(iv.index[0][1], datetime.datetime) + + def test_multiple_tz_02(self): + """Test based on GitHub Issue #60 - confirm behavior for same tz.""" + # this test passes before issue #60 is fixed + iv, _ = get_iv(sites=self.sites.site_no.values[:20].tolist()) + # assert that the datetime column exists + assert 'datetime' in iv.index.names + # assert that it is a datetime type + assert isinstance(iv.index[0][1], datetime.datetime) \ No newline at end of file From f4bcbc567b2961fc573c38bb625cddada21be11f Mon Sep 17 00:00:00 2001 From: elbeejay Date: Fri, 16 Dec 2022 08:37:17 -0500 Subject: [PATCH 3/6] fix for multiple tz by converting tz at end --- dataretrieval/nwis.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 76c167b5..36683ed6 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -779,7 +779,7 @@ def _read_json(json): orient='records', dtype={'value': 'float64', 'qualifiers': 'unicode'}, - convert_dates=True) + convert_dates=False) record_df['qualifiers'] = (record_df['qualifiers'] .str.strip("[]").str.replace("'", "")) @@ -797,6 +797,9 @@ def _read_json(json): merged_df = update_merge(merged_df, record_df, na_only=True, on=['site_no', 'datetime']) + # convert to datetime + merged_df['datetime'] = pd.to_datetime(merged_df['datetime']) + return merged_df From 1908d0a12f5b00bb123f28d17ea231a18dbedefc Mon Sep 17 00:00:00 2001 From: elbeejay Date: Fri, 16 Dec 2022 11:41:41 -0500 Subject: [PATCH 4/6] normalize tz to utc to match single-site process --- dataretrieval/nwis.py | 9 +++------ tests/nwis_test.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 36683ed6..a0dc27b4 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -50,9 +50,6 @@ def format_response(df, service=None, **kwargs): df = df.tz_localize('UTC', level=1) else: - # transform datetime column to 
datetime values - #df['datetime'] = pd.to_datetime(df.pop('datetime'), errors='coerce') - # define index using datetime values df.set_index(['datetime'], inplace=True) if hasattr(df.index, 'tzinfo') and df.index.tzinfo is None: df = df.tz_localize('UTC') @@ -121,7 +118,7 @@ def _qwdata(datetime_index=True, **kwargs): 'rdb_qw_attributes': 'expanded', 'date_format': 'YYYY-MM-DD', 'rdb_compression': 'value', - 'submmitted_form': 'brief_list'} + 'submitted_form': 'brief_list'} # 'qw_sample_wide': 'separated_wide'} # check for parameter codes, and reformat query args @@ -797,8 +794,8 @@ def _read_json(json): merged_df = update_merge(merged_df, record_df, na_only=True, on=['site_no', 'datetime']) - # convert to datetime - merged_df['datetime'] = pd.to_datetime(merged_df['datetime']) + # convert to datetime, normalizing the timezone to UTC when doing so + merged_df['datetime'] = pd.to_datetime(merged_df['datetime'], utc=True) return merged_df diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 8800a109..6b6ba7e0 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -145,4 +145,4 @@ def test_multiple_tz_02(self): # assert that the datetime column exists assert 'datetime' in iv.index.names # assert that it is a datetime type - assert isinstance(iv.index[0][1], datetime.datetime) \ No newline at end of file + assert isinstance(iv.index[0][1], datetime.datetime) From 4bfc256ad05aadbfe94af6f3d4c49faf49c93171 Mon Sep 17 00:00:00 2001 From: elbeejay Date: Fri, 16 Dec 2022 12:57:04 -0500 Subject: [PATCH 5/6] documentation for time --- docs/source/examples/readme_examples.rst | 53 ++++++++++----- docs/source/index.rst | 5 +- docs/source/userguide/index.rst | 16 +++++ docs/source/userguide/timeconventions.rst | 80 +++++++++++++++++++++++ 4 files changed, 137 insertions(+), 17 deletions(-) create mode 100644 docs/source/userguide/index.rst create mode 100644 docs/source/userguide/timeconventions.rst diff --git a/docs/source/examples/readme_examples.rst 
b/docs/source/examples/readme_examples.rst index 3dc14609..1caa0264 100644 --- a/docs/source/examples/readme_examples.rst +++ b/docs/source/examples/readme_examples.rst @@ -9,18 +9,41 @@ Examples from the Readme file on retrieving NWIS data .. doctest:: - # first import the functions for downloading data from NWIS - import dataretrieval.nwis as nwis - - # specify the USGS site code for which we want data. - site = '03339000' - - - # get instantaneous values (iv) - df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01') - - # get water quality samples (qwdata) - df2 = nwis.get_record(sites=site, service='qwdata', start='2017-12-31', end='2018-01-01') - - # get basic info about the site - df3 = nwis.get_record(sites=site, service='site') \ No newline at end of file + >>> # first import the functions for downloading data from NWIS + >>> import dataretrieval.nwis as nwis + + >>> # specify the USGS site code for which we want data. + >>> site = '03339000' + + >>> # get instantaneous values (iv) + >>> df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01') + + >>> df.head() + 00010 00010_cd site_no 00060 00060_cd ... 63680_ysi), [discontinued 10/5/21_cd 63680_hach 63680_hach_cd 99133 99133_cd + datetime ... + 2017-12-31 06:00:00+00:00 1.0 A 03339000 140.0 A ... A 3.6 A 4.61 A + 2017-12-31 06:15:00+00:00 1.0 A 03339000 138.0 A ... A 3.6 A 4.61 A + 2017-12-31 06:30:00+00:00 1.0 A 03339000 139.0 A ... A 3.4 A 4.61 A + 2017-12-31 06:45:00+00:00 1.0 A 03339000 139.0 A ... A 3.4 A 4.61 A + 2017-12-31 07:00:00+00:00 1.0 A 03339000 139.0 A ... A 3.5 A 4.61 A + + [5 rows x 21 columns] + + >>> # get water quality samples (qwdata) + >>> df2 = nwis.get_record(sites=site, service='qwdata', start='2018-12-01', end='2019-01-01') + + >>> print(df2) + agency_cd site_no sample_dt sample_tm sample_end_dt sample_end_tm ... p80154 p82398 p84164 p91157 p91158 p91159 + datetime ... 
+ 2018-12-10 17:30:00+00:00 USGS 03339000 2018-12-10 11:30 NaN NaN ... 16 50 3060 0.0165 0.0141 0.0024 + + [1 rows x 33 columns] + + >>> # get basic info about the site + >>> df3 = nwis.get_record(sites=site, service='site') + + >>> print(df3) + agency_cd site_no station_nm site_tp_cd lat_va ... aqfr_type_cd well_depth_va hole_depth_va depth_src_cd project_no + 0 USGS 03339000 VERMILION RIVER NEAR DANVILLE, IL ST 400603 ... NaN NaN NaN NaN 100 + + [1 rows x 42 columns] \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 6c8bcc4a..54f1110b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,11 +2,11 @@ Welcome ======= Welcome to the documentation for the Python ``dataretrieval`` package. -``dataretreival`` is a Python alternative to the `USGS R dataRetrieval package`_ +``dataretrieval`` is a Python alternative to the `USGS R dataRetrieval package`_ and is used to obtain USGS and EPA water quality data, streamflow data, and metadata directly from webservices. -.. _USGS R dataRetrieval package: https://github.com/USGS-R/dataRetrieval +.. _USGS R dataRetrieval package: https://github.com/DOI-USGS/dataRetrieval Table of Contents @@ -16,6 +16,7 @@ Table of Contents :maxdepth: 1 meta/installing + userguide/index examples/index meta/contributing meta/license diff --git a/docs/source/userguide/index.rst b/docs/source/userguide/index.rst new file mode 100644 index 00000000..4d7ae895 --- /dev/null +++ b/docs/source/userguide/index.rst @@ -0,0 +1,16 @@ +.. userguide: + +========== +User Guide +========== + +Contents +-------- + +.. toctree:: + :maxdepth: 1 + + timeconventions + + +.. include:: timeconventions.rst diff --git a/docs/source/userguide/timeconventions.rst b/docs/source/userguide/timeconventions.rst new file mode 100644 index 00000000..60fab8ff --- /dev/null +++ b/docs/source/userguide/timeconventions.rst @@ -0,0 +1,80 @@ +.. 
timeconventions: + +Datetime Information +-------------------- + +``dataretrieval`` attempts to normalize time data to UTC time when converting +web service data into dataframes. To do this, built-in pandas functions are +used: either :obj:`pandas.to_datetime()` during the initial datetime object +conversion, or :obj:`pandas.DataFrame.tz_localize()` if the datetime objects +exist but are not UTC-localized. In most cases (single-site and multi-site), +``dataretrieval`` assigns the datetime information as the dataframe *index*. +The exception is when incomplete datetime information is available; in +these cases integers are used as the dataframe index (see `PR#58`_ for more +details). + +.. _PR#58: https://github.com/USGS-python/dataretrieval/pull/58 + + +Inspecting Timestamps +********************* + +For single sites, the index of the returned dataframe contains pandas +timestamps. + +.. code:: python + + >>> import dataretrieval.nwis as nwis + >>> site = '03339000' + >>> df = nwis.get_record(sites=site, service='peaks', + ... start='2015-01-01', end='2017-12-31') + >>> print(df) + agency_cd site_no peak_tm peak_va peak_cd gage_ht gage_ht_cd year_last_pk ag_dt ag_tm ag_gage_ht ag_gage_ht_cd + datetime + 2015-06-08 00:00:00+00:00 USGS 03339000 17:30 25100 C 22.83 NaN NaN NaN NaN NaN NaN + 2015-12-29 00:00:00+00:00 USGS 03339000 18:45 37600 C 26.66 NaN NaN NaN NaN NaN NaN + 2017-05-05 00:00:00+00:00 USGS 03339000 04:45 17000 C 18.47 NaN NaN NaN NaN NaN NaN + +Here the index of the dataframe ``df`` is a set of datetime objects. Each has +the format ``YYYY-MM-DD HH:MM:SS+HH:MM``. Because these timestamps are +localized to be in UTC, the expected offset (``+HH:MM``) is ``+00:00``. +These values can be converted to a local timezone of your choosing using +:obj:`pandas` functionality. + +.. 
code:: python + + >>> df.index = df.index.tz_convert(tz='America/New_York') + >>> print(df) + agency_cd site_no peak_tm peak_va peak_cd gage_ht gage_ht_cd year_last_pk ag_dt ag_tm ag_gage_ht ag_gage_ht_cd + datetime + 2015-06-07 20:00:00-04:00 USGS 03339000 17:30 25100 C 22.83 NaN NaN NaN NaN NaN NaN + 2015-12-28 19:00:00-05:00 USGS 03339000 18:45 37600 C 26.66 NaN NaN NaN NaN NaN NaN + 2017-05-04 20:00:00-04:00 USGS 03339000 04:45 17000 C 18.47 NaN NaN NaN NaN NaN NaN + +Above, the index was converted so that the timestamps are expressed in New York local time. +In the updated dataframe index, the resulting timestamps now have offsets of +``-04:00`` and ``-05:00`` as New York is either 4 or 5 hours behind UTC +depending on the time of year (due to daylight saving time). + +When information for multiple sites is requested, ``dataretrieval`` creates a +dataframe with a multi-index, with the first entry containing the site number, +and the second containing the datetime information. + +.. doctest:: + + >>> import dataretrieval.nwis as nwis + >>> sites = ['180049066381200', '290000095192602'] + >>> df = nwis.get_record(sites=sites, service='gwlevels', + ... start='2021-10-01', end='2022-01-01') + >>> df + agency_cd site_tp_cd lev_dt lev_tm lev_tz_cd ... lev_dt_acy_cd lev_acy_cd lev_src_cd lev_meth_cd lev_age_cd + site_no datetime ... + 180049066381200 2021-10-04 19:54:00+00:00 USGS GW 2021-10-04 19:54 +0000 ... m NaN S S A + 2021-11-16 14:28:00+00:00 USGS GW 2021-11-16 14:28 +0000 ... m NaN S S A + 2021-12-09 10:43:00+00:00 USGS GW 2021-12-09 10:43 +0000 ... m NaN S S A + 290000095192602 2021-12-08 19:07:00+00:00 USGS GW 2021-12-08 19:07 +0000 ... m NaN S S P + + [4 rows x 15 columns] + +Note that the default datetime index information returned is also UTC +localized, and therefore the offset values are ``+00:00``. \ No newline at end of file From d8a83f9dcdcd8f44cace708c1ea7882449d69a09 Mon Sep 17 00:00:00 2001 From: "J. 
Hariharan" Date: Mon, 19 Dec 2022 13:59:15 -0500 Subject: [PATCH 6/6] Update waterservices_test.py submmitted -> submitted typo fix --- tests/waterservices_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/waterservices_test.py b/tests/waterservices_test.py index 2ae04b0a..bb0ec105 100755 --- a/tests/waterservices_test.py +++ b/tests/waterservices_test.py @@ -116,7 +116,7 @@ def test_get_qwdata(requests_mock): request_url = 'https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no={}' \ '&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format={}&pm_cd_compare=Greater+than' \ '&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw_attributes=expanded' \ - '&date_format=YYYY-MM-DD&rdb_compression=value&submmitted_form=brief_list'.format(site, format) + '&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list'.format(site, format) response_file_path = 'data/waterdata_qwdata.txt' mock_request(requests_mock, request_url, response_file_path) with pytest.warns(DeprecationWarning):