DOI-USGS · thodson-usgs · Dec 20, 2022 · Dec 13, 2022 · Dec 16, 2022 · Dec 16, 2022
diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py
@@ -118,7 +118,7 @@ def _qwdata(datetime_index=True, **kwargs):
                'rdb_qw_attributes': 'expanded',
                'date_format': 'YYYY-MM-DD',
                'rdb_compression': 'value',
-               'submmitted_form': 'brief_list'}
+               'submitted_form': 'brief_list'}
     # 'qw_sample_wide': 'separated_wide'}
 
     # check for parameter codes, and reformat query args
@@ -770,12 +770,13 @@ def _read_json(json):
             # should be able to avoid this by dumping
             record_json = str(record_json).replace("'", '"')
 
-            # read json, converting all values to float64 and all qaulifiers
+            # read json, converting all values to float64 and all qualifiers
             # Lists can't be hashed, thus we cannot df.merge on a list column
             record_df = pd.read_json(record_json,
                                      orient='records',
                                      dtype={'value': 'float64',
-                                            'qualifiers': 'unicode'})
+                                            'qualifiers': 'unicode'},
+                                     convert_dates=False)
 
             record_df['qualifiers'] = (record_df['qualifiers']
                                        .str.strip("[]").str.replace("'", ""))
@@ -793,6 +794,9 @@ def _read_json(json):
                 merged_df = update_merge(merged_df, record_df, na_only=True,
                                          on=['site_no', 'datetime'])
 
+    # convert to datetime, normalizing the timezone to UTC when doing so
+    merged_df['datetime'] = pd.to_datetime(merged_df['datetime'], utc=True)
+
     return merged_df
 
 

diff --git a/docs/source/examples/readme_examples.rst b/docs/source/examples/readme_examples.rst
@@ -9,18 +9,41 @@ Examples from the Readme file on retrieving NWIS data
 
 .. doctest::
 
-    # first import the functions for downloading data from NWIS
-    import dataretrieval.nwis as nwis
-
-    # specify the USGS site code for which we want data.
-    site = '03339000'
-
-
-    # get instantaneous values (iv)
-    df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01')
-
-    # get water quality samples (qwdata)
-    df2 = nwis.get_record(sites=site, service='qwdata', start='2017-12-31', end='2018-01-01')
-
-    # get basic info about the site
-    df3 = nwis.get_record(sites=site, service='site')
+    >>> # first import the functions for downloading data from NWIS
+    >>> import dataretrieval.nwis as nwis
+
+    >>> # specify the USGS site code for which we want data.
+    >>> site = '03339000'
+
+    >>> # get instantaneous values (iv)
+    >>> df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01')
+
+    >>> df.head()
+                               00010 00010_cd   site_no  00060 00060_cd  ...  63680_ysi), [discontinued 10/5/21_cd 63680_hach  63680_hach_cd 99133  99133_cd
+    datetime                                                             ...
+    2017-12-31 06:00:00+00:00    1.0        A  03339000  140.0        A  ...                                     A        3.6              A  4.61         A
+    2017-12-31 06:15:00+00:00    1.0        A  03339000  138.0        A  ...                                     A        3.6              A  4.61         A
+    2017-12-31 06:30:00+00:00    1.0        A  03339000  139.0        A  ...                                     A        3.4              A  4.61         A
+    2017-12-31 06:45:00+00:00    1.0        A  03339000  139.0        A  ...                                     A        3.4              A  4.61         A
+    2017-12-31 07:00:00+00:00    1.0        A  03339000  139.0        A  ...                                     A        3.5              A  4.61         A
+    <BLANKLINE>
+    [5 rows x 21 columns]
+
+    >>> # get water quality samples (qwdata)
+    >>> df2 = nwis.get_record(sites=site, service='qwdata', start='2018-12-01', end='2019-01-01')
+
+    >>> print(df2)
+                              agency_cd   site_no   sample_dt sample_tm  sample_end_dt  sample_end_tm  ... p80154 p82398 p84164  p91157  p91158  p91159
+    datetime                                                                                           ...
+    2018-12-10 17:30:00+00:00      USGS  03339000  2018-12-10     11:30            NaN            NaN  ...     16     50   3060  0.0165  0.0141  0.0024
+    <BLANKLINE>
+    [1 rows x 33 columns]
+
+    >>> # get basic info about the site
+    >>> df3 = nwis.get_record(sites=site, service='site')
+
+    >>> print(df3)
+      agency_cd   site_no                         station_nm site_tp_cd  lat_va  ...  aqfr_type_cd  well_depth_va  hole_depth_va depth_src_cd project_no
+    0      USGS  03339000  VERMILION RIVER NEAR DANVILLE, IL         ST  400603  ...           NaN            NaN            NaN          NaN        100
+    <BLANKLINE>
+    [1 rows x 42 columns]
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -2,11 +2,11 @@ Welcome
 =======
 
 Welcome to the documentation for the Python ``dataretrieval`` package.
-``dataretreival`` is a Python alternative to the `USGS R dataRetrieval package`_
+``dataretrieval`` is a Python alternative to the `USGS R dataRetrieval package`_
 and is used to obtain USGS and EPA water quality data, streamflow data, and
 metadata directly from webservices.
 
-.. _USGS R dataRetrieval package: https://github.com/USGS-R/dataRetrieval
+.. _USGS R dataRetrieval package: https://github.com/DOI-USGS/dataRetrieval
 
 
 Table of Contents
@@ -16,6 +16,7 @@ Table of Contents
    :maxdepth: 1
 
    meta/installing
+   userguide/index
    examples/index
    meta/contributing
    meta/license

diff --git a/docs/source/userguide/index.rst b/docs/source/userguide/index.rst
@@ -0,0 +1,16 @@
+.. _userguide:
+
+==========
+User Guide
+==========
+
+Contents
+--------
+
+.. toctree::
+    :maxdepth: 1
+
+    timeconventions
+
+
+.. include:: timeconventions.rst
diff --git a/docs/source/userguide/timeconventions.rst b/docs/source/userguide/timeconventions.rst
@@ -0,0 +1,80 @@
+.. timeconventions:
+
+Datetime Information
+--------------------
+
+``dataretrieval`` attempts to normalize time data to UTC time when converting
+web service data into dataframes. To do this, built-in pandas functions are
+used: either :obj:`pandas.to_datetime()` during the initial datetime object
+conversion, or :obj:`pandas.DataFrame.tz_localize()` if the datetime objects
+exist but are not UTC-localized. In most cases (single-site and multi-site),
+``dataretrieval`` assigns the datetime information as the dataframe *index*.
+The exception is when incomplete datetime information is available; in
+these cases integers are used as the dataframe index (see `PR#58`_ for more
+details).
+
+.. _PR#58: https://github.com/USGS-python/dataretrieval/pull/58
+
+
+Inspecting Timestamps
+*********************
+
+For single sites, the index of the returned dataframe contains pandas
+timestamps.
+
+.. code:: python
+
+    >>> import dataretrieval.nwis as nwis
+    >>> site = '03339000'
+    >>> df = nwis.get_record(sites=site, service='peaks',
+    ...                      start='2015-01-01', end='2017-12-31')
+    >>> print(df)
+                              agency_cd   site_no peak_tm  peak_va peak_cd  gage_ht  gage_ht_cd  year_last_pk  ag_dt  ag_tm  ag_gage_ht  ag_gage_ht_cd
+    datetime
+    2015-06-08 00:00:00+00:00      USGS  03339000   17:30    25100       C    22.83         NaN           NaN    NaN    NaN         NaN            NaN
+    2015-12-29 00:00:00+00:00      USGS  03339000   18:45    37600       C    26.66         NaN           NaN    NaN    NaN         NaN            NaN
+    2017-05-05 00:00:00+00:00      USGS  03339000   04:45    17000       C    18.47         NaN           NaN    NaN    NaN         NaN            NaN
+
+Here the index of the dataframe ``df`` is a set of datetime objects. Each has
+the format ``YYYY-MM-DD HH:MM:SS+HH:MM``. Because these timestamps are
+localized to be in UTC, the expected offset (``+HH:MM``) is ``+00:00``.
+These values can be converted to a local timezone of your choosing using
+:obj:`pandas` functionality.
+
+.. code:: python
+
+    >>> df.index = df.index.tz_convert(tz='America/New_York')
+    >>> print(df)
+                              agency_cd   site_no peak_tm  peak_va peak_cd  gage_ht  gage_ht_cd  year_last_pk  ag_dt  ag_tm  ag_gage_ht  ag_gage_ht_cd
+    datetime
+    2015-06-07 20:00:00-04:00      USGS  03339000   17:30    25100       C    22.83         NaN           NaN    NaN    NaN         NaN            NaN
+    2015-12-28 19:00:00-05:00      USGS  03339000   18:45    37600       C    26.66         NaN           NaN    NaN    NaN         NaN            NaN
+    2017-05-04 20:00:00-04:00      USGS  03339000   04:45    17000       C    18.47         NaN           NaN    NaN    NaN         NaN            NaN
+
+Above, the index was converted to the ``America/New_York`` time zone.
+In the updated dataframe index, the resulting timestamps now have offsets of
+``-04:00`` and ``-05:00``, as New York is either 4 or 5 hours behind UTC
+depending on the time of year (due to daylight saving time).
+
+When information for multiple sites is requested, ``dataretrieval`` creates a
+dataframe with a multi-index, with the first entry containing the site number,
+and the second containing the datetime information.
+
+.. doctest::
+
+    >>> import dataretrieval.nwis as nwis
+    >>> sites = ['180049066381200', '290000095192602']
+    >>> df = nwis.get_record(sites=sites, service='gwlevels',
+    ...                      start='2021-10-01', end='2022-01-01')
+    >>> df
+                                              agency_cd site_tp_cd      lev_dt lev_tm lev_tz_cd  ...  lev_dt_acy_cd  lev_acy_cd  lev_src_cd  lev_meth_cd lev_age_cd
+    site_no         datetime                                                                     ...
+    180049066381200 2021-10-04 19:54:00+00:00      USGS         GW  2021-10-04  19:54     +0000  ...              m         NaN           S            S          A
+                    2021-11-16 14:28:00+00:00      USGS         GW  2021-11-16  14:28     +0000  ...              m         NaN           S            S          A
+                    2021-12-09 10:43:00+00:00      USGS         GW  2021-12-09  10:43     +0000  ...              m         NaN           S            S          A
+    290000095192602 2021-12-08 19:07:00+00:00      USGS         GW  2021-12-08  19:07     +0000  ...              m         NaN           S            S          P
+    <BLANKLINE>
+    [4 rows x 15 columns]
+
+Here note that the default datetime index information returned is also UTC
+localized, and therefore the offset values are ``+00:00``.
diff --git a/tests/nwis_test.py b/tests/nwis_test.py
@@ -1,7 +1,9 @@
 import numpy as np
 import pandas as pd
 import pytest
+import datetime
 from dataretrieval.nwis import get_record, preformat_peaks_response
+from dataretrieval.nwis import what_sites, get_iv, get_dv, get_discharge_peaks
 
 START_DATE = '2018-01-24'
 END_DATE   = '2018-01-25'
@@ -36,7 +38,6 @@ def test_iv_service():
 
 def test_iv_service_answer():
     df = test_iv_service()
-
     # check multiindex function
     assert df.index.names == [SITENO_COL, DATETIME_COL], "iv service returned incorrect index: {}".format(df.index.names)
 
@@ -122,3 +123,26 @@ def test_inc_date_03():
     assert df.shape == df2.shape
     # assert that the datetime index is not there
     assert df2.index.name != 'datetime'
+
+
+class TestTZ:
+    """Tests relating to GitHub Issue #60."""
+    sites, _ = what_sites(stateCd='MD')  # evaluated once, at class definition
+
+    def test_multiple_tz_01(self):
+        """Test based on GitHub Issue #60 - error merging different time zones."""
+        # uses the first 25 sites; this test fails before issue #60 is fixed
+        iv, _ = get_iv(sites=self.sites.site_no.values[:25].tolist())
+        # assert that 'datetime' is one of the index level names
+        assert 'datetime' in iv.index.names
+        # assert that the second entry of the first index tuple is a datetime
+        assert isinstance(iv.index[0][1], datetime.datetime)
+
+    def test_multiple_tz_02(self):
+        """Test based on GitHub Issue #60 - confirm behavior for same tz."""
+        # uses the first 20 sites; this test passes before issue #60 is fixed
+        iv, _ = get_iv(sites=self.sites.site_no.values[:20].tolist())
+        # assert that 'datetime' is one of the index level names
+        assert 'datetime' in iv.index.names
+        # assert that the second entry of the first index tuple is a datetime
+        assert isinstance(iv.index[0][1], datetime.datetime)
diff --git a/tests/waterservices_test.py b/tests/waterservices_test.py
@@ -116,7 +116,7 @@ def test_get_qwdata(requests_mock):
     request_url = 'https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no={}' \
                   '&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format={}&pm_cd_compare=Greater+than' \
                   '&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw_attributes=expanded' \
-                  '&date_format=YYYY-MM-DD&rdb_compression=value&submmitted_form=brief_list'.format(site, format)
+                  '&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list'.format(site, format)
     response_file_path = 'data/waterdata_qwdata.txt'
     mock_request(requests_mock, request_url, response_file_path)
     with pytest.warns(DeprecationWarning):