Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions dataretrieval/nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def _qwdata(datetime_index=True, **kwargs):
'rdb_qw_attributes': 'expanded',
'date_format': 'YYYY-MM-DD',
'rdb_compression': 'value',
'submmitted_form': 'brief_list'}
'submitted_form': 'brief_list'}
# 'qw_sample_wide': 'separated_wide'}

# check for parameter codes, and reformat query args
Expand Down Expand Up @@ -770,12 +770,13 @@ def _read_json(json):
# should be able to avoid this by dumping
record_json = str(record_json).replace("'", '"')

# read json, converting all values to float64 and all qaulifiers
# read json, converting all values to float64 and all qualifiers
# Lists can't be hashed, thus we cannot df.merge on a list column
record_df = pd.read_json(record_json,
orient='records',
dtype={'value': 'float64',
'qualifiers': 'unicode'})
'qualifiers': 'unicode'},
convert_dates=False)

record_df['qualifiers'] = (record_df['qualifiers']
.str.strip("[]").str.replace("'", ""))
Expand All @@ -793,6 +794,9 @@ def _read_json(json):
merged_df = update_merge(merged_df, record_df, na_only=True,
on=['site_no', 'datetime'])

# convert to datetime, normalizing the timezone to UTC when doing so
merged_df['datetime'] = pd.to_datetime(merged_df['datetime'], utc=True)

return merged_df


Expand Down
53 changes: 38 additions & 15 deletions docs/source/examples/readme_examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,41 @@ Examples from the Readme file on retrieving NWIS data

.. doctest::

# first import the functions for downloading data from NWIS
import dataretrieval.nwis as nwis

# specify the USGS site code for which we want data.
site = '03339000'


# get instantaneous values (iv)
df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01')

# get water quality samples (qwdata)
df2 = nwis.get_record(sites=site, service='qwdata', start='2017-12-31', end='2018-01-01')

# get basic info about the site
df3 = nwis.get_record(sites=site, service='site')
>>> # first import the functions for downloading data from NWIS
>>> import dataretrieval.nwis as nwis

>>> # specify the USGS site code for which we want data.
>>> site = '03339000'

>>> # get instantaneous values (iv)
>>> df = nwis.get_record(sites=site, service='iv', start='2017-12-31', end='2018-01-01')

>>> df.head()
00010 00010_cd site_no 00060 00060_cd ... 63680_ysi), [discontinued 10/5/21_cd 63680_hach 63680_hach_cd 99133 99133_cd
datetime ...
2017-12-31 06:00:00+00:00 1.0 A 03339000 140.0 A ... A 3.6 A 4.61 A
2017-12-31 06:15:00+00:00 1.0 A 03339000 138.0 A ... A 3.6 A 4.61 A
2017-12-31 06:30:00+00:00 1.0 A 03339000 139.0 A ... A 3.4 A 4.61 A
2017-12-31 06:45:00+00:00 1.0 A 03339000 139.0 A ... A 3.4 A 4.61 A
2017-12-31 07:00:00+00:00 1.0 A 03339000 139.0 A ... A 3.5 A 4.61 A
<BLANKLINE>
[5 rows x 21 columns]

>>> # get water quality samples (qwdata)
>>> df2 = nwis.get_record(sites=site, service='qwdata', start='2018-12-01', end='2019-01-01')

>>> print(df2)
agency_cd site_no sample_dt sample_tm sample_end_dt sample_end_tm ... p80154 p82398 p84164 p91157 p91158 p91159
datetime ...
2018-12-10 17:30:00+00:00 USGS 03339000 2018-12-10 11:30 NaN NaN ... 16 50 3060 0.0165 0.0141 0.0024
<BLANKLINE>
[1 rows x 33 columns]

>>> # get basic info about the site
>>> df3 = nwis.get_record(sites=site, service='site')

>>> print(df3)
agency_cd site_no station_nm site_tp_cd lat_va ... aqfr_type_cd well_depth_va hole_depth_va depth_src_cd project_no
0 USGS 03339000 VERMILION RIVER NEAR DANVILLE, IL ST 400603 ... NaN NaN NaN NaN 100
<BLANKLINE>
[1 rows x 42 columns]
5 changes: 3 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ Welcome
=======

Welcome to the documentation for the Python ``dataretrieval`` package.
``dataretreival`` is a Python alternative to the `USGS R dataRetrieval package`_
``dataretrieval`` is a Python alternative to the `USGS R dataRetrieval package`_
and is used to obtain USGS and EPA water quality data, streamflow data, and
metadata directly from webservices.

.. _USGS R dataRetrieval package: https://github.com/USGS-R/dataRetrieval
.. _USGS R dataRetrieval package: https://github.com/DOI-USGS/dataRetrieval


Table of Contents
Expand All @@ -16,6 +16,7 @@ Table of Contents
:maxdepth: 1

meta/installing
userguide/index
examples/index
meta/contributing
meta/license
Expand Down
16 changes: 16 additions & 0 deletions docs/source/userguide/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.. _userguide:

==========
User Guide
==========

Contents
--------

.. toctree::
:maxdepth: 1

timeconventions


.. include:: timeconventions.rst
80 changes: 80 additions & 0 deletions docs/source/userguide/timeconventions.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
.. timeconventions:

Datetime Information
--------------------

``dataretrieval`` attempts to normalize time data to UTC time when converting
web service data into dataframes. To do this, built-in pandas functions are
used: either :obj:`pandas.to_datetime()` during the initial datetime object
conversion, or :obj:`pandas.DataFrame.tz_localize()` if the datetime objects
exist but are not UTC-localized. In most cases (single-site and multi-site),
``dataretrieval`` assigns the datetime information as the dataframe *index*.
The exception is when incomplete datetime information is available; in
these cases integers are used as the dataframe index (see `PR#58`_ for more
details).

.. _PR#58: https://github.com/USGS-python/dataretrieval/pull/58


Inspecting Timestamps
*********************

For single sites, the index of the returned dataframe contains pandas
timestamps.

.. code:: python

>>> import dataretrieval.nwis as nwis
>>> site = '03339000'
>>> df = nwis.get_record(sites=site, service='peaks',
... start='2015-01-01', end='2017-12-31')
>>> print(df)
agency_cd site_no peak_tm peak_va peak_cd gage_ht gage_ht_cd year_last_pk ag_dt ag_tm ag_gage_ht ag_gage_ht_cd
datetime
2015-06-08 00:00:00+00:00 USGS 03339000 17:30 25100 C 22.83 NaN NaN NaN NaN NaN NaN
2015-12-29 00:00:00+00:00 USGS 03339000 18:45 37600 C 26.66 NaN NaN NaN NaN NaN NaN
2017-05-05 00:00:00+00:00 USGS 03339000 04:45 17000 C 18.47 NaN NaN NaN NaN NaN NaN

Here the index of the dataframe ``df`` is a set of datetime objects. Each has
the format ``YYYY-MM-DD HH:MM:SS+HH:MM``. Because these timestamps are
localized to UTC, the expected offset (``+HH:MM``) is ``+00:00``.
These values can be converted to a local timezone of your choosing using
:obj:`pandas` functionality.

.. code:: python

>>> df.index = df.index.tz_convert(tz='America/New_York')
>>> print(df)
agency_cd site_no peak_tm peak_va peak_cd gage_ht gage_ht_cd year_last_pk ag_dt ag_tm ag_gage_ht ag_gage_ht_cd
datetime
2015-06-07 20:00:00-04:00 USGS 03339000 17:30 25100 C 22.83 NaN NaN NaN NaN NaN NaN
2015-12-28 19:00:00-05:00 USGS 03339000 18:45 37600 C 26.66 NaN NaN NaN NaN NaN NaN
2017-05-04 20:00:00-04:00 USGS 03339000 04:45 17000 C 18.47 NaN NaN NaN NaN NaN NaN

Above, the index was converted so that the timestamps are expressed in New
York local time. In the updated dataframe index, the resulting timestamps now
have offsets of ``-04:00`` and ``-05:00``, as New York is either 4 or 5 hours
behind UTC depending on the time of year (due to daylight saving time).

When information for multiple sites is requested, ``dataretrieval`` creates a
dataframe with a multi-index, with the first entry containing the site number,
and the second containing the datetime information.

.. doctest::

>>> import dataretrieval.nwis as nwis
>>> sites = ['180049066381200', '290000095192602']
>>> df = nwis.get_record(sites=sites, service='gwlevels',
... start='2021-10-01', end='2022-01-01')
>>> df
agency_cd site_tp_cd lev_dt lev_tm lev_tz_cd ... lev_dt_acy_cd lev_acy_cd lev_src_cd lev_meth_cd lev_age_cd
site_no datetime ...
180049066381200 2021-10-04 19:54:00+00:00 USGS GW 2021-10-04 19:54 +0000 ... m NaN S S A
2021-11-16 14:28:00+00:00 USGS GW 2021-11-16 14:28 +0000 ... m NaN S S A
2021-12-09 10:43:00+00:00 USGS GW 2021-12-09 10:43 +0000 ... m NaN S S A
290000095192602 2021-12-08 19:07:00+00:00 USGS GW 2021-12-08 19:07 +0000 ... m NaN S S P
<BLANKLINE>
[4 rows x 15 columns]

Note that the datetime index returned by default is also UTC-localized, so
the offset values are ``+00:00``.
26 changes: 25 additions & 1 deletion tests/nwis_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import numpy as np
import pandas as pd
import pytest
import datetime
from dataretrieval.nwis import get_record, preformat_peaks_response
from dataretrieval.nwis import what_sites, get_iv, get_dv, get_discharge_peaks

START_DATE = '2018-01-24'
END_DATE = '2018-01-25'
Expand Down Expand Up @@ -36,7 +38,6 @@ def test_iv_service():

def test_iv_service_answer():
df = test_iv_service()

# check multiindex function
assert df.index.names == [SITENO_COL, DATETIME_COL], "iv service returned incorrect index: {}".format(df.index.names)

Expand Down Expand Up @@ -122,3 +123,26 @@ def test_inc_date_03():
assert df.shape == df2.shape
# assert that the datetime index is not there
assert df2.index.name != 'datetime'


class TestTZ:
    """Tests relating to GitHub Issue #60 (merging data across time zones)."""

    @classmethod
    def setup_class(cls):
        """Query the site listing once, before any test in this class runs.

        The request is made here rather than in the class body so that it
        does not execute when the module is imported: a web service failure
        then fails only these tests instead of breaking collection of every
        test in the file.
        """
        cls.sites, _ = what_sites(stateCd='MD')

    @staticmethod
    def _assert_datetime_index(iv):
        """Assert that ``iv`` has a 'datetime' index level holding datetimes."""
        # assert that the datetime column exists
        assert 'datetime' in iv.index.names
        # assert that it is a datetime type; the index is a
        # (site_no, datetime) multi-index, so inspect the second entry
        assert isinstance(iv.index[0][1], datetime.datetime)

    def test_multiple_tz_01(self):
        """Test based on GitHub Issue #60 - error merging different time zones."""
        # this test fails before issue #60 is fixed
        iv, _ = get_iv(sites=self.sites.site_no.values[:25].tolist())
        self._assert_datetime_index(iv)

    def test_multiple_tz_02(self):
        """Test based on GitHub Issue #60 - confirm behavior for same tz."""
        # this test passes before issue #60 is fixed
        iv, _ = get_iv(sites=self.sites.site_no.values[:20].tolist())
        self._assert_datetime_index(iv)
2 changes: 1 addition & 1 deletion tests/waterservices_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def test_get_qwdata(requests_mock):
request_url = 'https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no={}' \
'&qw_sample_wide=qw_sample_wide&agency_cd=USGS&format={}&pm_cd_compare=Greater+than' \
'&inventory_output=0&rdb_inventory_output=file&TZoutput=0&rdb_qw_attributes=expanded' \
'&date_format=YYYY-MM-DD&rdb_compression=value&submmitted_form=brief_list'.format(site, format)
'&date_format=YYYY-MM-DD&rdb_compression=value&submitted_form=brief_list'.format(site, format)
response_file_path = 'data/waterdata_qwdata.txt'
mock_request(requests_mock, request_url, response_file_path)
with pytest.warns(DeprecationWarning):
Expand Down