From b04b0080bcb8fb30c85d12e7d3b5503dc4f1f220 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Thu, 20 Feb 2020 17:28:11 +1100
Subject: [PATCH 01/33] initial commit

---
 .../velocity_hourly_timeseries.py             | 430 ++++++++++++++++++
 .../velocity_hourly_timeseries_template.json  | 214 +++++++++
 2 files changed, 644 insertions(+)
 create mode 100644 aodntools/timeseries_products/velocity_hourly_timeseries.py
 create mode 100644 aodntools/timeseries_products/velocity_hourly_timeseries_template.json

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
new file mode 100644
index 0000000..aefe96e
--- /dev/null
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -0,0 +1,430 @@
+import os
+import sys
+import tempfile
+import shutil
+from netCDF4 import Dataset, num2date
+import numpy as np
+import json
+from datetime import datetime
+import argparse
+
+from pkg_resources import resource_filename
+from aodntools import __version__
+import aodntools.timeseries_products.aggregated_timeseries as TStools
+
+import xarray as xr
+import pandas as pd
+
+
+TEMPLATE_JSON = resource_filename(__name__, 'velocity_hourly_timeseries_template.json')
+
+
+def sort_files(file_list, input_dir=""):
+    """
+    sort list of files according to deployment date
+    :param file_list: List of files to sort
+    :return: sorted list of files
+    """
+
+    time_start = []
+    for file in file_list:
+        with Dataset(os.path.join(input_dir, file), 'r') as ds:
+            time_start.append(np.datetime64(ds.time_deployment_start))
+    tuples = sorted(zip(time_start, files_to_agg))
+    return [t[1] for t in tuples]
+
+
+def check_file(nc, site_code):
+    """
+    Return list of errors found in the file:
+    Variables of interest are present
+    TIME, DEPTH, LATITUDE, LONGITUDE,  is present
+    NOMINAL_DEPTH is not present as variable or attribute
+    file_version is not FV01
+    if LATITUDE or LONIGITUDE dimension has length >1
+    if TIME.seconds_to_middle_of_measurement exist
+
+    :param nc: xarray dataset
+    :param site_code: code of the mooring site
+    :return: dictionary with the file name and list of failed tests
+    """
+
+    attributes = list(nc.attrs)
+    variables = list(nc.variables)
+    allowed_dimensions = ['TIME', 'LATITUDE', 'LONGITUDE', 'HEIGHT_ABOVE_SENSOR']
+    required_variables = ['UCUR', 'VCUR', 'WCUR', 'DEPTH']
+    error_list = []
+
+    if nc.site_code != site_code:
+        error_list.append('Wrong site_code: ' + nc.site_code)
+
+    if 'Level 1' not in nc.file_version:
+        error_list.append('Wrong file version: ' + nc.file_version)
+    if 'DEPTH' not in variables:
+        error_list.append('DEPTH variable missing')
+    if 'DEPTH' not in variables and 'HEIGHT_ABOVE_SENSOR' not in variables:
+        error_list.append('DEPTH and HEIGHT_ABOVE_SENSOR missing')
+    if 'TIME' not in variables:
+        error_list.append('TIME variable missing')
+    if 'LATITUDE' not in variables:
+        error_list.append('LATITUDE variable missing')
+    if 'LONGITUDE' not in variables:
+        error_list.append('LONGITUDE variable missing')
+    if 'seconds_to_middle_of_measurement' not in nc['TIME'].attrs:
+        error_list.append('seconds_to_middle_of_measurement not present in TIME')
+
+    for variable in required_variables:
+        if variable not in variables:
+            error_list.append(variable + ' variable missing')
+        else:
+            VoIdimensions = list(nc[variable].dims)
+            if 'TIME' not in VoIdimensions:
+                error_list.append('TIME is not a dimension for ' + variable)
+            if 'LATITUDE' in VoIdimensions and len(nc.LATITUDE) > 1:
+                error_list.append('more than one LATITUDE for ' + variable)
+            if 'LONGITUDE' in VoIdimensions and len(nc.LONGITUDE) > 1:
+                error_list.append('more than one LONGITUDE for ' + variable)
+            for dim in VoIdimensions:
+                if dim not in allowed_dimensions:
+                    error_list.append('not allowed dimension: ' + dim)
+
+    if 'NOMINAL_DEPTH' not in variables and 'instrument_nominal_depth' not in attributes:
+        error_list.append('no NOMINAL_DEPTH')
+
+    return error_list
+
+def get_instrument_id(nc):
+    """
+    Create instrument id based on deployment metadata
+    :param nc: xarray dataset
+    :return: instrumentID as string
+    """
+    return '; '.join([nc.deployment_code, nc.instrument, nc.instrument_serial_number])
+
+
+def in_water(nc):
+    """
+    cut data the entire dataset to in-water only timestamps, dropping the out-of-water records.
+    :param nc: xarray dataset
+    :return: xarray dataset
+    """
+    time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
+    time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
+    TIME = nc['TIME'][:]
+    return nc.where((TIME >= time_deployment_start) & (TIME <= time_deployment_end), drop=True)
+
+
+def cell_velocity_resample(df, binning_function, is_WCUR):
+    """
+    Resample a dataset to a specific time_interval.
+    if WCUR not present, returns nan
+    :param df: grouped dataframe
+    :param binning_function: function used for binning as non string standard numpy function
+    :param is_WCUR: True if WCUR is present in nc, False otherwise
+    :return: binned U, v, W CUR according to the binning function
+    """
+    df_binned = df.apply(binning_function)
+    UCUR = df_binned['UCUR']
+    VCUR = df_binned['VCUR']
+    if is_WCUR:
+        WCUR = df_binned['WCUR']
+    else:
+        WCUR = np.full(len(df), np.nan)
+    DEPTH = df_binned['DEPTH']
+
+    return UCUR, VCUR, WCUR, DEPTH
+
+
+def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch, one_day, is_WCUR):
+    """
+    get U, V, W current values resampled
+    :param nc_cell: xarray DATASET
+    :param ds: netcdf4 dataset
+    :param slice_start: start index of the slice
+    :param varlist: list of variable names to subset the dataset
+    :param binnig_fun: list of function names for binning
+    :param one_day: timedelta one day
+    :param epoch: base epoch
+    :param is_WCUR: flag indicating if WCUR is present
+    :return: end index of the slice
+    """
+    nc_cell = nc_cell.where(nc_cell.DEPTH_quality_control < 4, drop=True)
+    nc_cell = nc_cell[varlist]
+    nc_cell = nc_cell.to_dataframe()
+    ## back the index 30min
+    nc_cell.index = nc_cell.index - pd.Timedelta(30, units='m')
+
+    nc_cell_1H = nc_cell.resample('1H')
+    slice_end = len(nc_cell_1H) + slice_start
+
+    ## move time it forward and get it
+    time_slice = ((np.fromiter(nc_cell_1H.groups.keys(), dtype='M8[ns]') + np.timedelta64(1, 'h')) - epoch) / one_day
+    ds['TIME'][slice_start:slice_end] = time_slice
+
+    # take the mean of the variables
+    ds['UCUR'][slice_start:slice_end], \
+    ds['VCUR'][slice_start:slice_end], \
+    ds['WCUR'][slice_start:slice_end], \
+    ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(nc_cell_1H, 'mean', is_WCUR)
+
+    for method in binning_fun:
+        ds['UCUR_' + method][slice_start:slice_end], \
+        ds['VCUR_' + method][slice_start:slice_end], \
+        ds['WCUR_' + method][slice_start:slice_end], \
+        ds['DEPTH_' + method][slice_start:slice_end] = cell_velocity_resample(nc_cell_1H, method, is_WCUR)
+
+    return slice_end
+
+
+
+## MAIN FUNCTION
+def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
+                        download_url_prefix=None, opendap_url_prefix=None):
+    """
+    Aggregate U, V and W CUR variables from all deployments at one site.
+    the vertical cells are flattened and related to its depth
+    additional metadata variables are stored to track the origin of the data
+    :param files_to_agg: list of files to aggregate
+    :param site_code: site code
+    :param input_dir: base path where source files are stored
+    :param output_dir: path where the result file will be written
+    :param download_url_prefix: URL prefix for file download (to be prepended to paths in files_to_agg)
+    :param opendap_url_prefix: URL prefix for OPENAP access (to be prepended to paths in files_to_agg)
+    :return: file path of the aggregated product, list of rejected files
+    """
+
+    varlist = ['UCUR', 'VCUR', 'WCUR', 'DEPTH']
+    binning_fun = ['max', 'min', 'std', 'count']
+    
+    time_units="days since 1950-01-01 00:00:00 UTC"
+    time_calendar="gregorian"
+    epoch = np.datetime64("1950-01-01T00:00:00")
+    one_day = np.timedelta64(1, 'D')
+
+    bad_files = []
+    rejected_files = []
+
+    ## default name for temporary file. It will be renamed at the end
+    _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir)
+
+    ## check files and get total number of flattened obs
+    print("CHECKING FILES...")
+    for index, file in enumerate(files_to_agg):
+        print(index, end=',', flush=True)
+        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
+            nc = in_water(nc)
+            error_list = check_file(nc, site_code)
+            if error_list:
+                bad_files.append([file, error_list])
+                rejected_files.append(file)
+    print(" ")
+
+    ## remove bad files form the list
+    for file in bad_files:
+        files_to_agg.remove(file[0])
+
+    ## sort the files in chronological order
+    files_to_agg = sort_files(files_to_agg, input_dir=input_dir)
+
+
+    ## create ncdf file, dimensions (unlimited) and variables
+    ds = Dataset(os.path.join(output_dir, temp_outfile), 'w')
+    OBSERVATION = ds.createDimension('OBSERVATION', size=None)
+    INSTRUMENT = ds.createDimension('INSTRUMENT', size=None)
+
+    obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
+    obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
+    obs_int_template = {'datatype': np.uint32, 'zlib': True, 'dimensions': ('OBSERVATION')}
+    inst_S256_template = {'datatype': 'str', 'dimensions': ('INSTRUMENT')}
+    inst_float_template ={'datatype': np.float32, 'dimensions': ('INSTRUMENT')}
+    inst_double_template ={'datatype': np.float64, 'dimensions': ('INSTRUMENT')}
+
+    UCUR = ds.createVariable(varname='UCUR', **obs_float_template)
+    UCUR_max = ds.createVariable(varname='UCUR_max', **obs_float_template)
+    UCUR_min = ds.createVariable(varname='UCUR_min', **obs_float_template)
+    UCUR_std = ds.createVariable(varname='UCUR_std', **obs_float_template)
+    UCUR_count = ds.createVariable(varname='UCUR_count', **obs_int_template)
+    VCUR = ds.createVariable(varname='VCUR', **obs_float_template)
+    VCUR_max = ds.createVariable(varname='VCUR_max', **obs_float_template)
+    VCUR_min = ds.createVariable(varname='VCUR_min', **obs_float_template)
+    VCUR_std = ds.createVariable(varname='VCUR_std', **obs_float_template)
+    VCUR_count = ds.createVariable(varname='VCUR_count', **obs_int_template)
+    WCUR = ds.createVariable(varname='WCUR', **obs_float_template)
+    WCUR_max = ds.createVariable(varname='WCUR_max', **obs_float_template)
+    WCUR_min = ds.createVariable(varname='WCUR_min', **obs_float_template)
+    WCUR_std = ds.createVariable(varname='WCUR_std', **obs_float_template)
+    WCUR_count = ds.createVariable(varname='WCUR_count', **obs_int_template)
+
+    DEPTH = ds.createVariable(varname='DEPTH', **obs_float_template)
+    DEPTH_max = ds.createVariable(varname='DEPTH_max', **obs_float_template)
+    DEPTH_min = ds.createVariable(varname='DEPTH_min', **obs_float_template)
+    DEPTH_std = ds.createVariable(varname='DEPTH_std', **obs_float_template)
+    DEPTH_count = ds.createVariable(varname='DEPTH_count', **obs_int_template)
+
+    TIME = ds.createVariable(varname='TIME', **obs_double_template)
+    instrument_index = ds.createVariable(varname='instrument_index', **obs_int_template)
+
+    source_file = ds.createVariable(varname='source_file', **inst_S256_template)
+    instrument_id = ds.createVariable(varname='instrument_id', **inst_S256_template)
+    LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template)
+    LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template)
+    NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template)
+    #SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
+
+
+    ## main loop
+    print('PROCESSING...')
+    slice_start = 0
+    for index, file in enumerate(files_to_agg):
+        print(index, end=",", flush=True)
+
+        ## this is for filling the slice of variables with INSTRUMENT dim
+        slice_instrument_start = slice_start
+
+        with xr.open_dataset(os.path.join(input_dir, file)) as nc:
+
+            if 'HEIGHT_ABOVE_SENSOR' in list(nc.variables):
+                is_2D = True
+            else:
+                is_2D = False
+
+            if 'WCUR' in list(nc.data_vars):
+                is_WCUR = True
+            else:
+                is_WCUR = False
+
+            ## in-water data only
+            nc = in_water(nc)
+
+            ## move timestamp to the middle of the measurement window
+            time_delta_ns = int(nc['TIME'].seconds_to_middle_of_measurement * 10**9)
+            nc['TIME'] = nc['TIME'] + np.timedelta64(time_delta_ns, 'ns')
+
+            if is_2D:
+                ## process all cells, one by one
+                cells = nc.HEIGHT_ABOVE_SENSOR.values
+                for cell in cells:
+                    ## get cell data, drop HEIGHT_ABOVE_SENSOR dim
+                    nc_cell = nc.where(nc.HEIGHT_ABOVE_SENSOR == cell, drop=True).squeeze('HEIGHT_ABOVE_SENSOR')
+                    ## convert to absolute DEPTH
+                    nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
+
+                    slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
+                                                     epoch, one_day, is_WCUR)
+
+                    slice_start = slice_end
+            else:
+                slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
+                                                 epoch, one_day, is_WCUR)
+                slice_start = slice_end
+
+            ## metadata variables
+            instrument_index[slice_instrument_start:slice_end] = np.repeat(index, slice_end - slice_instrument_start)
+            LATITUDE[index] = nc.LATITUDE.values
+            LONGITUDE[index] = nc.LONGITUDE.values
+            NOMINAL_DEPTH[index] = np.array(TStools.get_nominal_depth(nc))
+            instrument_id[index] = get_instrument_id(nc)
+            source_file[index] = file
+
+    print(" ")
+    ## add atributes
+    with open(TEMPLATE_JSON) as json_file:
+        attribute_dictionary = json.load(json_file)
+    variable_attribute_dictionary = attribute_dictionary['_variables']
+    global_attribute_dictionary = attribute_dictionary['_global']
+
+    ## set variable attrs
+    for var in list(ds.variables):
+        ds[var].setncatts(variable_attribute_dictionary[var])
+
+    if download_url_prefix or opendap_url_prefix:
+        ds['source_file'].setncatts(TStools.source_file_attributes(download_url_prefix, opendap_url_prefix))
+
+    ## set global attrs
+    timeformat = '%Y-%m-%dT%H:%M:%SZ'
+    file_timeformat = '%Y%m%d'
+
+    time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(timeformat)
+    time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(timeformat)
+    time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
+    time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
+
+
+    contributor_name, contributor_email, contributor_role = TStools.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)
+    add_attribute = {
+                    'title':                    ("Long Timeseries Velocity Hourly Aggregated product: " + ', '.join(varlist) + " at " +
+                                                  site_code + " between " + time_start + " and " + time_end),
+                    'site_code':                site_code,
+                    'time_coverage_start':      time_start,
+                    'time_coverage_end':        time_end,
+                    'geospatial_vertical_min':  np.float32(np.nanmin(ds['DEPTH'])),
+                    'geospatial_vertical_max':  np.float32(np.nanmax(ds['DEPTH'])),
+                    'geospatial_lat_min':       np.float64(np.min(ds['LATITUDE'])),
+                    'geospatial_lat_max':       np.float64(np.max(ds['LATITUDE'])),
+                    'geospatial_lon_min':       np.float64(np.min(ds['LONGITUDE'])),
+                    'geospatial_lon_max':       np.float64(np.max(ds['LONGITUDE'])),
+                    'date_created':             datetime.utcnow().strftime(timeformat),
+                    'history':                  datetime.utcnow().strftime(timeformat) + ': Aggregated file created.',
+                    'keywords':                 ', '.join(varlist + ['AGGREGATED']),
+                    'rejected_files':           "\n".join(rejected_files),
+                    'contributor_name':        "; ".join(contributor_name),
+                    'contributor_email':       "; ".join(contributor_email),
+                    'contributor_role':        "; ".join(contributor_role),
+                    'generating_code_version':  __version__
+    }
+
+    ## add version
+    github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/'
+                      '{v}/aodntools/timeseries_products/aggregated_timeseries.py'.format(v=__version__))
+    global_attribute_dictionary['lineage'] += github_comment
+
+    global_attribute_dictionary.update(add_attribute)
+    ds.setncatts(dict(sorted(global_attribute_dictionary.items())))
+
+
+    ## NOTE: There is a possibility of having NaNs in DEPTH after the binning
+    ## this is the warning when calculating the min/max DEPTH
+    ## maybe I should clean the dataset before close it
+
+    ds.close()
+
+
+
+    ## create the output file name and rename the tmp file
+    facility_code = TStools.get_facility_code(os.path.join(input_dir, files_to_agg[0]))
+    data_code = 'VZ'
+    product_type = 'hourly-timeseries'
+    file_version = 1
+    output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code, ('FV0'+str(file_version)),
+                            ("velocity-"+product_type),
+                            ('END-'+ time_end_filename), 'C-' + datetime.utcnow().strftime(file_timeformat)]) + '.nc'
+    ncout_path = os.path.join(output_dir, output_name)
+    shutil.move(temp_outfile, ncout_path)
+
+
+    return ncout_path, bad_files
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments from ONE site")
+    parser.add_argument('-site', dest='site_code', help='site code, like NRMMAI',  required=True)
+    parser.add_argument('-files', dest='filenames', help='name of the file that contains the source URLs', required=True)
+    parser.add_argument('-path', dest='output_path', help='path where the result file will be written. Default: ./', default='./', required=False)
+    parser.add_argument('-indir', dest='input_dir', help='base path of input files', default='', required=False)
+    parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. Default ./',
+                        default='./', required=False)
+    parser.add_argument('-download_url', dest='download_url', help='path to the download_url_prefix',
+                        default='', required=False)
+    parser.add_argument('-opendap_url', dest='opendap_url', help='path to the opendap_url_prefix',
+                        default='', required=False)
+
+    args = parser.parse_args()
+
+    with open(args.filenames) as ff:
+        files_to_agg = [line.rstrip() for line in ff]
+
+
+    print(velocity_aggregated(files_to_agg=files_to_agg, site_code=args.site_code,
+                              input_dir=args.input_dir, output_dir=args.output_dir,
+                              download_url_prefix=args.download_url, opendap_url_prefix=args.opendap_url))
diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
new file mode 100644
index 0000000..e0d4fcc
--- /dev/null
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
@@ -0,0 +1,214 @@
+{
+    "_variables": {
+        "TIME": {
+            "axis": "T",
+            "comment": "time stamp corresponds to the hour and represents binned data [30,30) minutes before and after the hour",
+            "long_name": "time",
+            "standard_name": "time",
+            "valid_max": 90000.0,
+            "valid_min": 0.0,
+            "units": "days since 1950-01-01 00:00:00 UTC",
+            "calendar": "gregorian"
+            },
+        "LATITUDE":{
+            "axis": "Y",
+            "long_name": "latitude",
+            "reference_datum": "WGS84 geographic coordinate system",
+            "standard_name": "latitude",
+            "units": "degrees_north",
+            "valid_max": 90.0,
+            "valid_min": -90.0
+            },
+        "LONGITUDE": {
+            "axis": "X",
+            "long_name": "longitude",
+            "reference_datum": "WGS84 geographic coordinate system",
+            "standard_name": "longitude",
+            "units": "degrees_east",
+            "valid_max": 180.0,
+            "valid_min": -180.0
+            },
+        "DEPTH": {
+            "coordinates": "TIME LATITUDE LONGITUDE NOMINAL DEPTH",
+            "DEPTH:ancillary_variables": "DEPTH_min DEPTH_max DEPTH_std DEPTH_count",
+            "long_name": "mean actual depth",
+            "positive": "down",
+            "reference_datum": "sea surface",
+            "standard_name": "depth",
+            "units": "m",
+            "valid_max": 12000.0,
+            "valid_min": -5.0,
+            "DEPTH:cell_methods": "TIME:mean (interval: 1 hr comment: time mid point)"
+            },
+        "DEPTH_max": {
+            "units": "m",
+            "standard_name": "depth",
+            "long_name": "max data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:max"
+            },
+        "DEPTH_min": {
+            "units": "m",
+            "standard_name": "depth",
+            "long_name": "min data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:min"
+            },
+        "DEPTH_std": {
+            "units": "m",
+            "standard_name": "depth",
+            "long_name": "std data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:std"
+            },
+        "DEPTH_count": {
+            "standard_name": "depth number_of_observations",
+            "units": "1",
+            "long_name": "count data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:count"
+            },
+        "NOMINAL_DEPTH": {
+            "axis": "Z",
+            "long_name": "nominal depth",
+            "positive": "down",
+            "reference_datum": "sea surface",
+            "standard_name": "depth",
+            "units": "m",
+            "valid_max": 12000.0,
+            "valid_min": -5.0
+            },
+        "instrument_index": {
+            "long_name": "which instrument this obs is for",
+            "instance_dimension": "INSTRUMENT"
+            },
+        "instrument_id": {
+            "long_name": "source deployment code, instrument make, model, serial_number"
+            },
+        "source_file": {
+            "long_name": "source file for this instrument"
+            },
+        "UCUR": {
+            "coordinates": "TIME DEPTH LATITUDE LONGITUDE",
+            "ancillary_variables": "UCUR_max, UCUR_min, UCUR_std, UCUR_count",
+            "long_name": "eastward_sea_water_velocity",
+            "standard_name": "eastward_sea_water_velocity",
+            "units": "m s-1",
+            "valid_max": 10.0,
+            "valid_min": -10.0
+            },
+        "UCUR_max": {
+            "units": "m s-1",
+            "standard_name": "eastward_sea_water_velocity",
+            "long_name": "max data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:max"
+            },
+        "UCUR_min": {
+            "units": "m s-1",
+            "standard_name": "eastward_sea_water_velocity",
+            "long_name": "min data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:min"
+            },
+        "UCUR_std": {
+            "units": "m s-1",
+            "standard_name": "eastward_sea_water_velocity",
+            "long_name": "std data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:std"
+            },
+        "UCUR_count": {
+            "standard_name": "eastward_sea_water_velocity number_of_observations",
+            "units": "1",
+            "long_name": "count data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:count"
+            },
+        "VCUR": {
+            "coordinates": "TIME DEPTH LATITUDE LONGITUDE",
+            "ancillary_variables": "VCUR_max, VCUR_min, VCUR_std, VCUR_count",
+            "long_name": "northward_sea_water_velocity",
+            "standard_name": "northward_sea_water_velocity",
+            "units": "m s-1",
+            "valid_max": 10.0,
+            "valid_min": -10.0
+            },
+        "VCUR_max": {
+            "units": "m s-1",
+            "standard_name": "northward_sea_water_velocity",
+            "long_name": "max data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:max"
+            },
+        "VCUR_min": {
+            "units": "m s-1",
+            "standard_name": "northward_sea_water_velocity",
+            "long_name": "min data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:min"
+            },
+        "VCUR_std": {
+            "units": "m s-1",
+            "standard_name": "northward_sea_water_velocity",
+            "long_name": "std data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:std"
+            },
+        "VCUR_count": {
+            "standard_name": "northward_sea_water_velocity number_of_observations",
+            "units": "1",
+            "long_name": "count data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:count"
+            },
+
+        "WCUR": {
+            "coordinates": "TIME DEPTH LATITUDE LONGITUDE",
+            "ancillary_variables": "WCUR_max, WCUR_min, WCUR_std, WCUR_count",
+            "long_name": "upward_sea_water_velocity",
+            "standard_name": "upward_sea_water_velocity",
+            "units": "m s-1",
+            "valid_max": 5.0,
+            "valid_min": -5.0
+            },
+        "WCUR_max": {
+            "units": "m s-1",
+            "standard_name": "upward_sea_water_velocity",
+            "long_name": "max data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:max"
+            },
+        "WCUR_min": {
+            "units": "m s-1",
+            "standard_name": "upward_sea_water_velocity",
+            "long_name": "min data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:min"
+            },
+        "WCUR_std": {
+            "units": "m s-1",
+            "standard_name": "upward_sea_water_velocity",
+            "long_name": "std data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:std"
+            },
+        "WCUR_count": {
+            "standard_name": "upward_sea_water_velocity number_of_observations",
+            "units": "1",
+            "long_name": "count data value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME:count"
+            },
+        "SECONDS_TO_MIDDLE": {
+            "long_name": "number of seconds from timestamp to middle of the measurement window"
+        }
+    },
+    "_global":{
+        "abstract": "Hourly Time-series Product: This file contains all measurements of quality controled U, V and W sea water velocity variables from all instruments deployed at the selected site. Timestamps are chronologically ordered, and binned into 1 hour time intervals. Instrument details are stored as a variable in order to keep a record of the origin of each measurement. Out-of-water measurements have been excluded.",
+        "acknowledgement": "Any users of IMOS data are required to clearly acknowledge the source of the material derived from IMOS in the format: \"Data was sourced from the Integrated Marine Observing System (IMOS) - IMOS is a national collaborative research infrastructure, supported by the Australian Government.\" If relevant, also credit other organisations involved in collection of this particular datastream (as listed in 'credit' in the metadata record).",
+        "author": "Klein, Eduardo",
+        "author_email": "eduardo.kleinsalas@utas.edu.au",
+        "citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\".",
+        "Conventions": "CF-1.6,IMOS-1.4",
+        "data_centre": "Australian Ocean Data Network (AODN)",
+        "data_centre_email": "info@aodn.org.au",
+        "disclaimer": "Data, products and services from IMOS are provided \"as is\" without any warranty as to fitness for a particular purpose.",
+        "featureType": "timeSeries",
+        "file_version": "Level 2 - Quality Controlled Data",
+        "file_version_quality_control": "Quality controlled data have been through quality assurance procedures such as automated routines and sensor calibration or visual inspection and flag of obvious errors. The data are in physical units using standard SI metric units with calibration and other pre-processing routines applied, all time and location values are in absolute coordinates to comply with standards and datum. Data includes flags for each measurement to indicate the estimated quality of the measurement. Metadata exists for the data or for the higher level dataset that the data belongs to. This is the standard IMOS data level and is what should be made available to AODN and to the IMOS community.",
+        "institution_references": "http://imos.org.au/facilities/aodn/",
+        "keywords_vocabulary": "IMOS parameter names. See https://github.com/aodn/imos-toolbox/blob/master/IMOS/imosParameters.txt",
+        "license": "http://creativecommons.org/licenses/by/4.0/",
+        "naming_authority": "IMOS",
+        "project": "Integrated Marine Observing System (IMOS)",
+        "references": "http://www.imos.org.au",
+        "source": "Mooring",
+        "standard_name_vocabulary": "NetCDF Climate and Forecast (CF) Metadata Convention Standard Name Table 45",
+        "lineage": "The aggregated UCUR, VCUR and WCUR are produced by sequentially concatenating the individual values in each of the input files after being binned into 1 hour fixed interval. In the case of ADCPs, the current values at each measuring cell are referenced to its absolute DEPTH. The resulting variable has dimension OBSERVATION. The DEPTH variable is calculated from the DEPTH measurements at the instrument and the HEIGHT_ABOVE_SENSOR distance corresponding to each measurement cell. The values are summarised using the arithmetic mean. Additional variables derived from the binning process are also stored: minimum, maximum, standard deviation and number of observations in each time bin. The resulting variables have dimension OBSERVATION. The variable TIME from input files and centered to the hour is concatenated into a variable TIME(OBSERVATION). The DEPTH variable from input files is averaged into the same 1 hour bin and concatenated into a variable DEPTH(OBSERVATION). If not present, fill values are stored. All output variables with the INSTRUMENT dimension are sorted in chronological order. In order to keep track of the provenance of VoI in the aggregated file, accessory variables are created."
+        }
+}

From c9b5632112ec13c16e589b9c01c3d83f70f96d25 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Mon, 24 Feb 2020 12:14:56 +1100
Subject: [PATCH 02/33] add cell_index variable

---
 .../timeseries_products/velocity_hourly_timeseries.py    | 9 +++++++--
 .../velocity_hourly_timeseries_template.json             | 4 ++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index aefe96e..7f08ac5 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -269,6 +269,8 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template)
     LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template)
     NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template)
+    cell_index = ds.createVariable(varname='cell_index', **obs_int_template)
+
     #SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
 
 
@@ -295,6 +297,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
 
             ## in-water data only
             nc = in_water(nc)
+            n_measurements = len(nc.TIME)
 
             ## move timestamp to the middle of the measurement window
             time_delta_ns = int(nc['TIME'].seconds_to_middle_of_measurement * 10**9)
@@ -303,19 +306,21 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             if is_2D:
                 ## process all cells, one by one
                 cells = nc.HEIGHT_ABOVE_SENSOR.values
-                for cell in cells:
+                for cell_idx, cell in enumerate(cells):
                     ## get cell data, drop HEIGHT_ABOVE_SENSOR dim
                     nc_cell = nc.where(nc.HEIGHT_ABOVE_SENSOR == cell, drop=True).squeeze('HEIGHT_ABOVE_SENSOR')
                     ## convert to absolute DEPTH
                     nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
-
                     slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
                                                      epoch, one_day, is_WCUR)
+                    cell_index[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)
 
                     slice_start = slice_end
             else:
                 slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
                                                  epoch, one_day, is_WCUR)
+                cell_index[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32)
+
                 slice_start = slice_end
 
             ## metadata variables
diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
index e0d4fcc..4fdb4ef 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
@@ -184,6 +184,10 @@
             "long_name": "count data value in the bin, after rejection of flagged data",
             "cell_methods": "TIME:count"
             },
+        "cell_index": {
+            "long_name": "index of the corresponding measuring cell",
+            "comment": "the closest cell to the sensor is cell 0"
+        },
         "SECONDS_TO_MIDDLE": {
             "long_name": "number of seconds from timestamp to middle of the measurement window"
         }

From 1f4c76939907c5059e3e6ab9992e4875363abd23 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Tue, 25 Feb 2020 09:49:12 +1100
Subject: [PATCH 03/33] change alias of the package

---
 .../velocity_hourly_timeseries.py                    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 7f08ac5..dd488a4 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -10,7 +10,7 @@
 
 from pkg_resources import resource_filename
 from aodntools import __version__
-import aodntools.timeseries_products.aggregated_timeseries as TStools
+import aodntools.timeseries_products.aggregated_timeseries as utils
 
 import xarray as xr
 import pandas as pd
@@ -224,7 +224,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
         files_to_agg.remove(file[0])
 
     ## sort the files in chronological order
-    files_to_agg = sort_files(files_to_agg, input_dir=input_dir)
+    files_to_agg = utils.sort_files(files_to_agg, input_dir=input_dir)
 
 
     ## create ncdf file, dimensions (unlimited) and variables
@@ -327,7 +327,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             instrument_index[slice_instrument_start:slice_end] = np.repeat(index, slice_end - slice_instrument_start)
             LATITUDE[index] = nc.LATITUDE.values
             LONGITUDE[index] = nc.LONGITUDE.values
-            NOMINAL_DEPTH[index] = np.array(TStools.get_nominal_depth(nc))
+            NOMINAL_DEPTH[index] = np.array(utils.get_nominal_depth(nc))
             instrument_id[index] = get_instrument_id(nc)
             source_file[index] = file
 
@@ -343,7 +343,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
         ds[var].setncatts(variable_attribute_dictionary[var])
 
     if download_url_prefix or opendap_url_prefix:
-        ds['source_file'].setncatts(TStools.source_file_attributes(download_url_prefix, opendap_url_prefix))
+        ds['source_file'].setncatts(utils.source_file_attributes(download_url_prefix, opendap_url_prefix))
 
     ## set global attrs
     timeformat = '%Y-%m-%dT%H:%M:%SZ'
@@ -355,7 +355,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
 
 
-    contributor_name, contributor_email, contributor_role = TStools.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)
+    contributor_name, contributor_email, contributor_role = utils.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)
     add_attribute = {
                     'title':                    ("Long Timeseries Velocity Hourly Aggregated product: " + ', '.join(varlist) + " at " +
                                                   site_code + " between " + time_start + " and " + time_end),
@@ -396,7 +396,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
 
 
     ## create the output file name and rename the tmp file
-    facility_code = TStools.get_facility_code(os.path.join(input_dir, files_to_agg[0]))
+    facility_code = utils.get_facility_code(os.path.join(input_dir, files_to_agg[0]))
     data_code = 'VZ'
     product_type = 'hourly-timeseries'
     file_version = 1

From ac83ec46a6694ef847f14c833f30cdeffc7b0a08 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Tue, 25 Feb 2020 09:52:18 +1100
Subject: [PATCH 04/33] replace functions by aodn package call

---
 .../velocity_hourly_timeseries.py             | 42 ++-----------------
 1 file changed, 3 insertions(+), 39 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index dd488a4..9dd808d 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -19,21 +19,6 @@
 TEMPLATE_JSON = resource_filename(__name__, 'velocity_hourly_timeseries_template.json')
 
 
-def sort_files(file_list, input_dir=""):
-    """
-    sort list of files according to deployment date
-    :param file_list: List of files to sort
-    :return: sorted list of files
-    """
-
-    time_start = []
-    for file in file_list:
-        with Dataset(os.path.join(input_dir, file), 'r') as ds:
-            time_start.append(np.datetime64(ds.time_deployment_start))
-    tuples = sorted(zip(time_start, files_to_agg))
-    return [t[1] for t in tuples]
-
-
 def check_file(nc, site_code):
     """
     Return list of errors found in the file:
@@ -93,27 +78,6 @@ def check_file(nc, site_code):
 
     return error_list
 
-def get_instrument_id(nc):
-    """
-    Create instrument id based on deployment metadata
-    :param nc: xarray dataset
-    :return: instrumentID as string
-    """
-    return '; '.join([nc.deployment_code, nc.instrument, nc.instrument_serial_number])
-
-
-def in_water(nc):
-    """
-    cut data the entire dataset to in-water only timestamps, dropping the out-of-water records.
-    :param nc: xarray dataset
-    :return: xarray dataset
-    """
-    time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
-    time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
-    TIME = nc['TIME'][:]
-    return nc.where((TIME >= time_deployment_start) & (TIME <= time_deployment_end), drop=True)
-
-
 def cell_velocity_resample(df, binning_function, is_WCUR):
     """
     Resample a dataset to a specific time_interval.
@@ -212,7 +176,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     for index, file in enumerate(files_to_agg):
         print(index, end=',', flush=True)
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
-            nc = in_water(nc)
+            nc = utils.in_water(nc)
             error_list = check_file(nc, site_code)
             if error_list:
                 bad_files.append([file, error_list])
@@ -296,7 +260,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
                 is_WCUR = False
 
             ## in-water data only
-            nc = in_water(nc)
+            nc = utils.in_water(nc)
             n_measurements = len(nc.TIME)
 
             ## move timestamp to the middle of the measurement window
@@ -328,7 +292,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             LATITUDE[index] = nc.LATITUDE.values
             LONGITUDE[index] = nc.LONGITUDE.values
             NOMINAL_DEPTH[index] = np.array(utils.get_nominal_depth(nc))
-            instrument_id[index] = get_instrument_id(nc)
+            instrument_id[index] = utils.get_instrument_id(nc)
             source_file[index] = file
 
     print(" ")

From f7c71cdc3d462c38e1648de0721f5427a5e58fc9 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Tue, 25 Feb 2020 09:57:05 +1100
Subject: [PATCH 05/33] rename variable

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 9dd808d..1520d5c 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -233,7 +233,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template)
     LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template)
     NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template)
-    cell_index = ds.createVariable(varname='cell_index', **obs_int_template)
+    CELL_INDEX = ds.createVariable(varname='CELL_INDEX', **obs_int_template)
 
     #SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
 
@@ -277,13 +277,13 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
                     nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
                     slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
                                                      epoch, one_day, is_WCUR)
-                    cell_index[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)
+                    CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)
 
                     slice_start = slice_end
             else:
                 slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
                                                  epoch, one_day, is_WCUR)
-                cell_index[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32)
+                CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32)
 
                 slice_start = slice_end
 

From 103d9c9279dbbcb084a396fab387d9b43abdfe4b Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Mon, 2 Mar 2020 12:32:44 +1100
Subject: [PATCH 06/33] adapt to NETCDF4_CLASSIC. Fix cell_index variable name
 in template

---
 .../velocity_hourly_timeseries.py               | 17 +++++++++--------
 .../velocity_hourly_timeseries_template.json    |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 1520d5c..3396359 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -2,8 +2,8 @@
 import sys
 import tempfile
 import shutil
-from netCDF4 import Dataset, num2date
 import numpy as np
+from netCDF4 import Dataset, num2date, stringtochar
 import json
 from datetime import datetime
 import argparse
@@ -189,17 +189,18 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
 
     ## sort the files in chronological order
     files_to_agg = utils.sort_files(files_to_agg, input_dir=input_dir)
-
+    n_files = len(files_to_agg)
 
     ## create ncdf file, dimensions (unlimited) and variables
-    ds = Dataset(os.path.join(output_dir, temp_outfile), 'w')
+    ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
     OBSERVATION = ds.createDimension('OBSERVATION', size=None)
-    INSTRUMENT = ds.createDimension('INSTRUMENT', size=None)
+    INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files)
+    STRING256 = ds.createDimension("strlen", 256)
 
     obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
     obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
-    obs_int_template = {'datatype': np.uint32, 'zlib': True, 'dimensions': ('OBSERVATION')}
-    inst_S256_template = {'datatype': 'str', 'dimensions': ('INSTRUMENT')}
+    obs_int_template = {'datatype': 'i1', 'zlib': True, 'dimensions': ('OBSERVATION')}
+    inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")}
     inst_float_template ={'datatype': np.float32, 'dimensions': ('INSTRUMENT')}
     inst_double_template ={'datatype': np.float64, 'dimensions': ('INSTRUMENT')}
 
@@ -292,8 +293,8 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             LATITUDE[index] = nc.LATITUDE.values
             LONGITUDE[index] = nc.LONGITUDE.values
             NOMINAL_DEPTH[index] = np.array(utils.get_nominal_depth(nc))
-            instrument_id[index] = utils.get_instrument_id(nc)
-            source_file[index] = file
+            source_file[index] = stringtochar(np.array(file, dtype='S256'))
+            instrument_id[index] = stringtochar(np.array(utils.get_instrument_id(nc), dtype='S256'))
 
     print(" ")
     ## add atributes
diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
index 4fdb4ef..b3a9eec 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
@@ -184,7 +184,7 @@
             "long_name": "count data value in the bin, after rejection of flagged data",
             "cell_methods": "TIME:count"
             },
-        "cell_index": {
+        "CELL_INDEX": {
             "long_name": "index of the corresponding measuring cell",
             "comment": "the closest cell to the sensor is cell 0"
         },

From 2df71ef1e91ab389b0b96230e55309e5f3fd3cd9 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Mon, 2 Mar 2020 13:04:41 +1100
Subject: [PATCH 07/33] integer type to numpy int16

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 3396359..c4eaafc 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -199,7 +199,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
 
     obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
     obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
-    obs_int_template = {'datatype': 'i1', 'zlib': True, 'dimensions': ('OBSERVATION')}
+    obs_int_template = {'datatype': np.int16, 'zlib': True, 'dimensions': ('OBSERVATION')}
     inst_S256_template = {'datatype': 'S1', 'dimensions': ('INSTRUMENT', "strlen")}
     inst_float_template ={'datatype': np.float32, 'dimensions': ('INSTRUMENT')}
     inst_double_template ={'datatype': np.float64, 'dimensions': ('INSTRUMENT')}

From dac77e1ef93db0abe2321b8f8e94a3613b2b3382 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Thu, 5 Mar 2020 16:41:12 +1100
Subject: [PATCH 08/33] remove check fo rseconds_to_middle and don't shift
 TIME. Add seconds_to_middle as a variable

---
 .../velocity_hourly_timeseries.py                | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index c4eaafc..743ae50 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -55,8 +55,8 @@ def check_file(nc, site_code):
         error_list.append('LATITUDE variable missing')
     if 'LONGITUDE' not in variables:
         error_list.append('LONGITUDE variable missing')
-    if 'seconds_to_middle_of_measurement' not in nc['TIME'].attrs:
-        error_list.append('seconds_to_middle_of_measurement not present in TIME')
+    # if 'seconds_to_middle_of_measurement' not in nc['TIME'].attrs:
+    #     error_list.append('seconds_to_middle_of_measurement not present in TIME')
 
     for variable in required_variables:
         if variable not in variables:
@@ -195,7 +195,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
     OBSERVATION = ds.createDimension('OBSERVATION', size=None)
     INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files)
-    STRING256 = ds.createDimension("strlen", 256)
+    STRING256 = ds.createDimension("strlen", size=256)
 
     obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
     obs_float_template = {'datatype': np.float32, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}
@@ -234,6 +234,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     LATITUDE = ds.createVariable(varname='LATITUDE', **inst_double_template)
     LONGITUDE = ds.createVariable(varname='LONGITUDE', **inst_double_template)
     NOMINAL_DEPTH = ds.createVariable(varname='NOMINAL_DEPTH', **inst_float_template)
+    SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
     CELL_INDEX = ds.createVariable(varname='CELL_INDEX', **obs_int_template)
 
     #SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
@@ -265,8 +266,8 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             n_measurements = len(nc.TIME)
 
             ## move timestamp to the middle of the measurement window
-            time_delta_ns = int(nc['TIME'].seconds_to_middle_of_measurement * 10**9)
-            nc['TIME'] = nc['TIME'] + np.timedelta64(time_delta_ns, 'ns')
+            # time_delta_ns = int(nc['TIME'].seconds_to_middle_of_measurement * 10**9)
+            # nc['TIME'] = nc['TIME'] + np.timedelta64(time_delta_ns, 'ns')
 
             if is_2D:
                 ## process all cells, one by one
@@ -295,6 +296,11 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             NOMINAL_DEPTH[index] = np.array(utils.get_nominal_depth(nc))
             source_file[index] = stringtochar(np.array(file, dtype='S256'))
             instrument_id[index] = stringtochar(np.array(utils.get_instrument_id(nc), dtype='S256'))
+            ## add time offset to the middle of the measuring window, if it exists
+            if 'seconds_to_middle_of_measurement' in nc.TIME.attrs:
+                SECONDS_TO_MIDDLE[index] = nc.TIME.seconds_to_middle_of_measurement
+            else:
+                SECONDS_TO_MIDDLE[index] = np.nan
 
     print(" ")
     ## add atributes

From 38296bb0b9970361bf122c67e6bf6a35dc74cd96 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Wed, 11 Mar 2020 08:43:58 +1100
Subject: [PATCH 09/33] implement process by chunks to reduce memory use

---
 .../velocity_hourly_timeseries.py             | 58 ++++++++++---------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 743ae50..6d21482 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -176,12 +176,12 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     for index, file in enumerate(files_to_agg):
         print(index, end=',', flush=True)
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
-            nc = utils.in_water(nc)
             error_list = check_file(nc, site_code)
             if error_list:
                 bad_files.append([file, error_list])
                 rejected_files.append(file)
     print(" ")
+    print(bad_files)
 
     ## remove bad files form the list
     for file in bad_files:
@@ -189,7 +189,6 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
 
     ## sort the files in chronological order
     files_to_agg = utils.sort_files(files_to_agg, input_dir=input_dir)
-    n_files = len(files_to_agg)
 
     ## create ncdf file, dimensions (unlimited) and variables
     ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
@@ -237,7 +236,6 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
     CELL_INDEX = ds.createVariable(varname='CELL_INDEX', **obs_int_template)
 
-    #SECONDS_TO_MIDDLE = ds.createVariable(varname='SECONDS_TO_MIDDLE', **inst_float_template)
 
 
     ## main loop
@@ -261,33 +259,41 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             else:
                 is_WCUR = False
 
-            ## in-water data only
-            nc = utils.in_water(nc)
-            n_measurements = len(nc.TIME)
-
-            ## move timestamp to the middle of the measurement window
-            # time_delta_ns = int(nc['TIME'].seconds_to_middle_of_measurement * 10**9)
-            # nc['TIME'] = nc['TIME'] + np.timedelta64(time_delta_ns, 'ns')
-
-            if is_2D:
-                ## process all cells, one by one
-                cells = nc.HEIGHT_ABOVE_SENSOR.values
-                for cell_idx, cell in enumerate(cells):
-                    ## get cell data, drop HEIGHT_ABOVE_SENSOR dim
-                    nc_cell = nc.where(nc.HEIGHT_ABOVE_SENSOR == cell, drop=True).squeeze('HEIGHT_ABOVE_SENSOR')
-                    ## convert to absolute DEPTH
-                    nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
-                    slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
+
+            ## process in chunks
+            ## in water only
+            chunk_start = np.datetime64(nc.attrs['time_deployment_start'])
+            chunk_end = np.datetime64(nc.attrs['time_deployment_end'])
+
+            time_increment = 60*60*24*90    ## secs x mins x hours x days
+            chunk_increment = np.timedelta64(time_increment, 's')
+            chunk_partial = chunk_start + chunk_increment
+            chunk_index = 0
+            while chunk_start < chunk_partial and chunk_start <= chunk_end:
+                nc_chunk = nc.where((nc.TIME >= chunk_start) & (nc.TIME < chunk_partial), drop=True)
+                if is_2D:
+                    ## process all cells, one by one
+                    cells = nc_chunk.HEIGHT_ABOVE_SENSOR.values
+                    for cell_idx, cell in enumerate(cells):
+                        ## get cell data, drop HEIGHT_ABOVE_SENSOR dim
+                        nc_cell = nc_chunk.where(nc.HEIGHT_ABOVE_SENSOR == cell, drop=True).squeeze('HEIGHT_ABOVE_SENSOR')
+                        ## convert to absolute DEPTH
+                        nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
+                        slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
+                                                         epoch, one_day, is_WCUR)
+                        CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)
+
+                        slice_start = slice_end
+                else:
+                    slice_end = get_resampled_values(nc_chunk, ds, slice_start, varlist, binning_fun,
                                                      epoch, one_day, is_WCUR)
-                    CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)
+                    CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32)
 
                     slice_start = slice_end
-            else:
-                slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
-                                                 epoch, one_day, is_WCUR)
-                CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32)
+                chunk_start = chunk_partial
+                chunk_partial += chunk_increment
+                chunk_index += 1
 
-                slice_start = slice_end
 
             ## metadata variables
             instrument_index[slice_instrument_start:slice_end] = np.repeat(index, slice_end - slice_instrument_start)

From beb401810ee1ca4310297b7ce8d43512ee41fc5e Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Wed, 11 Mar 2020 08:44:54 +1100
Subject: [PATCH 10/33] add CELL_INDEX to documentation

---
 .../Documentation/velocity_aggregated_timeseries.md             | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/aodntools/timeseries_products/Documentation/velocity_aggregated_timeseries.md b/aodntools/timeseries_products/Documentation/velocity_aggregated_timeseries.md
index 1e0144c..9ff6fb8 100644
--- a/aodntools/timeseries_products/Documentation/velocity_aggregated_timeseries.md
+++ b/aodntools/timeseries_products/Documentation/velocity_aggregated_timeseries.md
@@ -90,6 +90,8 @@ In order to keep track of the provenance of the aggregated file, accessory varia
 - `LONGITUDE(INSTRUMENT)`: LONGITUDE per instrument.
 - `NOMINAL_DEPTH(INSTRUMENT)`: nominal depth per instrument, from the input file’s variable `NOMINAL_DEPTH` or global attribute instrument_nominal_depth.
 - `SECONDS_TO_MIDDLE(INSTRUMENT)`:  offset from the timestamp to the middle of the measurement window for each deployment
+- CELL_INDEX(OBSERVATION): index of the corresponding measuring cell
+
 
 
 ### Attributes

From e21db79791dce9010c609e116bf1363e44cfd9d2 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Wed, 11 Mar 2020 09:11:12 +1100
Subject: [PATCH 11/33] documentation for the hourly product

---
 .../velocity_hourly_timeseries.md             | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md

diff --git a/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md b/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md
new file mode 100644
index 0000000..05623e6
--- /dev/null
+++ b/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md
@@ -0,0 +1,125 @@
+# Velocity Aggregated Time Series Product
+
+- [Objective](#objective)
+- [Input](#input)
+- [Method](#method)
+- [Output](#output)
+
+
+
+
+## Objective
+
+This product provides aggregated quality controlled U, V, and W velocity time-series files for each mooring site, without any interpolation or filtering, except for the exclusion of the out-of-water data, binned into 1-hour intervals. For the profiling (ADCP) instruments, the absolute depth of the measuring cell is calculated using the `DEPTH` measured at the instrument and the `HEIGHT_ABOVE_SENSOR`.
+
+The output from a single run of the code will be an aggregated file of all available measurements of the velocity components UCUR, VCUR and (where available) WCUR at one mooring site, binned into 1-hour intervals.
+
+## Input
+
+The aggregation function will accept a list of input files, and the code of the mooring site (`site_code`), in addition to arguments that identify the path of input and output files.
+
+The code aggregates variables and files that meet the following requirements:
+
+- File contains data from only one deployment of one instrument;
+- File is a delayed-mode, quality-controlled product (file version label “FV01”);
+- File is compliant with CF-1.6 and IMOS-1.4 conventions;
+- File contains, at the minimum, the components of current velocity (`UCUR`, `VCUR`), and variables `TIME`, `DEPTH`, `LATITUDE`, `LONGITUDE`, and `HEIGHT_ABOVE_SENSOR` in the case of ADCPs;
+- All files to be aggregated are from the same site, and have the same `site_code` attribute; 
+- Variables to be aggregated have `TIME` and (optionally) `HEIGHT_ABOVE_SENSOR` as their only dimensions (or if `LATITUDE` and `LONGITUDE` are included as dimensions, they have size 1);
+- The in-water data are bounded by the global attributes `time_deployment_start` and `time_deployment_end`;
+
+
+
+The code is able to access the input files either locally, or remotely via the OPeNDAP protocol. 
+
+## Method
+
+Generating function: 
+
+```
+usage: velocity_aggregated_timeseries.py [-h] -site SITE_CODE -files FILENAMES
+                                         [-indir INPUT_DIR]
+                                         [-outdir OUTPUT_DIR]
+                                         [-download_url DOWNLOAD_URL]
+                                         [-opendap_url OPENDAP_URL]
+
+Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments
+from ONE site
+
+optional arguments:
+  -h, --help                    show this help message and exit
+  -site SITE_CODE               site code, like NRMMAI
+  -files FILENAMES              name of the file that contains the source URLs
+  -indir INPUT_DIR              base path of input files
+  -outdir OUTPUT_DIR            path where the result file will be written. Default ./
+  -download_url DOWNLOAD_URL    path to the download_url_prefix
+  -opendap_url OPENDAP_URL      path to the opendap_url_prefix
+
+
+```
+
+
+
+### Input file validation
+
+Before proceeding to the aggregation, each input file will be checked to ensure it meets the requirements (as specified above under Inputs). Any input files that fail to meet the requirements will be excluded from the aggregation, and their URL listed in a global attribute `rejected_files`.
+
+### Dimensions
+
+The dimensions of the resulting file  are determined as follows:
+
+- `OBSERVATION`:    the total number of observation records, excluding out-of-the-water data, in all input files;
+- `INSTRUMENT`:     the number of instruments (i.e. number of files);
+- `strlen`:         a fixed dimension of length 256 for character array variables.
+
+### Variables
+
+Only values flagged as “good” or “probably good” are included. The velocity variables are produced by flattening, and concatenating the arrays in each of the input files. The time of the measurement is shifted according to the seconds_to_middle_of_measurement attribute. The values are then averaged into one-hour time bins (independently within each depth cell for ADCPs). The resulting variables have dimension `OBSERVATION`. 
+
+The variable `TIME` from input files is re-shaped to match the flattened velocity variables, and replaced by one-hour timestamps. The binning intervals will be one hour long, centred on the hour (i.e. HH:00:00). Each timestamp will be repeated once for each ADCP depth cell.
+
+The `DEPTH` variables from input files are averaged into the same one-hour bins, and concatenated into a variable `DEPTH(OBSERVATION)`. In the case of ADCP instruments, the `HEIGHT_ABOVE_SENSOR`  is converted to absolute depth by subtracting each of the height values from the depth measurements at the instrument. 
+
+All output variables with the `INSTRUMENT` dimension are sorted in chronological order, and the input files aggregated chronologically, according to the global attribute time_deployment_start.
+
+In order to keep track of the provenance of the aggregated file, accessory variables are created:
+
+
+- `instrument_index(OBSERVATION)`: index [0:number of files] of the instrument used, referencing the `INSTRUMENT` dimension.
+- `source_file(INSTRUMENT, strlen)`: URLs of the files used
+- `instrument_id(INSTRUMENT, strlen)`: concatenated deployment_code, instrument and instrument_serial_number from the global attributes of each file
+- `LATITUDE(INSTRUMENT)`: LATITUDE per instrument.
+- `LONGITUDE(INSTRUMENT)`: LONGITUDE per instrument.
+- `NOMINAL_DEPTH(INSTRUMENT)`: nominal depth per instrument, from the input file’s variable `NOMINAL_DEPTH` or global attribute instrument_nominal_depth.
+- `CELL_INDEX(OBSERVATION)`: index of the corresponding measuring cell.
+
+
+
+### Attributes
+
+The variable attributes will comply with the IMOS metadata standards.
+
+The global metadata will be a set of IMOS standard attributes. Fixed attributes are read from a [JSON file](../velocity_aggregated_timeseries_template.json) that contains the {key:value} pairs for each of them.
+
+Attributes specific to each aggregated product, are added as follows:
+
+- `site_code`: obtained from the input files (should be the same in all of them);
+- `time_coverage_start`, `time_coverage_end`: set to the full range of TIME values in the aggregated file;
+- `geospatial_vertical_min`, `geospatial_vertical_max`: set to the full range of DEPTH values in the aggregated file;
+- `geospatial_lat_min`, `geospatial_lat_max` : set to the full range of LATITUDE values in the aggregated file;
+- `geospatial_lon_min`, `geospatial_lon_max`: set to the full range of LONGITUDE values in the aggregated file;
+- `date_created`: set to the date/time the product file is created;
+- `history`: set to “<date_created>: Aggregated file created.”;
+- `keywords`: set to a comma-separated list of the main variable names (“UCUR, VCUR, WCUR, DEPTH, AGGREGATED”);
+- `lineage`: a statement about how the file was created, including a link to the code used; 
+- `title`: "Long Timeseries Velocity Hourly Aggregated product: UCUR, VCUR, WCUR, DEPTH at <site_code>  between <time_coverage_start> and <time_coverage_end>"; 
+- `rejected_files`: a list of URLs for files that were in the input files list, but did not meet the input requirements. 
+
+
+## Output
+
+The output from a single run of the code will be an aggregated file of all available current velocity measurements at one mooring site.
+
+The product will be delivered, in netCDF4 classic format, compliant with the CF-1.6 and IMOS-1.4 conventions, and structured according to the [indexed ragged array representation](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_indexed_ragged_array_representation).
+
+

From f6121b337e1839e2d9cc480a07ad6b95a3e68106 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Wed, 11 Mar 2020 09:11:38 +1100
Subject: [PATCH 12/33] documentation for the hourly product README

---
 aodntools/timeseries_products/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aodntools/timeseries_products/README.md b/aodntools/timeseries_products/README.md
index b8407b9..257a878 100644
--- a/aodntools/timeseries_products/README.md
+++ b/aodntools/timeseries_products/README.md
@@ -7,6 +7,7 @@ Documentation:
 - [Hourly time series (non-velocity)](Documentation/Hourly_timeseries.md)
 - [Gridded time series (Temperature)](Documentation/Gridded_timeseries.md)
 - [Velocity aggregated time series](Documentation/Velocity_agrregated_timeseries.md)
+- [Velocity hourly time series](Documentation/velocity_hourly_timeseries.md)
 
 
 Please use the [issue tracker](https://github.com/aodn/python-aodntools/issues) for feedback and suggestions related to these products.

From 186521a30cd17c1d9b022d70e75eb38ceb51cad1 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Wed, 11 Mar 2020 09:29:08 +1100
Subject: [PATCH 13/33] fix instrument dimension error

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 6d21482..c374335 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -193,7 +193,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     ## create ncdf file, dimensions (unlimited) and variables
     ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
     OBSERVATION = ds.createDimension('OBSERVATION', size=None)
-    INSTRUMENT = ds.createDimension('INSTRUMENT', size=n_files)
+    INSTRUMENT = ds.createDimension('INSTRUMENT', size=len(files_to_agg))
     STRING256 = ds.createDimension("strlen", size=256)
 
     obs_double_template = {'datatype': np.float64, 'zlib': True, 'dimensions': ('OBSERVATION'), "fill_value": 99999.0}

From 1a3b2e2ec5523208025c91e40e6bd781c1afb28b Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Thu, 12 Mar 2020 08:50:51 +1100
Subject: [PATCH 14/33] fix dtype in binning function

---
 .../timeseries_products/velocity_hourly_timeseries.py     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index c374335..cfccd30 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -88,13 +88,13 @@ def cell_velocity_resample(df, binning_function, is_WCUR):
     :return: binned U, v, W CUR according to the binning function
     """
     df_binned = df.apply(binning_function)
-    UCUR = df_binned['UCUR']
-    VCUR = df_binned['VCUR']
+    UCUR = np.array(df_binned['UCUR'])
+    VCUR = np.array(df_binned['VCUR'])
     if is_WCUR:
-        WCUR = df_binned['WCUR']
+        WCUR = np.array(df_binned['WCUR'])
     else:
         WCUR = np.full(len(df), np.nan)
-    DEPTH = df_binned['DEPTH']
+    DEPTH = np.array(df_binned['DEPTH'])
 
     return UCUR, VCUR, WCUR, DEPTH
 

From 3a2094fec354c6b6bedc9e83b75e9cecff04fa6b Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Thu, 12 Mar 2020 09:19:44 +1100
Subject: [PATCH 15/33] chunk size as variable

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index cfccd30..058f345 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -168,6 +168,8 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     bad_files = []
     rejected_files = []
 
+    chunk_size = 90  ## size in days
+
     ## default name for temporary file. It will be renamed at the end
     _, temp_outfile = tempfile.mkstemp(suffix='.nc', dir=output_dir)
 
@@ -181,7 +183,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
                 bad_files.append([file, error_list])
                 rejected_files.append(file)
     print(" ")
-    print(bad_files)
+
 
     ## remove bad files form the list
     for file in bad_files:
@@ -265,7 +267,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
             chunk_start = np.datetime64(nc.attrs['time_deployment_start'])
             chunk_end = np.datetime64(nc.attrs['time_deployment_end'])
 
-            time_increment = 60*60*24*90    ## secs x mins x hours x days
+            time_increment = 60*60*24*chunk_size    ## secs x mins x hours x days
             chunk_increment = np.timedelta64(time_increment, 's')
             chunk_partial = chunk_start + chunk_increment
             chunk_index = 0

From 72cd57f4eb414a93f3e01b5c6b8a89f0775463f5 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Thu, 12 Mar 2020 12:13:59 +1100
Subject: [PATCH 16/33] squeeze the dataset to remove extra dimensions in
 variables

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 058f345..4337325 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -113,7 +113,7 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
     :return: end index of the slice
     """
     nc_cell = nc_cell.where(nc_cell.DEPTH_quality_control < 4, drop=True)
-    nc_cell = nc_cell[varlist]
+    nc_cell = nc_cell[varlist].squeeze()
     nc_cell = nc_cell.to_dataframe()
     ## back the index 30min
     nc_cell.index = nc_cell.index - pd.Timedelta(30, units='m')

From 6131301617ceb64f0e60efac805715273bd72604 Mon Sep 17 00:00:00 2001
From: Eduardo Klein <ekleins@gmail.com>
Date: Fri, 13 Mar 2020 11:25:52 +1100
Subject: [PATCH 17/33] Update
 aodntools/timeseries_products/velocity_hourly_timeseries.py

Co-Authored-By: Marty Hidas <marty.hidas@utas.edu.au>
---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 4337325..dab55b5 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -145,9 +145,9 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
 def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
                         download_url_prefix=None, opendap_url_prefix=None):
     """
-    Aggregate U, V and W CUR variables from all deployments at one site.
-    the vertical cells are flattened and related to its depth
-    additional metadata variables are stored to track the origin of the data
+    Aggregate U, V and W CUR variables from the given files (from the same site) and average into hourly bins.
+    The vertical cells are flattened and the actual depth of each is calculated.
+    Additional metadata variables are stored to track the origin of the data.
     :param files_to_agg: list of files to aggregate
     :param site_code: site code
     :param input_dir: base path where source files are stored

From ba68960926c51f2a3a5de3d8a7c2b6c9d04987a3 Mon Sep 17 00:00:00 2001
From: Eduardo Klein <ekleins@gmail.com>
Date: Fri, 13 Mar 2020 11:26:04 +1100
Subject: [PATCH 18/33] Update
 aodntools/timeseries_products/velocity_hourly_timeseries.py

Co-Authored-By: Marty Hidas <marty.hidas@utas.edu.au>
---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index dab55b5..716c7d4 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -378,7 +378,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
     facility_code = utils.get_facility_code(os.path.join(input_dir, files_to_agg[0]))
     data_code = 'VZ'
     product_type = 'hourly-timeseries'
-    file_version = 1
+    file_version = 2
     output_name = '_'.join(['IMOS', facility_code, data_code, time_start_filename, site_code, ('FV0'+str(file_version)),
                             ("velocity-"+product_type),
                             ('END-'+ time_end_filename), 'C-' + datetime.utcnow().strftime(file_timeformat)]) + '.nc'

From 39749605b13d7d701b6e7f295f27c7998b3f9696 Mon Sep 17 00:00:00 2001
From: Eduardo Klein <ekleins@gmail.com>
Date: Fri, 13 Mar 2020 11:36:22 +1100
Subject: [PATCH 19/33] Update
 aodntools/timeseries_products/velocity_hourly_timeseries.py

Co-Authored-By: Marty Hidas <marty.hidas@utas.edu.au>
---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 716c7d4..c57bdca 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -116,7 +116,7 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
     nc_cell = nc_cell[varlist].squeeze()
     nc_cell = nc_cell.to_dataframe()
     ## back the index 30min
-    nc_cell.index = nc_cell.index - pd.Timedelta(30, units='m')
+    nc_cell.index = nc_cell.index - pd.Timedelta(minutes=30)
 
     nc_cell_1H = nc_cell.resample('1H')
     slice_end = len(nc_cell_1H) + slice_start

From 535256c92b7906f5882de260b3b6840172c844c2 Mon Sep 17 00:00:00 2001
From: Eduardo Klein <ekleins@gmail.com>
Date: Fri, 13 Mar 2020 11:37:18 +1100
Subject: [PATCH 20/33] Update
 aodntools/timeseries_products/velocity_hourly_timeseries.py

Co-Authored-By: Marty Hidas <marty.hidas@utas.edu.au>
---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index c57bdca..6d32df5 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -359,7 +359,8 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
 
     ## add version
     github_comment = ('\nThis file was created using https://github.com/aodn/python-aodntools/blob/'
-                      '{v}/aodntools/timeseries_products/aggregated_timeseries.py'.format(v=__version__))
+                      '{v}/aodntools/timeseries_products/{f}'.format(v=__version__, f=os.path.basename(__file__))
+                      )
     global_attribute_dictionary['lineage'] += github_comment
 
     global_attribute_dictionary.update(add_attribute)

From 1ce15085cbb30715f00b8e541019e619233b20c4 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Fri, 13 Mar 2020 11:43:33 +1100
Subject: [PATCH 21/33] changes according to review

---
 .../velocity_hourly_timeseries.py                    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 4337325..52986bd 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -142,8 +142,8 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
 
 
 ## MAIN FUNCTION
-def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
-                        download_url_prefix=None, opendap_url_prefix=None):
+def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
+                               download_url_prefix=None, opendap_url_prefix=None):
     """
     Aggregate U, V and W CUR variables from all deployments at one site.
     the vertical cells are flattened and related to its depth
@@ -278,7 +278,7 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
                     cells = nc_chunk.HEIGHT_ABOVE_SENSOR.values
                     for cell_idx, cell in enumerate(cells):
                         ## get cell data, drop HEIGHT_ABOVE_SENSOR dim
-                        nc_cell = nc_chunk.where(nc.HEIGHT_ABOVE_SENSOR == cell, drop=True).squeeze('HEIGHT_ABOVE_SENSOR')
+                        nc_cell = nc_chunk.sel(HEIGHT_ABOVE_SENSOR=cell)
                         ## convert to absolute DEPTH
                         nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
                         slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
@@ -409,6 +409,6 @@ def velocity_aggregated(files_to_agg, site_code, input_dir='', output_dir='./',
         files_to_agg = [line.rstrip() for line in ff]
 
 
-    print(velocity_aggregated(files_to_agg=files_to_agg, site_code=args.site_code,
-                              input_dir=args.input_dir, output_dir=args.output_dir,
-                              download_url_prefix=args.download_url, opendap_url_prefix=args.opendap_url))
+    print(velocity_hourly_aggregated(files_to_agg=files_to_agg, site_code=args.site_code,
+                                     input_dir=args.input_dir, output_dir=args.output_dir,
+                                     download_url_prefix=args.download_url, opendap_url_prefix=args.opendap_url))

From f8ca463086745b00d58d482a86436b51820a9c32 Mon Sep 17 00:00:00 2001
From: diodon <ekleins@gmail.com>
Date: Fri, 13 Mar 2020 15:58:05 +1100
Subject: [PATCH 22/33] replace variables with QC>2 with NaNs

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 52986bd..da347f3 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -112,7 +112,6 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
     :param is_WCUR: flag indicating if WCUR is present
     :return: end index of the slice
     """
-    nc_cell = nc_cell.where(nc_cell.DEPTH_quality_control < 4, drop=True)
     nc_cell = nc_cell[varlist].squeeze()
     nc_cell = nc_cell.to_dataframe()
     ## back the index 30min
@@ -261,6 +260,10 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
             else:
                 is_WCUR = False
 
+            ## mask values with QC flag>2
+            for var in varlist:
+                nc[var] = nc[var].where(nc[var+'_quality_control']<3)
+
 
             ## process in chunks
             ## in water only

From 8e99d5fc0341e9ef6d4ce1101796328210acb7c2 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Wed, 18 Mar 2020 20:54:16 +1100
Subject: [PATCH 23/33] Bump version to 1.3.0

---
 .bumpversion.cfg      | 2 +-
 aodntools/__init__.py | 2 +-
 setup.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 568565e..d97a799 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.2.5
+current_version = 1.3.0
 commit = False
 tag = False
 tag_name = {new_version}
diff --git a/aodntools/__init__.py b/aodntools/__init__.py
index 09964d6..19b4f1d 100644
--- a/aodntools/__init__.py
+++ b/aodntools/__init__.py
@@ -1 +1 @@
-__version__ = '1.2.5'
+__version__ = '1.3.0'
diff --git a/setup.py b/setup.py
index 931dda1..fa5ffa9 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
 
 setup(
     name=PACKAGE_NAME,
-    version='1.2.5',
+    version='1.3.0',
     packages=find_packages(exclude=PACKAGE_EXCLUDES),
     package_data=PACKAGE_DATA,
     url='https://github.com/aodn',

From bd151bce480eec61bb0b76f16b5f4cd103b7fd4a Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Tue, 31 Mar 2020 16:43:47 +1100
Subject: [PATCH 24/33] Update global attributes and code comments for no time
 shift (also update description of CELL_INDEX and SECONDS_TO_MIDDLE)

---
 .../timeseries_products/velocity_hourly_timeseries.py     | 4 +---
 .../velocity_hourly_timeseries_template.json              | 8 +++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index c2fa655..4bf5c8c 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -27,7 +27,6 @@ def check_file(nc, site_code):
     NOMINAL_DEPTH is not present as variable or attribute
     file_version is not FV01
     if LATITUDE or LONIGITUDE dimension has length >1
-    if TIME.seconds_to_middle_of_measurement exist
 
     :param nc: xarray dataset
     :param site_code: code of the mooring site
@@ -55,8 +54,6 @@ def check_file(nc, site_code):
         error_list.append('LATITUDE variable missing')
     if 'LONGITUDE' not in variables:
         error_list.append('LONGITUDE variable missing')
-    # if 'seconds_to_middle_of_measurement' not in nc['TIME'].attrs:
-    #     error_list.append('seconds_to_middle_of_measurement not present in TIME')
 
     for variable in required_variables:
         if variable not in variables:
@@ -116,6 +113,7 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
     nc_cell = nc_cell.to_dataframe()
     ## back the index 30min
     nc_cell.index = nc_cell.index - pd.Timedelta(minutes=30)
+    # TODO: shift timestamps to centre of sampling interval
 
     nc_cell_1H = nc_cell.resample('1H')
     slice_end = len(nc_cell_1H) + slice_start
diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
index b3a9eec..0023cec 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
@@ -186,18 +186,20 @@
             },
         "CELL_INDEX": {
             "long_name": "index of the corresponding measuring cell",
-            "comment": "the closest cell to the sensor is cell 0"
+            "comment": "Cell index is included for reference only and cannot be used to extract values at constant depth. The number and vertical spacing of cells can vary by instrument and deployment. The actual depth of any given cell can change between deployments, and also varies with time during a deployment. The closest cell to the sensor has index 0."
         },
         "SECONDS_TO_MIDDLE": {
-            "long_name": "number of seconds from timestamp to middle of the measurement window"
+            "long_name": "offset from recorded timestamp to middle of the measurement window in the input file",
+            "units": "s"
         }
     },
     "_global":{
-        "abstract": "Hourly Time-series Product: This file contains all measurements of quality controled U, V and W sea water velocity variables from all instruments deployed at the selected site. Timestamps are chronologically ordered, and binned into 1 hour time intervals. Instrument details are stored as a variable in order to keep a record of the origin of each measurement. Out-of-water measurements have been excluded.",
+        "abstract": "Hourly Time-series Product: This file contains all measurements of quality-controlled U, V and W sea water velocity variables from all instruments deployed at the selected site, binned into 1-hour time intervals. Out-of-water measurements, and those flagged as bad by IMOS standard automated quality-control procedures, have been excluded. Timestamps in the input files indicate the start of each measurement interval (up to an hour in duration), and these have not been shifted to the centre of the interval before binning. Instrument details are stored as variables in order to keep a record of the origin of each measurement.",
         "acknowledgement": "Any users of IMOS data are required to clearly acknowledge the source of the material derived from IMOS in the format: \"Data was sourced from the Integrated Marine Observing System (IMOS) - IMOS is a national collaborative research infrastructure, supported by the Australian Government.\" If relevant, also credit other organisations involved in collection of this particular datastream (as listed in 'credit' in the metadata record).",
         "author": "Klein, Eduardo",
         "author_email": "eduardo.kleinsalas@utas.edu.au",
         "citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\".",
+        "comment": "Timestamps in the input files indicate the start of each measurement interval (instrument-dependent; up to an hour in duration), and these have not been shifted to the centre of the interval before binning. This could lead to an artificial shift of up to half an hour in the output data. The size of this shift, where known, has been recorded in the SECONDS_TO_MIDDLE variable.",
         "Conventions": "CF-1.6,IMOS-1.4",
         "data_centre": "Australian Ocean Data Network (AODN)",
         "data_centre_email": "info@aodn.org.au",

From 2bdf8c28f98fb2dcec6220c842bef61163bb4f6d Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Tue, 31 Mar 2020 17:13:25 +1100
Subject: [PATCH 25/33] import check_file from velocity_aggregated code

---
 .../velocity_hourly_timeseries.py             | 57 +------------------
 1 file changed, 1 insertion(+), 56 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 4bf5c8c..c0b6c7b 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -11,6 +11,7 @@
 from pkg_resources import resource_filename
 from aodntools import __version__
 import aodntools.timeseries_products.aggregated_timeseries as utils
+from aodntools.timeseries_products.velocity_aggregated_timeseries import check_file
 
 import xarray as xr
 import pandas as pd
@@ -19,62 +20,6 @@
 TEMPLATE_JSON = resource_filename(__name__, 'velocity_hourly_timeseries_template.json')
 
 
-def check_file(nc, site_code):
-    """
-    Return list of errors found in the file:
-    Variables of interest are present
-    TIME, DEPTH, LATITUDE, LONGITUDE,  is present
-    NOMINAL_DEPTH is not present as variable or attribute
-    file_version is not FV01
-    if LATITUDE or LONIGITUDE dimension has length >1
-
-    :param nc: xarray dataset
-    :param site_code: code of the mooring site
-    :return: dictionary with the file name and list of failed tests
-    """
-
-    attributes = list(nc.attrs)
-    variables = list(nc.variables)
-    allowed_dimensions = ['TIME', 'LATITUDE', 'LONGITUDE', 'HEIGHT_ABOVE_SENSOR']
-    required_variables = ['UCUR', 'VCUR', 'WCUR', 'DEPTH']
-    error_list = []
-
-    if nc.site_code != site_code:
-        error_list.append('Wrong site_code: ' + nc.site_code)
-
-    if 'Level 1' not in nc.file_version:
-        error_list.append('Wrong file version: ' + nc.file_version)
-    if 'DEPTH' not in variables:
-        error_list.append('DEPTH variable missing')
-    if 'DEPTH' not in variables and 'HEIGHT_ABOVE_SENSOR' not in variables:
-        error_list.append('DEPTH and HEIGHT_ABOVE_SENSOR missing')
-    if 'TIME' not in variables:
-        error_list.append('TIME variable missing')
-    if 'LATITUDE' not in variables:
-        error_list.append('LATITUDE variable missing')
-    if 'LONGITUDE' not in variables:
-        error_list.append('LONGITUDE variable missing')
-
-    for variable in required_variables:
-        if variable not in variables:
-            error_list.append(variable + ' variable missing')
-        else:
-            VoIdimensions = list(nc[variable].dims)
-            if 'TIME' not in VoIdimensions:
-                error_list.append('TIME is not a dimension for ' + variable)
-            if 'LATITUDE' in VoIdimensions and len(nc.LATITUDE) > 1:
-                error_list.append('more than one LATITUDE for ' + variable)
-            if 'LONGITUDE' in VoIdimensions and len(nc.LONGITUDE) > 1:
-                error_list.append('more than one LONGITUDE for ' + variable)
-            for dim in VoIdimensions:
-                if dim not in allowed_dimensions:
-                    error_list.append('not allowed dimension: ' + dim)
-
-    if 'NOMINAL_DEPTH' not in variables and 'instrument_nominal_depth' not in attributes:
-        error_list.append('no NOMINAL_DEPTH')
-
-    return error_list
-
 def cell_velocity_resample(df, binning_function, is_WCUR):
     """
     Resample a dataset to a specific time_interval.

From 76432e4d1299b3ea506b67f59575c8c2374daaf0 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Tue, 31 Mar 2020 17:40:37 +1100
Subject: [PATCH 26/33] various attribute fixes * make cell_methods CF
 compliant * more accurate description in long_name * update author_email to
 AODN one

---
 .../velocity_hourly_timeseries_template.json  | 67 +++++++++----------
 1 file changed, 33 insertions(+), 34 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
index 0023cec..9f9db2b 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries_template.json
@@ -38,31 +38,30 @@
             "units": "m",
             "valid_max": 12000.0,
             "valid_min": -5.0,
-            "DEPTH:cell_methods": "TIME:mean (interval: 1 hr comment: time mid point)"
+            "cell_methods": "TIME: mean"
             },
         "DEPTH_max": {
             "units": "m",
             "standard_name": "depth",
-            "long_name": "max data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:max"
+            "long_name": "maximum depth value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: maximum"
             },
         "DEPTH_min": {
             "units": "m",
             "standard_name": "depth",
-            "long_name": "min data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:min"
+            "long_name": "minimum depth value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: minimum"
             },
         "DEPTH_std": {
             "units": "m",
             "standard_name": "depth",
-            "long_name": "std data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:std"
+            "long_name": "standard deviation of depth values in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: standard_deviation"
             },
         "DEPTH_count": {
             "standard_name": "depth number_of_observations",
             "units": "1",
-            "long_name": "count data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:count"
+            "long_name": "number of depth observations in the bin, after rejection of flagged data"
             },
         "NOMINAL_DEPTH": {
             "axis": "Z",
@@ -90,32 +89,32 @@
             "long_name": "eastward_sea_water_velocity",
             "standard_name": "eastward_sea_water_velocity",
             "units": "m s-1",
+            "cell_methods": "TIME: mean",
             "valid_max": 10.0,
             "valid_min": -10.0
             },
         "UCUR_max": {
             "units": "m s-1",
             "standard_name": "eastward_sea_water_velocity",
-            "long_name": "max data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:max"
+            "long_name": "maximum eastward_sea_water_velocity value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: maximum"
             },
         "UCUR_min": {
             "units": "m s-1",
             "standard_name": "eastward_sea_water_velocity",
-            "long_name": "min data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:min"
+            "long_name": "minimum eastward_sea_water_velocity value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: minimum"
             },
         "UCUR_std": {
             "units": "m s-1",
             "standard_name": "eastward_sea_water_velocity",
-            "long_name": "std data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:std"
+            "long_name": "standard deviation of eastward_sea_water_velocity values in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: standard_deviation"
             },
         "UCUR_count": {
             "standard_name": "eastward_sea_water_velocity number_of_observations",
             "units": "1",
-            "long_name": "count data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:count"
+            "long_name": "number of eastward_sea_water_velocity observations in the bin, after rejection of flagged data"
             },
         "VCUR": {
             "coordinates": "TIME DEPTH LATITUDE LONGITUDE",
@@ -123,32 +122,32 @@
             "long_name": "northward_sea_water_velocity",
             "standard_name": "northward_sea_water_velocity",
             "units": "m s-1",
+            "cell_methods": "TIME: mean",
             "valid_max": 10.0,
             "valid_min": -10.0
             },
         "VCUR_max": {
             "units": "m s-1",
             "standard_name": "northward_sea_water_velocity",
-            "long_name": "max data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:max"
+            "long_name": "maximum northward_sea_water_velocity value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: maximum"
             },
         "VCUR_min": {
             "units": "m s-1",
             "standard_name": "northward_sea_water_velocity",
-            "long_name": "min data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:min"
+            "long_name": "minimum northward_sea_water_velocity value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: minimum"
             },
         "VCUR_std": {
             "units": "m s-1",
             "standard_name": "northward_sea_water_velocity",
-            "long_name": "std data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:std"
+            "long_name": "standard deviation of northward_sea_water_velocity values in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: standard_deviation"
             },
         "VCUR_count": {
             "standard_name": "northward_sea_water_velocity number_of_observations",
             "units": "1",
-            "long_name": "count data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:count"
+            "long_name": "number of northward_sea_water_velocity observations in the bin, after rejection of flagged data"
             },
 
         "WCUR": {
@@ -157,32 +156,32 @@
             "long_name": "upward_sea_water_velocity",
             "standard_name": "upward_sea_water_velocity",
             "units": "m s-1",
+            "cell_methods": "TIME: mean",
             "valid_max": 5.0,
             "valid_min": -5.0
             },
         "WCUR_max": {
             "units": "m s-1",
             "standard_name": "upward_sea_water_velocity",
-            "long_name": "max data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:max"
+            "long_name": "maximum upward_sea_water_velocity value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: maximum"
             },
         "WCUR_min": {
             "units": "m s-1",
             "standard_name": "upward_sea_water_velocity",
-            "long_name": "min data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:min"
+            "long_name": "minimum upward_sea_water_velocity value in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: minimum"
             },
         "WCUR_std": {
             "units": "m s-1",
             "standard_name": "upward_sea_water_velocity",
-            "long_name": "std data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:std"
+            "long_name": "standard deviation of upward_sea_water_velocity values in the bin, after rejection of flagged data",
+            "cell_methods": "TIME: standard_deviation"
             },
         "WCUR_count": {
             "standard_name": "upward_sea_water_velocity number_of_observations",
             "units": "1",
-            "long_name": "count data value in the bin, after rejection of flagged data",
-            "cell_methods": "TIME:count"
+            "long_name": "number of upward_sea_water_velocity observations in the bin, after rejection of flagged data"
             },
         "CELL_INDEX": {
             "long_name": "index of the corresponding measuring cell",
@@ -197,7 +196,7 @@
         "abstract": "Hourly Time-series Product: This file contains all measurements of quality-controlled U, V and W sea water velocity variables from all instruments deployed at the selected site, binned into 1-hour time intervals. Out-of-water measurements, and those flagged as bad by IMOS standard automated quality-control procedures, have been excluded. Timestamps in the input files indicate the start of each measurement interval (up to an hour in duration), and these have not been shifted to the centre of the interval before binning. Instrument details are stored as variables in order to keep a record of the origin of each measurement.",
         "acknowledgement": "Any users of IMOS data are required to clearly acknowledge the source of the material derived from IMOS in the format: \"Data was sourced from the Integrated Marine Observing System (IMOS) - IMOS is a national collaborative research infrastructure, supported by the Australian Government.\" If relevant, also credit other organisations involved in collection of this particular datastream (as listed in 'credit' in the metadata record).",
         "author": "Klein, Eduardo",
-        "author_email": "eduardo.kleinsalas@utas.edu.au",
+        "author_email": "info@aodn.org.au",
         "citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\".",
         "comment": "Timestamps in the input files indicate the start of each measurement interval (instrument-dependent; up to an hour in duration), and these have not been shifted to the centre of the interval before binning. This could lead to an artificial shift of up to half an hour in the output data. The size of this shift, where known, has been recorded in the SECONDS_TO_MIDDLE variable.",
         "Conventions": "CF-1.6,IMOS-1.4",

From d2cda9fa4053a56bc0480c308ad6d53fe33bb711 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Wed, 1 Apr 2020 11:30:00 +1100
Subject: [PATCH 27/33] make bad_files a dict like in other products

---
 .../velocity_hourly_timeseries.py                 | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index c0b6c7b..ce15b47 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -96,7 +96,7 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
     :param output_dir: path where the result file will be written
     :param download_url_prefix: URL prefix for file download (to be prepended to paths in files_to_agg)
     :param opendap_url_prefix: URL prefix for OPENAP access (to be prepended to paths in files_to_agg)
-    :return: file path of the aggregated product, list of rejected files
+    :return: file path of the hourly aggregated product, dict of rejected files: errors
     """
 
     varlist = ['UCUR', 'VCUR', 'WCUR', 'DEPTH']
@@ -107,8 +107,7 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
     epoch = np.datetime64("1950-01-01T00:00:00")
     one_day = np.timedelta64(1, 'D')
 
-    bad_files = []
-    rejected_files = []
+    bad_files = {}
 
     chunk_size = 90  ## size in days
 
@@ -122,14 +121,12 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
             error_list = check_file(nc, site_code)
             if error_list:
-                bad_files.append([file, error_list])
-                rejected_files.append(file)
+                bad_files.update({file: error_list})
     print(" ")
 
-
     ## remove bad files form the list
-    for file in bad_files:
-        files_to_agg.remove(file[0])
+    for file in bad_files.keys():
+        files_to_agg.remove(file)
 
     ## sort the files in chronological order
     files_to_agg = utils.sort_files(files_to_agg, input_dir=input_dir)
@@ -296,7 +293,7 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
                     'date_created':             datetime.utcnow().strftime(timeformat),
                     'history':                  datetime.utcnow().strftime(timeformat) + ': Aggregated file created.',
                     'keywords':                 ', '.join(varlist + ['AGGREGATED']),
-                    'rejected_files':           "\n".join(rejected_files),
+                    'rejected_files':           "\n".join(bad_files.keys()),
                     'contributor_name':        "; ".join(contributor_name),
                     'contributor_email':       "; ".join(contributor_email),
                     'contributor_role':        "; ".join(contributor_role),

From b9de2873bcab189bbe94652477ceb37fcbda4c45 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Wed, 1 Apr 2020 11:42:24 +1100
Subject: [PATCH 28/33] a bit of code clean-up (no functional changes) *
 reorder imports * global variable for max QC flag to include * minor doc
 string edits * clearer variable renames

---
 .../velocity_hourly_timeseries.py             | 54 ++++++++-----------
 1 file changed, 22 insertions(+), 32 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index ce15b47..c3dfa16 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -1,31 +1,29 @@
+import argparse
+import json
 import os
-import sys
-import tempfile
 import shutil
-import numpy as np
-from netCDF4 import Dataset, num2date, stringtochar
-import json
+import tempfile
 from datetime import datetime
-import argparse
 
+import numpy as np
+import pandas as pd
+import xarray as xr
+from netCDF4 import Dataset, num2date, stringtochar
 from pkg_resources import resource_filename
-from aodntools import __version__
+
 import aodntools.timeseries_products.aggregated_timeseries as utils
+from aodntools import __version__
 from aodntools.timeseries_products.velocity_aggregated_timeseries import check_file
 
-import xarray as xr
-import pandas as pd
-
-
 TEMPLATE_JSON = resource_filename(__name__, 'velocity_hourly_timeseries_template.json')
-
+QC_FLAG_MAX = 2
 
 def cell_velocity_resample(df, binning_function, is_WCUR):
     """
     Resample a dataset to a specific time_interval.
     if WCUR not present, returns nan
     :param df: grouped dataframe
-    :param binning_function: function used for binning as non string standard numpy function
+    :param binning_function: name of standard numpy function used for binning
     :param is_WCUR: True if WCUR is present in nc, False otherwise
     :return: binned U, v, W CUR according to the binning function
     """
@@ -41,14 +39,14 @@ def cell_velocity_resample(df, binning_function, is_WCUR):
     return UCUR, VCUR, WCUR, DEPTH
 
 
-def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch, one_day, is_WCUR):
+def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_function, epoch, one_day, is_WCUR):
     """
     get U, V, W current values resampled
     :param nc_cell: xarray DATASET
     :param ds: netcdf4 dataset
     :param slice_start: start index of the slice
     :param varlist: list of variable names to subset the dataset
-    :param binnig_fun: list of function names for binning
+    :param binning_function: list of numpy function names for binning
     :param one_day: timedelta one day
     :param epoch: base epoch
     :param is_WCUR: flag indicating if WCUR is present
@@ -73,7 +71,7 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun, epoch,
     ds['WCUR'][slice_start:slice_end], \
     ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(nc_cell_1H, 'mean', is_WCUR)
 
-    for method in binning_fun:
+    for method in binning_function:
         ds['UCUR_' + method][slice_start:slice_end], \
         ds['VCUR_' + method][slice_start:slice_end], \
         ds['WCUR_' + method][slice_start:slice_end], \
@@ -132,7 +130,7 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
     files_to_agg = utils.sort_files(files_to_agg, input_dir=input_dir)
 
     ## create ncdf file, dimensions (unlimited) and variables
-    ds = Dataset(os.path.join(output_dir, temp_outfile), 'w', format='NETCDF4_CLASSIC')
+    ds = Dataset(temp_outfile, 'w', format='NETCDF4_CLASSIC')
     OBSERVATION = ds.createDimension('OBSERVATION', size=None)
     INSTRUMENT = ds.createDimension('INSTRUMENT', size=len(files_to_agg))
     STRING256 = ds.createDimension("strlen", size=256)
@@ -190,20 +188,12 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
 
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
 
-            if 'HEIGHT_ABOVE_SENSOR' in list(nc.variables):
-                is_2D = True
-            else:
-                is_2D = False
-
-            if 'WCUR' in list(nc.data_vars):
-                is_WCUR = True
-            else:
-                is_WCUR = False
+            is_2D = 'HEIGHT_ABOVE_SENSOR' in list(nc.variables)
+            is_WCUR = 'WCUR' in list(nc.data_vars)
 
             ## mask values with QC flag>2
             for var in varlist:
-                nc[var] = nc[var].where(nc[var+'_quality_control']<3)
-
+                nc[var] = nc[var].where(nc[var+'_quality_control'] <= QC_FLAG_MAX)
 
             ## process in chunks
             ## in water only
@@ -218,12 +208,12 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
                 nc_chunk = nc.where((nc.TIME >= chunk_start) & (nc.TIME < chunk_partial), drop=True)
                 if is_2D:
                     ## process all cells, one by one
-                    cells = nc_chunk.HEIGHT_ABOVE_SENSOR.values
-                    for cell_idx, cell in enumerate(cells):
+                    heights = nc_chunk.HEIGHT_ABOVE_SENSOR.values
+                    for cell_idx, cell_height in enumerate(heights):
                         ## get cell data, drop HEIGHT_ABOVE_SENSOR dim
-                        nc_cell = nc_chunk.sel(HEIGHT_ABOVE_SENSOR=cell)
+                        nc_cell = nc_chunk.sel(HEIGHT_ABOVE_SENSOR=cell_height)
                         ## convert to absolute DEPTH
-                        nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell
+                        nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell_height
                         slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
                                                          epoch, one_day, is_WCUR)
                         CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)

From 786d8ae204a5b5ff7b9388b504aec224d7ff2e21 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Thu, 19 Mar 2020 11:42:15 +1100
Subject: [PATCH 29/33] rename DataFrame object to distinguish it from xarray
 Dataset

---
 .../velocity_hourly_timeseries.py                 | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index c3dfa16..73444e9 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -52,30 +52,29 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_function, ep
     :param is_WCUR: flag indicating if WCUR is present
     :return: end index of the slice
     """
-    nc_cell = nc_cell[varlist].squeeze()
-    nc_cell = nc_cell.to_dataframe()
+    df_cell = nc_cell[varlist].squeeze().to_dataframe()
     ## back the index 30min
-    nc_cell.index = nc_cell.index - pd.Timedelta(minutes=30)
+    df_cell.index = df_cell.index - pd.Timedelta(minutes=30)
     # TODO: shift timestamps to centre of sampling interval
 
-    nc_cell_1H = nc_cell.resample('1H')
-    slice_end = len(nc_cell_1H) + slice_start
+    df_cell_1H = df_cell.resample('1H')
+    slice_end = len(df_cell_1H) + slice_start
 
     ## move time it forward and get it
-    time_slice = ((np.fromiter(nc_cell_1H.groups.keys(), dtype='M8[ns]') + np.timedelta64(1, 'h')) - epoch) / one_day
+    time_slice = ((np.fromiter(df_cell_1H.groups.keys(), dtype='M8[ns]') + np.timedelta64(1, 'h')) - epoch) / one_day
     ds['TIME'][slice_start:slice_end] = time_slice
 
     # take the mean of the variables
     ds['UCUR'][slice_start:slice_end], \
     ds['VCUR'][slice_start:slice_end], \
     ds['WCUR'][slice_start:slice_end], \
-    ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(nc_cell_1H, 'mean', is_WCUR)
+    ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, 'mean', is_WCUR)
 
     for method in binning_function:
         ds['UCUR_' + method][slice_start:slice_end], \
         ds['VCUR_' + method][slice_start:slice_end], \
         ds['WCUR_' + method][slice_start:slice_end], \
-        ds['DEPTH_' + method][slice_start:slice_end] = cell_velocity_resample(nc_cell_1H, method, is_WCUR)
+        ds['DEPTH_' + method][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, method, is_WCUR)
 
     return slice_end
 

From b2e8f4ff2916ee338966c1a26806ed047c6fa2be Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Thu, 19 Mar 2020 11:30:57 +1100
Subject: [PATCH 30/33] simplify 30min time shift for resample

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index 73444e9..b6d7ab0 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -54,14 +54,14 @@ def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_function, ep
     """
     df_cell = nc_cell[varlist].squeeze().to_dataframe()
     ## back the index 30min
-    df_cell.index = df_cell.index - pd.Timedelta(minutes=30)
+    df_cell.index = df_cell.index + pd.Timedelta(minutes=30)
     # TODO: shift timestamps to centre of sampling interval
 
     df_cell_1H = df_cell.resample('1H')
     slice_end = len(df_cell_1H) + slice_start
 
     ## move time it forward and get it
-    time_slice = ((np.fromiter(df_cell_1H.groups.keys(), dtype='M8[ns]') + np.timedelta64(1, 'h')) - epoch) / one_day
+    time_slice = (np.fromiter(df_cell_1H.groups.keys(), dtype='M8[ns]') - epoch) / one_day
     ds['TIME'][slice_start:slice_end] = time_slice
 
     # take the mean of the variables

From 1d72e12fb9e5f120a4eebcc3e2cc4786ce6fe643 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Wed, 1 Apr 2020 12:54:34 +1100
Subject: [PATCH 31/33] update documentation to reflect code changes (add
 comments about not centering the timestamps before binning)

---
 .../Documentation/velocity_hourly_timeseries.md    | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md b/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md
index 05623e6..ddb7386 100644
--- a/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md
+++ b/aodntools/timeseries_products/Documentation/velocity_hourly_timeseries.md
@@ -1,4 +1,4 @@
-# Velocity Aggregated Time Series Product
+# Velocity Hourly Time Series Product
 
 - [Objective](#objective)
 - [Input](#input)
@@ -10,7 +10,7 @@
 
 ## Objective
 
-This product provides aggregated quality controlled U, V, and W velocity time-series files for each mooring site, without any interpolation or filtering, except for the exclusion of the out-of-water data, binned into 1-hour intervals. For the profiling (ADCP) instruments, the absolute depth of the measuring cell is calculated using the `DEPTH` measured at the instrument and the `HEIGHT_ABOVE_SENSOR`.
+This product provides aggregated quality controlled U, V, and W velocity time-series files for each mooring site, binned into 1-hour intervals, including only in-water data flagged as "good" or "probably good" in the input files. QC flags are not included. Statistics related to the averaging process will be stored as variables (standard deviation, minimum and maximum values, number of records binned). For the profiling (ADCP) instruments, the absolute depth of each measuring cell is calculated using the `DEPTH` measured at the instrument and the `HEIGHT_ABOVE_SENSOR` coordinate.
 
 The output from a single run of the code will be an aggregated file of all available measurements of the velocity components UCUR, VCUR and (where available) WCUR at one mooring site, binned into 1-hour intervals.
 
@@ -74,13 +74,15 @@ The dimensions of the resulting file  are determined as follows:
 
 ### Variables
 
-Only values flagged as “good” or “probably good” are included. The velocity variables are produced by flattening, and concatenating the arrays in each of the input files. The time of the measurement is shifted according to the seconds_to_middle_of_measurement attribute. The values are then averaged into one-hour time bins (independently within each depth cell for ADCPs). The resulting variables have dimension `OBSERVATION`. 
+Only in-water velocity measurements flagged as “good” or “probably good” in the input files are included. These values are averaged into one-hour time bins (independently within each depth cell for ADCPs). Timestamps in the input files indicate the start of each measurement interval, and these _have not been shifted to the centre of the interval before binning_. This could lead to an artificial shift of up to half an hour in the output data. The size of this shift, where known, has been recorded in the `SECONDS_TO_MIDDLE` variable.
 
-The variable `TIME` from input files is re-shaped to match the flattened velocity variables, and replaced by one-hour timestamps. The binning intervals will be one hour long, centred on the hour (i.e. HH:00:00). Each timestamp will be repeated once for each ADCP depth cell.
+After this averaging, the velocity variables are flattened into one dimensional arrays, and the arrays from each input file are concatenated into the output file. The resulting variables have dimension `OBSERVATION`. 
+
+The binning intervals will be one hour long, centred on the hour (i.e. HH:00:00). Each timestamp will be repeated once for each ADCP depth cell, in order to match the shape of the velocity variables. The `TIME` coordinate variable in the output file also has dimension `OBSERVATION`.
 
 The `DEPTH` variables from input files are averaged into the same one-hour bins, and concatenated into a variable `DEPTH(OBSERVATION)`. In the case of ADCP instruments, the `HEIGHT_ABOVE_SENSOR`  is converted to absolute depth by subtracting each of the height values from the depth measurements at the instrument. 
 
-All output variables with the `INSTRUMENT` dimension are sorted in chronological order, and the input files aggregated chronologically, according to the global attribute time_deployment_start.
+All output variables with the `INSTRUMENT` dimension are sorted in chronological order, and the input files aggregated chronologically, according to the global attribute `time_deployment_start`.
 
 In order to keep track of the provenance of the aggregated file, accessory variables are created:
 
@@ -99,7 +101,7 @@ In order to keep track of the provenance of the aggregated file, accessory varia
 
 The variable attributes will comply with the IMOS metadata standards.
 
-The global metadata will be a set of IMOS standard attributes. Fixed attributes are read from a [JSON file](../velocity_aggregated_timeseries_template.json) that contains the {key:value} pairs for each of them.
+The global metadata will be a set of IMOS standard attributes. Fixed attributes are read from a [JSON file](../velocity_hourly_timeseries_template.json) that contains the {key:value} pairs for each of them.
 
 Attributes specific to each aggregated product, are added as follows:
 

From 0de304e21e87f4424ee615f80af7580d0aa60006 Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Wed, 1 Apr 2020 12:56:51 +1100
Subject: [PATCH 32/33] fixup: remove unused -path command-line argument

---
 aodntools/timeseries_products/velocity_hourly_timeseries.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index b6d7ab0..b10a48c 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -327,7 +327,6 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
     parser = argparse.ArgumentParser(description="Concatenate X,Y,Z velocity variables from ALL instruments from ALL deployments from ONE site")
     parser.add_argument('-site', dest='site_code', help='site code, like NRMMAI',  required=True)
     parser.add_argument('-files', dest='filenames', help='name of the file that contains the source URLs', required=True)
-    parser.add_argument('-path', dest='output_path', help='path where the result file will be written. Default: ./', default='./', required=False)
     parser.add_argument('-indir', dest='input_dir', help='base path of input files', default='', required=False)
     parser.add_argument('-outdir', dest='output_dir', help='path where the result file will be written. Default ./',
                         default='./', required=False)

From 23325940f319a020751a1c38cd3cfce7d587d2de Mon Sep 17 00:00:00 2001
From: mhidas <marty.hidas@utas.edu.au>
Date: Thu, 2 Apr 2020 17:18:02 +1100
Subject: [PATCH 33/33] rename and refactor get_resampled_values in response to
 review

---
 .../velocity_hourly_timeseries.py             | 60 +++++++++----------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/aodntools/timeseries_products/velocity_hourly_timeseries.py b/aodntools/timeseries_products/velocity_hourly_timeseries.py
index b10a48c..d3cd104 100644
--- a/aodntools/timeseries_products/velocity_hourly_timeseries.py
+++ b/aodntools/timeseries_products/velocity_hourly_timeseries.py
@@ -17,20 +17,24 @@
 
 TEMPLATE_JSON = resource_filename(__name__, 'velocity_hourly_timeseries_template.json')
 QC_FLAG_MAX = 2
+TIME_UNITS = "days since 1950-01-01 00:00:00 UTC"
+TIME_CALENDAR = "gregorian"
+TIME_EPOCH = np.datetime64("1950-01-01T00:00:00")
+ONE_DAY = np.timedelta64(1, 'D')
 
-def cell_velocity_resample(df, binning_function, is_WCUR):
+
+def cell_velocity_resample(df, binning_function):
     """
     Resample a dataset to a specific time_interval.
     if WCUR not present, returns nan
     :param df: grouped dataframe
     :param binning_function: name of standard numpy function used for binning
-    :param is_WCUR: True if WCUR is present in nc, False otherwise
     :return: binned U, v, W CUR according to the binning function
     """
     df_binned = df.apply(binning_function)
     UCUR = np.array(df_binned['UCUR'])
     VCUR = np.array(df_binned['VCUR'])
-    if is_WCUR:
+    if 'WCUR' in df_binned:
         WCUR = np.array(df_binned['WCUR'])
     else:
         WCUR = np.full(len(df), np.nan)
@@ -39,42 +43,40 @@ def cell_velocity_resample(df, binning_function, is_WCUR):
     return UCUR, VCUR, WCUR, DEPTH
 
 
-def get_resampled_values(nc_cell, ds, slice_start, varlist, binning_function, epoch, one_day, is_WCUR):
+def append_resampled_values(nc_cell, ds, slice_start, binning_functions):
     """
-    get U, V, W current values resampled
-    :param nc_cell: xarray DATASET
-    :param ds: netcdf4 dataset
+    Resample U, V, W current and depth values from a single ADCP cell into hourly bins, and
+    append the mean values to the corresponding variables in the output dataset (starting at
+    index slice_start), along with additional statistical variables specified by binning_functions.
+    :param nc_cell: input xarray Dataset representing a single ADCP cell (or point time series)
+    :param ds: output netcdf4 Dataset to update with resampled values
     :param slice_start: start index of the slice
-    :param varlist: list of variable names to subset the dataset
-    :param binning_function: list of numpy function names for binning
-    :param one_day: timedelta one day
-    :param epoch: base epoch
-    :param is_WCUR: flag indicating if WCUR is present
+    :param binning_functions: list of numpy function names for binning
     :return: end index of the slice
     """
-    df_cell = nc_cell[varlist].squeeze().to_dataframe()
-    ## back the index 30min
+    df_cell = nc_cell.squeeze().to_dataframe()
+    # shift the index forward 30min to centre the bins on the hour
     df_cell.index = df_cell.index + pd.Timedelta(minutes=30)
     # TODO: shift timestamps to centre of sampling interval
 
     df_cell_1H = df_cell.resample('1H')
     slice_end = len(df_cell_1H) + slice_start
 
-    ## move time it forward and get it
-    time_slice = (np.fromiter(df_cell_1H.groups.keys(), dtype='M8[ns]') - epoch) / one_day
+    # set binned timestamps
+    time_slice = (np.fromiter(df_cell_1H.groups.keys(), dtype='M8[ns]') - TIME_EPOCH) / ONE_DAY
     ds['TIME'][slice_start:slice_end] = time_slice
 
     # take the mean of the variables
     ds['UCUR'][slice_start:slice_end], \
     ds['VCUR'][slice_start:slice_end], \
     ds['WCUR'][slice_start:slice_end], \
-    ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, 'mean', is_WCUR)
+    ds['DEPTH'][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, 'mean')
 
-    for method in binning_function:
+    for method in binning_functions:
         ds['UCUR_' + method][slice_start:slice_end], \
         ds['VCUR_' + method][slice_start:slice_end], \
         ds['WCUR_' + method][slice_start:slice_end], \
-        ds['DEPTH_' + method][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, method, is_WCUR)
+        ds['DEPTH_' + method][slice_start:slice_end] = cell_velocity_resample(df_cell_1H, method)
 
     return slice_end
 
@@ -98,11 +100,6 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
 
     varlist = ['UCUR', 'VCUR', 'WCUR', 'DEPTH']
     binning_fun = ['max', 'min', 'std', 'count']
-    
-    time_units="days since 1950-01-01 00:00:00 UTC"
-    time_calendar="gregorian"
-    epoch = np.datetime64("1950-01-01T00:00:00")
-    one_day = np.timedelta64(1, 'D')
 
     bad_files = {}
 
@@ -188,7 +185,6 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
         with xr.open_dataset(os.path.join(input_dir, file)) as nc:
 
             is_2D = 'HEIGHT_ABOVE_SENSOR' in list(nc.variables)
-            is_WCUR = 'WCUR' in list(nc.data_vars)
 
             ## mask values with QC flag>2
             for var in varlist:
@@ -213,14 +209,12 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
                         nc_cell = nc_chunk.sel(HEIGHT_ABOVE_SENSOR=cell_height)
                         ## convert to absolute DEPTH
                         nc_cell['DEPTH'] = nc_cell['DEPTH'] - cell_height
-                        slice_end = get_resampled_values(nc_cell, ds, slice_start, varlist, binning_fun,
-                                                         epoch, one_day, is_WCUR)
+                        slice_end = append_resampled_values(nc_cell[varlist], ds, slice_start, binning_fun)
                         CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, cell_idx, dtype=np.uint32)
 
                         slice_start = slice_end
                 else:
-                    slice_end = get_resampled_values(nc_chunk, ds, slice_start, varlist, binning_fun,
-                                                     epoch, one_day, is_WCUR)
+                    slice_end = append_resampled_values(nc_chunk[varlist], ds, slice_start, binning_fun)
                     CELL_INDEX[slice_start:slice_end] = np.full(slice_end - slice_start, 0, dtype=np.uint32)
 
                     slice_start = slice_end
@@ -260,10 +254,10 @@ def velocity_hourly_aggregated(files_to_agg, site_code, input_dir='', output_dir
     timeformat = '%Y-%m-%dT%H:%M:%SZ'
     file_timeformat = '%Y%m%d'
 
-    time_start = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(timeformat)
-    time_end = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(timeformat)
-    time_start_filename = num2date(np.min(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
-    time_end_filename = num2date(np.max(TIME[:]), time_units, time_calendar).strftime(file_timeformat)
+    time_start = num2date(np.min(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(timeformat)
+    time_end = num2date(np.max(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(timeformat)
+    time_start_filename = num2date(np.min(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(file_timeformat)
+    time_end_filename = num2date(np.max(TIME[:]), TIME_UNITS, TIME_CALENDAR).strftime(file_timeformat)
 
 
     contributor_name, contributor_email, contributor_role = utils.get_contributors(files_to_agg=files_to_agg, input_dir=input_dir)