diff --git a/doc/esmvalcore/datafinder.rst b/doc/esmvalcore/datafinder.rst index 206b3d1766..cf7caf0791 100644 --- a/doc/esmvalcore/datafinder.rst +++ b/doc/esmvalcore/datafinder.rst @@ -50,7 +50,7 @@ section. Data retrieval ============== Data retrieval in ESMValTool has two main aspects from the user's point of -view: +view: * data can be found by the tool, subject to availability on disk; * it is the user's responsibility to set the correct data retrieval parameters; @@ -73,7 +73,7 @@ set the paths are ``rootpath`` and ``drs``. ``rootpath`` contains pointers to of directory structure the root paths are structured by. It is important to first discuss the ``drs`` parameter: as we've seen in the previous section, the DRS as a standard is used for both file naming conventions and for directory -structures. +structures. .. _config-user-drs: @@ -81,7 +81,7 @@ Explaining ``config-user/drs: CMIP5:`` or ``config-user/drs: CMIP6:`` --------------------------------------------------------------------- Whreas ESMValTool will **always** use the CMOR standard for file naming (please refer above), by setting the ``drs`` parameter the user tells the tool what -type of root paths they need the data from, e.g.: +type of root paths they need the data from, e.g.: .. code-block:: yaml @@ -156,7 +156,7 @@ Explaining ``config-user/rootpath:`` * ``default``: this is the `root` path(s) to where files are stored without any DRS-like directory structure; in a nutshell, this is a single directory that should contain all the files needed by the run, without any sub-directory - structure. + structure. * ``RAWOBS``: this is the `root` path(s) to where the raw observational data files are stored; this is used by ``cmorize_obs``. @@ -168,7 +168,7 @@ information on the specific datasets that are needed for the analysis. This information, together with the CMOR convention for naming files (see CMOR-DRS_) will allow the tool to search and find the right files. The specific datasets are listed in any recipe, under either the ``datasets`` and/or -``additional_datasets`` sections, e.g. +``additional_datasets`` sections, e.g. .. code-block:: yaml @@ -217,7 +217,7 @@ CMOR-DRS_ and establish the path to the files as: /badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r1i1p1f2/Amon then look for variable ``ta`` and specifically the latest version of the data -file: +file: .. code-block:: @@ -257,3 +257,26 @@ Since observational data are organized in Tiers depending on their level of public availability, the ``default`` directory must be structured accordingly with sub-directories ``TierX`` (``Tier1``, ``Tier2`` or ``Tier3``), even when ``drs: default``. + +Data concatenation from multiple sources +======================================== + +Oftentimes data retrieving results in assembling a continuous data stream from +multiple files or even, multiple experiments. The internal mechanism through which +the assembly is done is via cube concatenation. One peculiarity of iris concatenation +(see `iris cube concatenation `_) +is that it doesn't allow for concatenating time-overlapping cubes; this case is rather +frequent with data from models overlapping in time, and is accounted for by a function that performs a +flexible concatenation between two cubes, depending on the particular setup: + +* cubes overlap in time: resulting cube is made up of the overlapping data plus left and + right hand sides on each side of the overlapping data; note that in the case of the cubes + coming from different experiments the resulting concatenated cube will have composite data + made up from multiple experiments: assume [cube1: exp1, cube2: exp2] and cube1 starts before cube2, + and cube2 finishes after cube1, then the concatenated cube will be made up of cube2: exp2 plus the + section of cube1: exp1 that contains data not provided in cube2: exp2; +* cubes don't overlap in time: data from the two cubes is bolted together; + +Note that two cube concatenation is the base operation of an iterative process of reducing multiple cubes +from multiple data segments via cube concatenation ie if there is no time-overlapping data, the +cubes concatenation is performed in one step. diff --git a/esmvalcore/preprocessor/_io.py b/esmvalcore/preprocessor/_io.py index 22dc6b1c58..468ef023b7 100644 --- a/esmvalcore/preprocessor/_io.py +++ b/esmvalcore/preprocessor/_io.py @@ -13,6 +13,7 @@ import yaml from .._task import write_ncl_settings +from ._time import extract_time logger = logging.getLogger(__name__) @@ -87,9 +88,20 @@ def _fix_cube_attributes(cubes): def concatenate(cubes): """Concatenate all cubes after fixing metadata.""" _fix_cube_attributes(cubes) + concatenated = iris.cube.CubeList(cubes).concatenate() + if len(concatenated) == 2: + try: + concatenated[0].coord('time') + concatenated[1].coord('time') + except iris.exceptions.CoordinateNotFoundError: + pass + else: + concatenated = _concatenate_overlapping_cubes(concatenated) + if len(concatenated) == 1: return concatenated[0] + logger.error('Can not concatenate cubes into a single one.') logger.error('Resulting cubes:') for cube in concatenated: @@ -269,3 +281,90 @@ def _write_ncl_metadata(output_dir, metadata): write_ncl_settings(info, filename) return filename + + +def _concatenate_overlapping_cubes(cubes): + """Concatenate time-overlapping cubes (two cubes only).""" + # we arrange [cube1, cube2] so that cube1.start <= cube2.start + if cubes[0].coord('time').points[0] <= cubes[1].coord('time').points[0]: + cubes = [cubes[0], cubes[1]] + logger.debug( + "Will attempt to concatenate cubes %s " + "and %s in this order", cubes[0], cubes[1]) + else: + cubes = [cubes[1], cubes[0]] + logger.debug( + "Will attempt to concatenate cubes %s " + "and %s in this order", cubes[1], cubes[0]) + + # get time end points + time_1 = cubes[0].coord('time') + time_2 = cubes[1].coord('time') + data_start_1 = time_1.cell(0).point + data_start_2 = time_2.cell(0).point + data_end_1 = time_1.cell(-1).point + data_end_2 = time_2.cell(-1).point + + # case 1: both cubes start at the same time -> return longer cube + if data_start_1 == data_start_2: + if data_end_1 <= data_end_2: + logger.debug( + "Both cubes start at the same time but cube %s " + "ends before %s", cubes[0], cubes[1]) + logger.debug("Cube %s contains all needed data so using it fully", + cubes[1]) + cubes = [cubes[1]] + else: + logger.debug( + "Both cubes start at the same time but cube %s " + "ends before %s", cubes[1], cubes[0]) + logger.debug("Cube %s contains all needed data so using it fully", + cubes[0]) + cubes = [cubes[0]] + + # case 2: cube1 starts before cube2 + else: + # find time overlap, if any + start_overlap = next((time_1.units.num2date(t) + for t in time_1.points if t in time_2.points), + None) + # case 2.0: no overlap (new iris implementaion does allow + # concatenation of cubes with no overlap) + if not start_overlap: + logger.debug( + "Unable to concatenate non-overlapping cubes\n%s\nand\n%s" + "separated in time.", cubes[0], cubes[1]) + # case 2.1: cube1 ends after cube2 -> return cube1 + elif data_end_1 > data_end_2: + cubes = [cubes[0]] + logger.debug("Using only data from %s", cubes[0]) + # case 2.2: cube1 ends before cube2 -> use full cube2 and shorten cube1 + else: + logger.debug( + "Extracting time slice between %s and %s from cube %s to use " + "it for concatenation with cube %s", "-".join([ + str(data_start_1.year), + str(data_start_1.month), + str(data_start_1.day) + ]), "-".join([ + str(start_overlap.year), + str(start_overlap.month), + str(start_overlap.day) + ]), cubes[0], cubes[1]) + c1_delta = extract_time(cubes[0], data_start_1.year, + data_start_1.month, data_start_1.day, + start_overlap.year, start_overlap.month, + start_overlap.day) + cubes = iris.cube.CubeList([c1_delta, cubes[1]]) + logger.debug("Attempting concatenatenation of %s with %s", + c1_delta, cubes[1]) + try: + cubes = [iris.cube.CubeList(cubes).concatenate_cube()] + except iris.exceptions.ConcatenateError as ex: + logger.error('Can not concatenate cubes: %s', ex) + logger.error('Cubes:') + for cube in cubes: + logger.error(cube) + raise ex + + return cubes diff --git a/tests/integration/preprocessor/_io/test_concatenate.py b/tests/integration/preprocessor/_io/test_concatenate.py index 276ada273c..2ee24bae0d 100644 --- a/tests/integration/preprocessor/_io/test_concatenate.py +++ b/tests/integration/preprocessor/_io/test_concatenate.py @@ -3,45 +3,174 @@ import unittest import numpy as np +from cf_units import Unit from iris.coords import DimCoord from iris.cube import Cube +from iris.exceptions import ConcatenateError from esmvalcore.preprocessor import _io class TestConcatenate(unittest.TestCase): """Tests for :func:`esmvalcore.preprocessor._io.concatenate`.""" - def setUp(self): """Start tests.""" - coord = DimCoord([1, 2], var_name='coord') - second_coord = coord.copy([3, 4]) - third_coord = coord.copy([5, 6]) + self._model_coord = DimCoord([1., 2.], + var_name='time', + standard_name='time', + units='days since 1950-01-01') self.raw_cubes = [] + self._add_cube([1., 2.], [1., 2.]) + self._add_cube([3., 4.], [3., 4.]) + self._add_cube([5., 6.], [5., 6.]) + + def _add_cube(self, data, coord): self.raw_cubes.append( - Cube([1, 2], var_name='sample', dim_coords_and_dims=((coord, - 0), ))) - self.raw_cubes.append( - Cube([3, 4], - var_name='sample', - dim_coords_and_dims=((second_coord, 0), ))) - self.raw_cubes.append( - Cube([5, 6], + Cube(data, var_name='sample', - dim_coords_and_dims=((third_coord, 0), ))) + dim_coords_and_dims=((self._model_coord.copy(coord), 0), ))) def test_concatenate(self): """Test concatenation of two cubes.""" concatenated = _io.concatenate(self.raw_cubes) - self.assertTrue((concatenated.coord('coord').points == np.array( - [1, 2, 3, 4, 5, 6])).all()) + np.testing.assert_array_equal( + concatenated.coord('time').points, np.array([1, 2, 3, 4, 5, 6])) - def test_fail_with_duplicates(self): - """Test exception raised if two cubes are overlapping.""" - self.raw_cubes.append(self.raw_cubes[0].copy()) - with self.assertRaises(ValueError): + def test_concatenate_with_overlap(self): + """Test concatenation of time overalapping cubes""" + self._add_cube([6.5, 7.5], [6., 7.]) + concatenated = _io.concatenate(self.raw_cubes) + np.testing.assert_array_equal( + concatenated.coord('time').points, + np.array([1., 2., 3., 4., 5., 6., 7.])) + np.testing.assert_array_equal(concatenated.data, + np.array([1., 2., 3., 4., 5., 6.5, 7.5])) + + def test_concatenate_with_overlap_2(self): + """Test a more generic case.""" + self._add_cube([65., 75.], [3., 200.]) + self._add_cube([65., 75.], [1000., 7000.]) + concatenated = _io.concatenate(self.raw_cubes) + np.testing.assert_array_equal( + concatenated.coord('time').points, + np.array([1., 2., 3., 4., 5., 6., 1000., 7000.])) + + def test_concatenate_with_overlap_same_start(self): + """Test a more generic case.""" + cube1 = self.raw_cubes[0] + raw_cubes = [ + cube1, + ] + time_coord = DimCoord([1., 7.], + var_name='time', + standard_name='time', + units='days since 1950-01-01') + raw_cubes.append( + Cube([33., 55.], + var_name='sample', + dim_coords_and_dims=((time_coord, 0), ))) + concatenated = _io.concatenate(raw_cubes) + np.testing.assert_array_equal( + concatenated.coord('time').points, np.array([1., 7.])) + raw_cubes.reverse() + concatenated = _io.concatenate(raw_cubes) + np.testing.assert_array_equal( + concatenated.coord('time').points, np.array([1., 7.])) + + def test_concatenate_with_iris_exception(self): + """Test a more generic case.""" + time_coord_1 = DimCoord([1.5, 5., 7.], + var_name='time', + standard_name='time', + units='days since 1950-01-01') + cube1 = Cube([33., 55., 77.], + var_name='sample', + dim_coords_and_dims=((time_coord_1, 0), )) + time_coord_2 = DimCoord([1., 5., 7.], + var_name='time', + standard_name='time', + units='days since 1950-01-01') + cube2 = Cube([33., 55., 77.], + var_name='sample', + dim_coords_and_dims=((time_coord_2, 0), )) + cubes_single_ovlp = [cube2, cube1] + with self.assertRaises(ConcatenateError): + _io.concatenate(cubes_single_ovlp) + + def test_concatenate_with_order(self): + """Test a more generic case.""" + time_coord_1 = DimCoord([1.5, 2., 5., 7.], + var_name='time', + standard_name='time', + units='days since 1950-01-01') + cube1 = Cube([33., 44., 55., 77.], + var_name='sample', + dim_coords_and_dims=((time_coord_1, 0), )) + time_coord_2 = DimCoord([1., 2., 5., 7., 100.], + var_name='time', + standard_name='time', + units='days since 1950-01-01') + cube2 = Cube([33., 44., 55., 77., 1000.], + var_name='sample', + dim_coords_and_dims=((time_coord_2, 0), )) + cubes_ordered = [cube2, cube1] + concatenated = _io.concatenate(cubes_ordered) + np.testing.assert_array_equal( + concatenated.coord('time').points, np.array([1., 2., 5., 7., + 100.])) + cubes_reverse = [cube1, cube2] + concatenated = _io.concatenate(cubes_reverse) + np.testing.assert_array_equal( + concatenated.coord('time').points, np.array([1., 2., 5., 7., + 100.])) + + def test_fail_on_calendar_concatenate_with_overlap(self): + """Test fail of concatenation with overlap.""" + time_coord = DimCoord([3., 7000.], + var_name='time', + standard_name='time', + units=Unit('days since 1950-01-01', + calendar='360_day')) + self.raw_cubes.append( + Cube([33., 55.], + var_name='sample', + dim_coords_and_dims=((time_coord, 0), ))) + with self.assertRaises(TypeError): _io.concatenate(self.raw_cubes) + def test_fail_on_units_concatenate_with_overlap(self): + """Test fail of concatenation with overlap.""" + time_coord_1 = DimCoord([3., 7000.], + var_name='time', + standard_name='time', + units=Unit('days since 1950-01-01', + calendar='360_day')) + time_coord_2 = DimCoord([3., 9000.], + var_name='time', + standard_name='time', + units=Unit('days since 1950-01-01', + calendar='360_day')) + time_coord_3 = DimCoord([3., 9000.], + var_name='time', + standard_name='time', + units=Unit('days since 1850-01-01', + calendar='360_day')) + raw_cubes = [] + raw_cubes.append( + Cube([33., 55.], + var_name='sample', + dim_coords_and_dims=((time_coord_1, 0), ))) + raw_cubes.append( + Cube([33., 55.], + var_name='sample', + dim_coords_and_dims=((time_coord_2, 0), ))) + raw_cubes.append( + Cube([33., 55.], + var_name='sample', + dim_coords_and_dims=((time_coord_3, 0), ))) + with self.assertRaises(ValueError): + _io.concatenate(raw_cubes) + def test_fail_metadata_differs(self): """Test exception raised if two cubes have different metadata.""" self.raw_cubes[0].units = 'm' @@ -111,4 +240,4 @@ def test_fix_attributes(self): self.raw_cubes[idx].attributes.update(differing_attrs[idx]) _io._fix_cube_attributes(self.raw_cubes) # noqa for cube in self.raw_cubes: - self.assertTrue(cube.attributes == resulting_attrs) + self.assertEqual(cube.attributes, resulting_attrs)