ESMValGroup · mattiarighi · Feb 11, 2020 · Sep 26, 2019 · Sep 26, 2019 · Sep 26, 2019
diff --git a/doc/esmvalcore/datafinder.rst b/doc/esmvalcore/datafinder.rst
@@ -50,7 +50,7 @@ section.
 Data retrieval
 ==============
 Data retrieval in ESMValTool has two main aspects from the user's point of
-view: 
+view:
 
 * data can be found by the tool, subject to availability on disk;
 * it is the user's responsibility to set the correct data retrieval parameters;
@@ -73,15 +73,15 @@ set the paths are ``rootpath`` and ``drs``. ``rootpath`` contains pointers to
 of directory structure the root paths are structured by. It is important to
 first discuss the ``drs`` parameter: as we've seen in the previous section, the
 DRS as a standard is used for both file naming conventions and for directory
-structures. 
+structures.
 
 .. _config-user-drs:
 
 Explaining ``config-user/drs: CMIP5:`` or ``config-user/drs: CMIP6:``
 ---------------------------------------------------------------------
 Whreas ESMValTool will **always** use the CMOR standard for file naming (please
 refer above), by setting the ``drs`` parameter the user tells the tool what
-type of root paths they need the data from, e.g.: 
+type of root paths they need the data from, e.g.:
 
   .. code-block:: yaml
 
@@ -156,7 +156,7 @@ Explaining ``config-user/rootpath:``
 * ``default``: this is the `root` path(s) to where files are stored without any
   DRS-like directory structure; in a nutshell, this is a single directory that
   should contain all the files needed by the run, without any sub-directory
-  structure. 
+  structure.
 
 * ``RAWOBS``: this is the `root` path(s) to where the raw observational data
   files are stored; this is used by ``cmorize_obs``.
@@ -168,7 +168,7 @@ information on the specific datasets that are needed for the analysis. This
 information, together with the CMOR convention for naming files (see CMOR-DRS_)
 will allow the tool to search and find the right files. The specific
 datasets are listed in any recipe, under either the ``datasets`` and/or
-``additional_datasets`` sections, e.g. 
+``additional_datasets`` sections, e.g.
 
 .. code-block:: yaml
 
@@ -217,7 +217,7 @@ CMOR-DRS_ and establish the path to the files as:
   /badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r1i1p1f2/Amon
 
 then look for variable ``ta`` and specifically the latest version of the data
-file: 
+file:
 
 .. code-block::
 
@@ -257,3 +257,26 @@ Since observational data are organized in Tiers depending on their level of
 public availability, the ``default`` directory must be structured accordingly
 with sub-directories ``TierX`` (``Tier1``, ``Tier2`` or ``Tier3``), even when
 ``drs: default``.
+
+Data concatenation from multiple sources
+========================================
+
+Oftentimes data retrieving results in assembling a continuous data stream from
+multiple files or even, multiple experiments. The internal mechanism through which
+the assembly is done is via cube concatenation. One peculiarity of iris concatenation
+(see `iris cube concatenation <https://scitools.org.uk/iris/docs/latest/userguide/merge_and_concat.html>`_)
+is that it doesn't allow for concatenating time-overlapping cubes; this case is rather
+frequent with data from models overlapping in time, and is accounted for by a function that performs a
+flexible concatenation between two cubes, depending on the particular setup:
+
+* cubes overlap in time: resulting cube is made up of the overlapping data plus left and
+  right hand sides on each side of the overlapping data; note that in the case of the cubes
+  coming from different experiments the resulting concatenated cube will have composite data
+  made up from multiple experiments: assume [cube1: exp1, cube2: exp2] and cube1 starts before cube2,
+  and cube2 finishes after cube1, then the concatenated cube will be made up of cube2: exp2 plus the
+  section of cube1: exp1 that contains data not provided in cube2: exp2;
+* cubes don't overlap in time: data from the two cubes is bolted together;
+
+Note that two cube concatenation is the base operation of an iterative process of reducing multiple cubes
+from multiple data segments via cube concatenation ie if there is no time-overlapping data, the
+cubes concatenation is performed in one step.
diff --git a/esmvalcore/preprocessor/_io.py b/esmvalcore/preprocessor/_io.py
@@ -13,6 +13,7 @@
 import yaml
 
 from .._task import write_ncl_settings
+from ._time import extract_time
 
 logger = logging.getLogger(__name__)
 
@@ -87,9 +88,20 @@ def _fix_cube_attributes(cubes):
 def concatenate(cubes):
     """Concatenate all cubes after fixing metadata."""
     _fix_cube_attributes(cubes)
+
     concatenated = iris.cube.CubeList(cubes).concatenate()
+    if len(concatenated) == 2:
+        try:
+            concatenated[0].coord('time')
+            concatenated[1].coord('time')
+        except iris.exceptions.CoordinateNotFoundError:
+            pass
+        else:
+            concatenated = _concatenate_overlapping_cubes(concatenated)
+
     if len(concatenated) == 1:
         return concatenated[0]
+
     logger.error('Can not concatenate cubes into a single one.')
     logger.error('Resulting cubes:')
     for cube in concatenated:
@@ -269,3 +281,90 @@ def _write_ncl_metadata(output_dir, metadata):
     write_ncl_settings(info, filename)
 
     return filename
+
+
+def _concatenate_overlapping_cubes(cubes):
+    """Concatenate time-overlapping cubes (two cubes only)."""
+    # we arrange [cube1, cube2] so that cube1.start <= cube2.start
+    if cubes[0].coord('time').points[0] <= cubes[1].coord('time').points[0]:
+        cubes = [cubes[0], cubes[1]]
+        logger.debug(
+            "Will attempt to concatenate cubes %s "
+            "and %s in this order", cubes[0], cubes[1])
+    else:
+        cubes = [cubes[1], cubes[0]]
+        logger.debug(
+            "Will attempt to concatenate cubes %s "
+            "and %s in this order", cubes[1], cubes[0])
+
+    # get time end points
+    time_1 = cubes[0].coord('time')
+    time_2 = cubes[1].coord('time')
+    data_start_1 = time_1.cell(0).point
+    data_start_2 = time_2.cell(0).point
+    data_end_1 = time_1.cell(-1).point
+    data_end_2 = time_2.cell(-1).point
+
+    # case 1: both cubes start at the same time -> return longer cube
+    if data_start_1 == data_start_2:
+        if data_end_1 <= data_end_2:
+            logger.debug(
+                "Both cubes start at the same time but cube %s "
+                "ends before %s", cubes[0], cubes[1])
+            logger.debug("Cube %s contains all needed data so using it fully",
+                         cubes[1])
+            cubes = [cubes[1]]
+        else:
+            logger.debug(
+                "Both cubes start at the same time but cube %s "
+                "ends before %s", cubes[1], cubes[0])
+            logger.debug("Cube %s contains all needed data so using it fully",
+                         cubes[0])
+            cubes = [cubes[0]]
+
+    # case 2: cube1 starts before cube2
+    else:
+        # find time overlap, if any
+        start_overlap = next((time_1.units.num2date(t)
+                              for t in time_1.points if t in time_2.points),
+                             None)
+        # case 2.0: no overlap (new iris implementaion does allow
+        # concatenation of cubes with no overlap)
+        if not start_overlap:
+            logger.debug(
+                "Unable to concatenate non-overlapping cubes\n%s\nand\n%s"
+                "separated in time.", cubes[0], cubes[1])
+        # case 2.1: cube1 ends after cube2 -> return cube1
+        elif data_end_1 > data_end_2:
+            cubes = [cubes[0]]
+            logger.debug("Using only data from %s", cubes[0])
+        # case 2.2: cube1 ends before cube2 -> use full cube2 and shorten cube1
+        else:
+            logger.debug(
+                "Extracting time slice between %s and %s from cube %s to use "
+                "it for concatenation with cube %s", "-".join([
+                    str(data_start_1.year),
+                    str(data_start_1.month),
+                    str(data_start_1.day)
+                ]), "-".join([
+                    str(start_overlap.year),
+                    str(start_overlap.month),
+                    str(start_overlap.day)
+                ]), cubes[0], cubes[1])
+            c1_delta = extract_time(cubes[0], data_start_1.year,
+                                    data_start_1.month, data_start_1.day,
+                                    start_overlap.year, start_overlap.month,
+                                    start_overlap.day)
+            cubes = iris.cube.CubeList([c1_delta, cubes[1]])
+            logger.debug("Attempting concatenatenation of %s with %s",
+                         c1_delta, cubes[1])
+            try:
+                cubes = [iris.cube.CubeList(cubes).concatenate_cube()]
+            except iris.exceptions.ConcatenateError as ex:
+                logger.error('Can not concatenate cubes: %s', ex)
+                logger.error('Cubes:')
+                for cube in cubes:
+                    logger.error(cube)
+                raise ex
+
+    return cubes