From 16ba4a2a996a404a80f5dedacb6811e91c49331a Mon Sep 17 00:00:00 2001
From: Franziska Winterstein <franziska.winterstein@dlr.de>
Date: Thu, 28 May 2026 17:03:22 +0200
Subject: [PATCH 01/21] implemented the option to test for significance when
 comparing with a reference

---
 .../config/configurations/defaults/extra_facets_emac.yml   | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/esmvalcore/config/configurations/defaults/extra_facets_emac.yml b/esmvalcore/config/configurations/defaults/extra_facets_emac.yml
index 169a69860f..8ac3d9d05d 100644
--- a/esmvalcore/config/configurations/defaults/extra_facets_emac.yml
+++ b/esmvalcore/config/configurations/defaults/extra_facets_emac.yml
@@ -257,9 +257,12 @@ projects:
           ta:
             raw_name: [tm1_cav, tm1_ave, tm1]
             channel: Amon
+          # tro3:
+          #   raw_name: [O3_cav, O3_ave, O3]
+          #   channel: Amon
           tro3:
-            raw_name: [O3_cav, O3_ave, O3]
-            channel: Amon
+            raw_name: [O3_p39_cav, O3_cav, O3_ave, O3]
+            channel: AERmon
           ua:
             raw_name: [um1_cav, um1_ave, um1]
             channel: Amon

From 1f407788c833f239c6d073aca4d9c5176f580d65 Mon Sep 17 00:00:00 2001
From: Franziska Winterstein <franziska.winterstein@dlr.de>
Date: Thu, 11 Jun 2026 14:06:13 +0200
Subject: [PATCH 02/21] correct attaching of ancillary variable

---
 esmvalcore/preprocessor/_compare_with_refs.py | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 4e6fc6c180..b812ce754c 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -8,10 +8,12 @@
 
 import dask
 import dask.array as da
+import dask.array.stats
 import iris.analysis
 import iris.analysis.stats
 import numpy as np
 from iris.common.metadata import CubeMetadata
+from iris.coords import AncillaryVariable
 from iris.coords import CellMethod
 from iris.cube import Cube, CubeList
 from scipy.stats import wasserstein_distance
@@ -47,6 +49,7 @@ def bias(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
     bias_type: BiasType = "absolute",
+    ttest: bool = False,
     denominator_mask_threshold: float = 1e-3,
     keep_reference_dataset: bool = False,
 ) -> set[PreprocessorFile] | CubeList:
@@ -81,6 +84,11 @@ def bias(
     bias_type:
         Bias type that is calculated. Must be one of ``'absolute'`` (dataset -
         ref) or ``'relative'`` ((dataset - ref) / ref).
+    ttest:
+        If ``True`` calculate the p-value according to the Welch's Test between
+        dataset and reference. The p-value is attached as an ancillary variable
+        to the dataset. A Welch's Test is chosen, since unequal variances and
+        sample sizes are likely.
     denominator_mask_threshold:
         Threshold to mask values close to zero in the denominator (i.e., the
         reference dataset) during the calculation of relative biases. All
@@ -138,7 +146,7 @@ def bias(
 
     # If input is an Iterable of Cube objects, calculate bias for each element
     if all_cubes_given:
-        cubes = [_calculate_bias(c, reference, bias_type) for c in products]
+        cubes = [_calculate_bias(c, reference, bias_type, ttest) for c in products]
         return CubeList(cubes)
 
     # Otherwise, iterate over all input products, calculate bias and adapt
@@ -150,7 +158,7 @@ def bias(
         cube = concatenate(product.cubes)
 
         # Calculate bias
-        cube = _calculate_bias(cube, reference, bias_type)
+        cube = _calculate_bias(cube, reference, bias_type, ttest)
 
         # Adapt metadata and provenance information
         product.attributes["units"] = str(cube.units)
@@ -196,10 +204,13 @@ def _get_ref(
     return (reference, ref_product)
 
 
-def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube:
+def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: bool) -> Cube:
     """Calculate bias for a single cube relative to a reference cube."""
     cube_metadata = cube.metadata
 
+    if ttest:
+        p_value, remaining_axis = _calculate_ttest(cube, reference)
+
     if bias_type == "absolute":
         cube = cube - reference
         new_units = cube.units
@@ -215,10 +226,29 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube:
 
     cube.metadata = cube_metadata
     cube.units = new_units
+    # Attach ancillary variable after the calculation
+    # otherwise, it will be lost
+    if ttest:
+        cube.add_ancillary_variable(
+            AncillaryVariable(p_value,
+                              long_name="pvalue"),
+            remaining_axis)
 
     return cube
 
 
+def _calculate_ttest(cube: Cube, reference: Cube) -> Cube:
+    """Calculate the Welch's t-test and attach p_value as an ancillary variable."""
+    axis = cube.coord_dims("time")[0]
+    remaining_axis = sorted(set(range(cube.ndim)) - set([axis]))
+
+    ttest = dask.array.stats.ttest_ind(cube, reference, 0, False)
+    shape = tuple([list(cube.shape)[dim] for dim in remaining_axis])
+    p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
+
+    return p_value, remaining_axis
+
+
 MetricType = Literal[
     "rmse",
     "weighted_rmse",

From e22937c8ea5866340f77848ed07b0f3330415267 Mon Sep 17 00:00:00 2001
From: Franziska Winterstein <franziska.winterstein@dlr.de>
Date: Fri, 12 Jun 2026 11:34:06 +0200
Subject: [PATCH 03/21] prepared ttest as its own preprocessor function, still
 open how to keep ancillary variables attached through arbitrary preprocessor
 functions

---
 esmvalcore/preprocessor/__init__.py           |   4 +-
 esmvalcore/preprocessor/_compare_with_refs.py | 151 ++++++++++++++++--
 2 files changed, 141 insertions(+), 14 deletions(-)

diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py
index 798edb36a8..4a16f06207 100644
--- a/esmvalcore/preprocessor/__init__.py
+++ b/esmvalcore/preprocessor/__init__.py
@@ -31,7 +31,7 @@
     meridional_statistics,
     zonal_statistics,
 )
-from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric
+from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric, ttest
 from esmvalcore.preprocessor._concatenate import concatenate
 from esmvalcore.preprocessor._cycles import amplitude
 from esmvalcore.preprocessor._dask_progress import _compute_with_progress
@@ -219,6 +219,7 @@
     # Comparison with reference datasets
     "bias",
     "distance_metric",
+    "ttest",
     # Remove supplementary variables from cube
     "remove_supplementary_variables",
     # Save to file
@@ -256,6 +257,7 @@
 MULTI_MODEL_FUNCTIONS: set[str] = {
     "bias",
     "distance_metric",
+    "ttest",
     "ensemble_statistics",
     "multi_model_statistics",
     "mask_multimodel",
diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index b812ce754c..f7f396a3b1 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -45,6 +45,107 @@
 BiasType = Literal["absolute", "relative"]
 
 
+def ttest(
+    products: set[PreprocessorFile] | Iterable[Cube],
+    reference: Cube | None = None,
+    siglvl: float = 0.05,
+) -> set[PreprocessorFile] | CubeList:
+    """Calculate the ttest between dataset and reference dataset.
+
+    The reference dataset needs to be broadcastable to all input `products`.
+    This supports `iris' rich broadcasting abilities
+    <https://scitools-iris.readthedocs.io/en/stable/userguide/cube_maths.
+    html#calculating-a-cube-anomaly>`__. To ensure this, the preprocessors
+    :func:`esmvalcore.preprocessor.regrid` and/or
+    :func:`esmvalcore.preprocessor.regrid_time` might be helpful.
+
+    Notes
+    -----
+    The reference dataset can be specified with the `reference` argument. If
+    `reference` is ``None``, exactly one input dataset in the `products` set
+    needs to have the facet ``reference_for_bias: true`` defined in the recipe.
+    Please do **not** specify the option `reference` when using this
+    preprocessor function in a recipe.
+
+    Parameters
+    ----------
+    products:
+        Input datasets/cubes for which the significance is calculated relative
+        to a reference dataset/cube.
+    reference:
+        Cube which is used as reference for the bttest calculation. If ``None``,
+        `products` needs to be a :obj:`set` of
+        `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
+        dataset in `products` needs the facet ``reference_for_bias: true``. Do
+        not specify this argument in a recipe.
+    siglvl:
+        significance level which is used to create the mask of significant and
+        non-significant data points.
+
+    Returns
+    -------
+    set[PreprocessorFile] | CubeList
+        Output datasets/cubes. Will be a :obj:`set` of
+        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if
+        `products` is also one, a :class:`~iris.cube.CubeList` otherwise.
+        The cubes contain an attached ancillary variable which is a mask of the
+        cube defining significant (1) and non-significant (0) values related
+        to the bias (in terms of absolute) of cube and reference.
+
+    Raises
+    ------
+    ValueError
+        Not exactly one input datasets contains the facet ``reference_for_bias:
+        true`` if ``reference=None``; ``reference=None`` and the input products
+        are given as iterable of :class:`~iris.cube.Cube` objects;
+        ``bias_type`` is not one of ``'absolute'`` or ``'relative'``.
+    """
+    ref_product = None
+    all_cubes_given = all(isinstance(p, Cube) for p in products)
+
+    # Get reference cube if not explicitly given
+    if reference is None:
+        if all_cubes_given:
+            msg = (
+                "A list of Cubes is given to this preprocessor; please "
+                "specify a `reference`"
+            )
+            raise ValueError(msg)
+        (reference, ref_product) = _get_ref(products, "reference_for_bias")
+    else:
+        ref_product = None
+
+    # If input is an Iterable of Cube objects, calculate ttest for each element
+    if all_cubes_given:
+        cubes = [_calculate_ttest(c, reference, siglvl) for c in products]
+        return CubeList(cubes)
+
+    # Otherwise, iterate over all input products, calculate ttest and adapt
+    # metadata and provenance information accordingly
+    output_products = set()
+    for product in products:
+        if product == ref_product:
+            continue
+        cube = concatenate(product.cubes)
+
+        # Calculate ttest
+        cube = _calculate_ttest(cube, reference, siglvl)
+
+        # Adapt metadata and provenance information
+        product.attributes["units"] = str(cube.units)
+        if ref_product is not None:
+            product.wasderivedfrom(ref_product)
+
+        product.cubes = CubeList([cube])
+        output_products.add(product)
+    print(cube)
+    # Add reference dataset to output (always)
+    if ref_product is not None:
+        output_products.add(ref_product)
+
+    return output_products
+
+
 def bias(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
@@ -208,8 +309,8 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: boo
     """Calculate bias for a single cube relative to a reference cube."""
     cube_metadata = cube.metadata
 
-    if ttest:
-        p_value, remaining_axis = _calculate_ttest(cube, reference)
+    # if ttest:
+    #     p_value, remaining_axis = _calculate_ttest(cube, reference)
 
     if bias_type == "absolute":
         cube = cube - reference
@@ -226,27 +327,51 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: boo
 
     cube.metadata = cube_metadata
     cube.units = new_units
+
     # Attach ancillary variable after the calculation
     # otherwise, it will be lost
-    if ttest:
-        cube.add_ancillary_variable(
-            AncillaryVariable(p_value,
-                              long_name="pvalue"),
-            remaining_axis)
+    # if ttest:
+    #     cube.add_ancillary_variable(
+    #         AncillaryVariable(p_value,
+    #                           long_name="pvalue"),
+    #         remaining_axis)
 
     return cube
 
 
-def _calculate_ttest(cube: Cube, reference: Cube) -> Cube:
-    """Calculate the Welch's t-test and attach p_value as an ancillary variable."""
+# def _calculate_ttest(cube: Cube, reference: Cube) -> Cube:
+ #    """Calculate the Welch's t-test and attach p_value as an ancillary variable."""
+ #    axis = cube.coord_dims("time")[0]
+ #    remaining_axis = sorted(set(range(cube.ndim)) - set([axis]))
+
+ #    ttest = dask.array.stats.ttest_ind(cube, reference, 0, False)
+ #    shape = tuple([list(cube.shape)[dim] for dim in remaining_axis])
+ #    p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
+
+ #    return p_value, remaining_axis
+
+
+def _calculate_ttest(cube: Cube, reference: Cube, siglvl: float) -> Cube:
+    """Calculate the Welch's t-test and attach a mask of significance."""
+    cube_metadata = cube.metadata
+
     axis = cube.coord_dims("time")[0]
-    remaining_axis = sorted(set(range(cube.ndim)) - set([axis]))
+    remaining_axes = sorted(set(range(cube.ndim)) - set([axis]))
 
     ttest = dask.array.stats.ttest_ind(cube, reference, 0, False)
-    shape = tuple([list(cube.shape)[dim] for dim in remaining_axis])
+    shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
     p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
 
-    return p_value, remaining_axis
+    significance = da.where(p_value <= siglvl, 1, 0)
+
+    cube.add_ancillary_variable(
+        AncillaryVariable(significance, long_name="significance"),
+        remaining_axes
+    )
+
+    cube.metadata = cube_metadata
+
+    return cube
 
 
 MetricType = Literal[
@@ -267,7 +392,7 @@ def distance_metric(
     keep_reference_dataset: bool = True,
     **kwargs: Any,
 ) -> set[PreprocessorFile] | CubeList:
-    r"""Calculate distance metrics.
+    """Calculate distance metrics.
 
     All input datasets need to have identical dimensional coordinates. This can
     for example be ensured with the preprocessors

From fc338d067324766b1d2cb2dcb425aea873592564 Mon Sep 17 00:00:00 2001
From: Franziska Winterstein <franziska.winterstein@dlr.de>
Date: Tue, 23 Jun 2026 18:06:50 +0200
Subject: [PATCH 04/21] Reverted code s.t. p_value is attached and not a mask
 0/1

---
 esmvalcore/preprocessor/_compare_with_refs.py | 86 +++++++++----------
 1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index f7f396a3b1..7b21a75887 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -48,7 +48,9 @@
 def ttest(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
-    siglvl: float = 0.05,
+    coordinate: str | None = 'time',
+    equal_var: bool = False,
+    **kwargs: Any,
 ) -> set[PreprocessorFile] | CubeList:
     """Calculate the ttest between dataset and reference dataset.
 
@@ -73,24 +75,31 @@ def ttest(
         Input datasets/cubes for which the significance is calculated relative
         to a reference dataset/cube.
     reference:
-        Cube which is used as reference for the bttest calculation. If ``None``,
+        Cube which is used as reference for the ttest calculation. If ``None``,
         `products` needs to be a :obj:`set` of
         `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
         dataset in `products` needs the facet ``reference_for_bias: true``. Do
         not specify this argument in a recipe.
-    siglvl:
-        significance level which is used to create the mask of significant and
-        non-significant data points.
+    coordinate:
+        Coordinate axis of the input along which to compute the statistic.
+        Default is 'time'.
+    equal_var:
+        If False (default), perform Welch's t-test, which does not assume equal
+        population variances. Can be set to True, to perform a standard
+        independent 2 sample test that assumes equal population variances, if
+        equality is assured
+    **kwargs:
+        Any other parameter allowed by dask.array.ttest_ind.
 
     Returns
     -------
     set[PreprocessorFile] | CubeList
         Output datasets/cubes. Will be a :obj:`set` of
-        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if
-        `products` is also one, a :class:`~iris.cube.CubeList` otherwise.
-        The cubes contain an attached ancillary variable which is a mask of the
-        cube defining significant (1) and non-significant (0) values related
-        to the bias (in terms of absolute) of cube and reference.
+        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if `products`
+        is also one, a :class:`~iris.cube.CubeList` otherwise.  The cubes
+        contain an attached ancillary variable which is the p-value w.r.t. the
+        chosen ttest related to the bias (in terms of absolute) of cube and
+        reference.
 
     Raises
     ------
@@ -99,6 +108,7 @@ def ttest(
         true`` if ``reference=None``; ``reference=None`` and the input products
         are given as iterable of :class:`~iris.cube.Cube` objects;
         ``bias_type`` is not one of ``'absolute'`` or ``'relative'``.
+
     """
     ref_product = None
     all_cubes_given = all(isinstance(p, Cube) for p in products)
@@ -117,7 +127,8 @@ def ttest(
 
     # If input is an Iterable of Cube objects, calculate ttest for each element
     if all_cubes_given:
-        cubes = [_calculate_ttest(c, reference, siglvl) for c in products]
+        cubes = [_calculate_ttest(c, reference, coordinate, equal_var, **kwargs)
+                 for c in products]
         return CubeList(cubes)
 
     # Otherwise, iterate over all input products, calculate ttest and adapt
@@ -129,7 +140,8 @@ def ttest(
         cube = concatenate(product.cubes)
 
         # Calculate ttest
-        cube = _calculate_ttest(cube, reference, siglvl)
+        cube = _calculate_ttest(cube, reference, coordinate, equal_var,
+                                **kwargs)
 
         # Adapt metadata and provenance information
         product.attributes["units"] = str(cube.units)
@@ -138,7 +150,7 @@ def ttest(
 
         product.cubes = CubeList([cube])
         output_products.add(product)
-    print(cube)
+
     # Add reference dataset to output (always)
     if ref_product is not None:
         output_products.add(ref_product)
@@ -185,11 +197,6 @@ def bias(
     bias_type:
         Bias type that is calculated. Must be one of ``'absolute'`` (dataset -
         ref) or ``'relative'`` ((dataset - ref) / ref).
-    ttest:
-        If ``True`` calculate the p-value according to the Welch's Test between
-        dataset and reference. The p-value is attached as an ancillary variable
-        to the dataset. A Welch's Test is chosen, since unequal variances and
-        sample sizes are likely.
     denominator_mask_threshold:
         Threshold to mask values close to zero in the denominator (i.e., the
         reference dataset) during the calculation of relative biases. All
@@ -309,8 +316,9 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: boo
     """Calculate bias for a single cube relative to a reference cube."""
     cube_metadata = cube.metadata
 
-    # if ttest:
-    #     p_value, remaining_axis = _calculate_ttest(cube, reference)
+    # save ancillary variables as they get removed in the bias calculation
+    anc_var = cube.ancillary_variables()
+    anc_var_dims = [ cube.ancillary_variable_dims(anc) for anc in anc_var ]
 
     if bias_type == "absolute":
         cube = cube - reference
@@ -328,44 +336,30 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: boo
     cube.metadata = cube_metadata
     cube.units = new_units
 
-    # Attach ancillary variable after the calculation
-    # otherwise, it will be lost
-    # if ttest:
-    #     cube.add_ancillary_variable(
-    #         AncillaryVariable(p_value,
-    #                           long_name="pvalue"),
-    #         remaining_axis)
+    # reattach ancillary variables
+    for anc, dims in zip(anc_var, anc_var_dims):
+        cube.add_ancillary_variable(anc, dims)
 
     return cube
 
 
-# def _calculate_ttest(cube: Cube, reference: Cube) -> Cube:
- #    """Calculate the Welch's t-test and attach p_value as an ancillary variable."""
- #    axis = cube.coord_dims("time")[0]
- #    remaining_axis = sorted(set(range(cube.ndim)) - set([axis]))
-
- #    ttest = dask.array.stats.ttest_ind(cube, reference, 0, False)
- #    shape = tuple([list(cube.shape)[dim] for dim in remaining_axis])
- #    p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
-
- #    return p_value, remaining_axis
-
-
-def _calculate_ttest(cube: Cube, reference: Cube, siglvl: float) -> Cube:
-    """Calculate the Welch's t-test and attach a mask of significance."""
+def _calculate_ttest(cube: Cube, reference: Cube, coordinate, equal_var, **kwargs) -> Cube:
+    """Calculate the Welch's t-test and attach the p-value."""
     cube_metadata = cube.metadata
 
-    axis = cube.coord_dims("time")[0]
+    axis = cube.coord_dims(coordinate)[0]
     remaining_axes = sorted(set(range(cube.ndim)) - set([axis]))
 
-    ttest = dask.array.stats.ttest_ind(cube, reference, 0, False)
+    ttest = dask.array.stats.ttest_ind(cube,
+                                       reference,
+                                       axis=axis,
+                                       equal_var=equal_var)
     shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
-    p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
 
-    significance = da.where(p_value <= siglvl, 1, 0)
+    p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
 
     cube.add_ancillary_variable(
-        AncillaryVariable(significance, long_name="significance"),
+        AncillaryVariable(p_value, long_name="p_value"),
         remaining_axes
     )
 

From 6cf75acd7790b45a435607f54af82db4778de05c Mon Sep 17 00:00:00 2001
From: Franziska Winterstein <franziska.winterstein@dlr.de>
Date: Wed, 24 Jun 2026 12:22:14 +0200
Subject: [PATCH 05/21] added example recipe where ttest is used

---
 test_ttest.yml | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 test_ttest.yml

diff --git a/test_ttest.yml b/test_ttest.yml
new file mode 100644
index 0000000000..18c20ed49b
--- /dev/null
+++ b/test_ttest.yml
@@ -0,0 +1,90 @@
+# ESMValTool
+# This recipe was successfully run with
+#   - ESMValTool version:
+#   - ESMValCore version:
+---
+documentation:
+  title: Example recipe for monitor/multi_datasets.py
+
+  description: This recipe includes the zonal mean run by the diagnostic monitor/multi_datasets.py and MESSy data.
+
+  authors:
+    - winterstein_franziska
+
+  maintainer:
+    - winterstein_franziska
+
+  references:
+    - acknow_project
+
+datasets:
+  - {alias: ERA5, project: native6, dataset: ERA5, type: reanaly, version: v1, tier: 3, reference_for_bias: True}
+  - {alias: RD1-PSrad-02, dataset: EMAC, project: EMAC, ensemble: EMAC, exp: RD1-PSrad-02, supplementary_variables: [{short_name: areacella, skip: true}] }
+
+preprocessors:
+
+  zonal_mean_bias:
+    custom_order: true
+    regrid:
+      scheme: linear
+      target_grid: ERA5
+      lon_offset: false
+    extract_levels:
+      levels: {cmor_table: CMIP6, coordinate: plev39}
+      scheme: linear
+      coordinate: air_pressure
+    zonal_statistics:
+      operator: mean
+    annual_statistics:
+      operator: mean
+    regrid_time:
+      frequency: 'yr'
+    ttest: true
+    climate_statistics:
+      operator: mean
+      period: full
+    bias:
+      bias_type: absolute
+      # Note that if bias is used you need to specify 'reference_to_bias: True'
+      # in that dataset above, which should serve as the reference
+      # bias_type "absolute" is recommended for temperature
+      # for other tracer it could make sense to use "relative"
+
+diagnostics:
+
+  ttest:
+    description: Zonal mean bias of temperature with ttest
+    themes:
+      - phys
+    realms:
+      - atmos
+    variables:
+      ta:
+        mip: Amon
+        channel: Amon
+        raw_name: tm1_p39_cav
+        start_year: 2000
+        end_year: 2001
+        preprocessor: zonal_mean_bias
+    scripts:
+      plot_zonal_mean:
+        script: monitor/multi_datasets.py
+        facets_used_for_labels: alias
+        plot_folder: '{plot_dir}'
+        plot_filename: '{plot_type}_{alias}-reference_{preprocessor}'
+        plots:
+          zonal_mean_profile:
+            siglvl: 0.05
+            common_cbar: true
+            plot_kwargs:
+              default:
+                cmap: 'RdBu'
+                levels: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
+            pyplot_kwargs:
+              title: '{alias} - ERA5'
+              # this corresponds to the requirements of copernicus
+              xticks:
+                ticks: [-60, -30, 0, 30, 60]
+                labels: ["60° S", "30° S", "0°", "30° N", "60° N"]
+              xlabel: latitude
+              ylabel: Pressure [hPa]

From 48844bcc2bdb3ec8b78c9d39a3c052384bcaac99 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 09:07:26 +0200
Subject: [PATCH 06/21] Remove recipe from repo

---
 test_ttest.yml | 90 --------------------------------------------------
 1 file changed, 90 deletions(-)
 delete mode 100644 test_ttest.yml

diff --git a/test_ttest.yml b/test_ttest.yml
deleted file mode 100644
index 18c20ed49b..0000000000
--- a/test_ttest.yml
+++ /dev/null
@@ -1,90 +0,0 @@
-# ESMValTool
-# This recipe was successfully run with
-#   - ESMValTool version:
-#   - ESMValCore version:
----
-documentation:
-  title: Example recipe for monitor/multi_datasets.py
-
-  description: This recipe includes the zonal mean run by the diagnostic monitor/multi_datasets.py and MESSy data.
-
-  authors:
-    - winterstein_franziska
-
-  maintainer:
-    - winterstein_franziska
-
-  references:
-    - acknow_project
-
-datasets:
-  - {alias: ERA5, project: native6, dataset: ERA5, type: reanaly, version: v1, tier: 3, reference_for_bias: True}
-  - {alias: RD1-PSrad-02, dataset: EMAC, project: EMAC, ensemble: EMAC, exp: RD1-PSrad-02, supplementary_variables: [{short_name: areacella, skip: true}] }
-
-preprocessors:
-
-  zonal_mean_bias:
-    custom_order: true
-    regrid:
-      scheme: linear
-      target_grid: ERA5
-      lon_offset: false
-    extract_levels:
-      levels: {cmor_table: CMIP6, coordinate: plev39}
-      scheme: linear
-      coordinate: air_pressure
-    zonal_statistics:
-      operator: mean
-    annual_statistics:
-      operator: mean
-    regrid_time:
-      frequency: 'yr'
-    ttest: true
-    climate_statistics:
-      operator: mean
-      period: full
-    bias:
-      bias_type: absolute
-      # Note that if bias is used you need to specify 'reference_to_bias: True'
-      # in that dataset above, which should serve as the reference
-      # bias_type "absolute" is recommended for temperature
-      # for other tracer it could make sense to use "relative"
-
-diagnostics:
-
-  ttest:
-    description: Zonal mean bias of temperature with ttest
-    themes:
-      - phys
-    realms:
-      - atmos
-    variables:
-      ta:
-        mip: Amon
-        channel: Amon
-        raw_name: tm1_p39_cav
-        start_year: 2000
-        end_year: 2001
-        preprocessor: zonal_mean_bias
-    scripts:
-      plot_zonal_mean:
-        script: monitor/multi_datasets.py
-        facets_used_for_labels: alias
-        plot_folder: '{plot_dir}'
-        plot_filename: '{plot_type}_{alias}-reference_{preprocessor}'
-        plots:
-          zonal_mean_profile:
-            siglvl: 0.05
-            common_cbar: true
-            plot_kwargs:
-              default:
-                cmap: 'RdBu'
-                levels: [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
-            pyplot_kwargs:
-              title: '{alias} - ERA5'
-              # this corresponds to the requirements of copernicus
-              xticks:
-                ticks: [-60, -30, 0, 30, 60]
-                labels: ["60° S", "30° S", "0°", "30° N", "60° N"]
-              xlabel: latitude
-              ylabel: Pressure [hPa]

From 04802a9e58f1dbfb5983bc06ae5f7b06c00a2b4e Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 09:24:10 +0200
Subject: [PATCH 07/21] pre-commit

---
 esmvalcore/preprocessor/__init__.py           | 10 ++-
 esmvalcore/preprocessor/_compare_with_refs.py | 82 ++++++++++---------
 2 files changed, 51 insertions(+), 41 deletions(-)

diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py
index 4a16f06207..6418a0289f 100644
--- a/esmvalcore/preprocessor/__init__.py
+++ b/esmvalcore/preprocessor/__init__.py
@@ -31,7 +31,11 @@
     meridional_statistics,
     zonal_statistics,
 )
-from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric, ttest
+from esmvalcore.preprocessor._compare_with_refs import (
+    bias,
+    distance_metric,
+    t_test,
+)
 from esmvalcore.preprocessor._concatenate import concatenate
 from esmvalcore.preprocessor._cycles import amplitude
 from esmvalcore.preprocessor._dask_progress import _compute_with_progress
@@ -219,7 +223,7 @@
     # Comparison with reference datasets
     "bias",
     "distance_metric",
-    "ttest",
+    "t_test",
     # Remove supplementary variables from cube
     "remove_supplementary_variables",
     # Save to file
@@ -257,7 +261,7 @@
 MULTI_MODEL_FUNCTIONS: set[str] = {
     "bias",
     "distance_metric",
-    "ttest",
+    "t_test",
     "ensemble_statistics",
     "multi_model_statistics",
     "mask_multimodel",
diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 7b21a75887..f80b6eb8d4 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -13,8 +13,7 @@
 import iris.analysis.stats
 import numpy as np
 from iris.common.metadata import CubeMetadata
-from iris.coords import AncillaryVariable
-from iris.coords import CellMethod
+from iris.coords import AncillaryVariable, AuxCoord, CellMethod, DimCoord
 from iris.cube import Cube, CubeList
 from scipy.stats import wasserstein_distance
 
@@ -45,14 +44,13 @@
 BiasType = Literal["absolute", "relative"]
 
 
-def ttest(
+def t_test(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
-    coordinate: str | None = 'time',
-    equal_var: bool = False,
+    coordinate: str | None = "time",
     **kwargs: Any,
 ) -> set[PreprocessorFile] | CubeList:
-    """Calculate the ttest between dataset and reference dataset.
+    """Calculate the t-test between dataset and reference dataset.
 
     The reference dataset needs to be broadcastable to all input `products`.
     This supports `iris' rich broadcasting abilities
@@ -75,7 +73,7 @@ def ttest(
         Input datasets/cubes for which the significance is calculated relative
         to a reference dataset/cube.
     reference:
-        Cube which is used as reference for the ttest calculation. If ``None``,
+        Cube which is used as reference for the t-test calculation. If ``None``,
         `products` needs to be a :obj:`set` of
         `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
         dataset in `products` needs the facet ``reference_for_bias: true``. Do
@@ -83,13 +81,8 @@ def ttest(
     coordinate:
         Coordinate axis of the input along which to compute the statistic.
         Default is 'time'.
-    equal_var:
-        If False (default), perform Welch's t-test, which does not assume equal
-        population variances. Can be set to True, to perform a standard
-        independent 2 sample test that assumes equal population variances, if
-        equality is assured
     **kwargs:
-        Any other parameter allowed by dask.array.ttest_ind.
+        Additional keyword arguments passed to :func:`dask.array.ttest_ind`.
 
     Returns
     -------
@@ -98,7 +91,7 @@ def ttest(
         :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if `products`
         is also one, a :class:`~iris.cube.CubeList` otherwise.  The cubes
         contain an attached ancillary variable which is the p-value w.r.t. the
-        chosen ttest related to the bias (in terms of absolute) of cube and
+        chosen t-test related to the bias (in terms of absolute) of cube and
         reference.
 
     Raises
@@ -125,13 +118,15 @@ def ttest(
     else:
         ref_product = None
 
-    # If input is an Iterable of Cube objects, calculate ttest for each element
+    # If input is an Iterable of Cube objects, calculate t-test for each element
     if all_cubes_given:
-        cubes = [_calculate_ttest(c, reference, coordinate, equal_var, **kwargs)
-                 for c in products]
+        cubes = [
+            _calculate_t_test(c, reference, coordinate, **kwargs)
+            for c in products
+        ]
         return CubeList(cubes)
 
-    # Otherwise, iterate over all input products, calculate ttest and adapt
+    # Otherwise, iterate over all input products, calculate t-test and adapt
     # metadata and provenance information accordingly
     output_products = set()
     for product in products:
@@ -139,9 +134,8 @@ def ttest(
             continue
         cube = concatenate(product.cubes)
 
-        # Calculate ttest
-        cube = _calculate_ttest(cube, reference, coordinate, equal_var,
-                                **kwargs)
+        # Calculate t-test
+        cube = _calculate_t_test(cube, reference, coordinate, **kwargs)
 
         # Adapt metadata and provenance information
         product.attributes["units"] = str(cube.units)
@@ -162,7 +156,6 @@ def bias(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
     bias_type: BiasType = "absolute",
-    ttest: bool = False,
     denominator_mask_threshold: float = 1e-3,
     keep_reference_dataset: bool = False,
 ) -> set[PreprocessorFile] | CubeList:
@@ -254,7 +247,7 @@ def bias(
 
     # If input is an Iterable of Cube objects, calculate bias for each element
     if all_cubes_given:
-        cubes = [_calculate_bias(c, reference, bias_type, ttest) for c in products]
+        cubes = [_calculate_bias(c, reference, bias_type) for c in products]
         return CubeList(cubes)
 
     # Otherwise, iterate over all input products, calculate bias and adapt
@@ -266,7 +259,7 @@ def bias(
         cube = concatenate(product.cubes)
 
         # Calculate bias
-        cube = _calculate_bias(cube, reference, bias_type, ttest)
+        cube = _calculate_bias(cube, reference, bias_type)
 
         # Adapt metadata and provenance information
         product.attributes["units"] = str(cube.units)
@@ -312,13 +305,19 @@ def _get_ref(
     return (reference, ref_product)
 
 
-def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: bool) -> Cube:
+def _calculate_bias(
+    cube: Cube,
+    reference: Cube,
+    bias_type: BiasType,
+) -> Cube:
     """Calculate bias for a single cube relative to a reference cube."""
     cube_metadata = cube.metadata
 
     # save ancillary variables as they get removed in the bias calculation
-    anc_var = cube.ancillary_variables()
-    anc_var_dims = [ cube.ancillary_variable_dims(anc) for anc in anc_var ]
+    ancillary_variables_and_dims = [
+        (a, cube.ancillary_variable_dims(a))
+        for a in cube.ancillary_variables()
+    ]
 
     if bias_type == "absolute":
         cube = cube - reference
@@ -337,30 +336,37 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType, ttest: boo
     cube.units = new_units
 
     # reattach ancillary variables
-    for anc, dims in zip(anc_var, anc_var_dims):
-        cube.add_ancillary_variable(anc, dims)
+    for ancillary_variable, dims in ancillary_variables_and_dims:
+        cube.add_ancillary_variable(ancillary_variable, dims)
 
     return cube
 
 
-def _calculate_ttest(cube: Cube, reference: Cube, coordinate, equal_var, **kwargs) -> Cube:
+def _calculate_t_test(
+    cube: Cube,
+    reference: Cube,
+    coordinate: str | AuxCoord | DimCoord,
+    **kwargs: Any,
+) -> Cube:
     """Calculate the Welch's t-test and attach the p-value."""
     cube_metadata = cube.metadata
 
     axis = cube.coord_dims(coordinate)[0]
-    remaining_axes = sorted(set(range(cube.ndim)) - set([axis]))
+    remaining_axes = sorted(set(range(cube.ndim)) - {axis})
 
-    ttest = dask.array.stats.ttest_ind(cube,
-                                       reference,
-                                       axis=axis,
-                                       equal_var=equal_var)
+    t_test = dask.array.stats.ttest_ind(
+        cube,
+        reference,
+        axis=axis,
+        **kwargs,
+    )
     shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
 
-    p_value = da.from_delayed(ttest.pvalue, shape, dtype=float)
+    p_value = da.from_delayed(t_test.pvalue, shape, dtype=float)
 
     cube.add_ancillary_variable(
         AncillaryVariable(p_value, long_name="p_value"),
-        remaining_axes
+        remaining_axes,
     )
 
     cube.metadata = cube_metadata
@@ -386,7 +392,7 @@ def distance_metric(
     keep_reference_dataset: bool = True,
     **kwargs: Any,
 ) -> set[PreprocessorFile] | CubeList:
-    """Calculate distance metrics.
+    r"""Calculate distance metrics.
 
     All input datasets need to have identical dimensional coordinates. This can
     for example be ensured with the preprocessors

From 1caccba288fedc4d06948b056134560cf8ddb6ed Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 09:24:38 +0200
Subject: [PATCH 08/21] Optimize preprocessor order

---
 esmvalcore/preprocessor/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py
index 6418a0289f..520c67ddb0 100644
--- a/esmvalcore/preprocessor/__init__.py
+++ b/esmvalcore/preprocessor/__init__.py
@@ -221,9 +221,9 @@
     # Multi model statistics
     "multi_model_statistics",
     # Comparison with reference datasets
+    "t_test",
     "bias",
     "distance_metric",
-    "t_test",
     # Remove supplementary variables from cube
     "remove_supplementary_variables",
     # Save to file
@@ -261,11 +261,11 @@
 MULTI_MODEL_FUNCTIONS: set[str] = {
     "bias",
     "distance_metric",
-    "t_test",
     "ensemble_statistics",
     "multi_model_statistics",
     "mask_multimodel",
     "mask_fillvalues",
+    "t_test",
 }
 
 

From b4e86cf19f643accc3cde9f340e8293cf0ff5723 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 09:25:57 +0200
Subject: [PATCH 09/21] Optimize function order

---
 esmvalcore/preprocessor/_compare_with_refs.py | 280 +++++++++---------
 1 file changed, 140 insertions(+), 140 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index f80b6eb8d4..afc7769414 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -44,114 +44,6 @@
 BiasType = Literal["absolute", "relative"]
 
 
-def t_test(
-    products: set[PreprocessorFile] | Iterable[Cube],
-    reference: Cube | None = None,
-    coordinate: str | None = "time",
-    **kwargs: Any,
-) -> set[PreprocessorFile] | CubeList:
-    """Calculate the t-test between dataset and reference dataset.
-
-    The reference dataset needs to be broadcastable to all input `products`.
-    This supports `iris' rich broadcasting abilities
-    <https://scitools-iris.readthedocs.io/en/stable/userguide/cube_maths.
-    html#calculating-a-cube-anomaly>`__. To ensure this, the preprocessors
-    :func:`esmvalcore.preprocessor.regrid` and/or
-    :func:`esmvalcore.preprocessor.regrid_time` might be helpful.
-
-    Notes
-    -----
-    The reference dataset can be specified with the `reference` argument. If
-    `reference` is ``None``, exactly one input dataset in the `products` set
-    needs to have the facet ``reference_for_bias: true`` defined in the recipe.
-    Please do **not** specify the option `reference` when using this
-    preprocessor function in a recipe.
-
-    Parameters
-    ----------
-    products:
-        Input datasets/cubes for which the significance is calculated relative
-        to a reference dataset/cube.
-    reference:
-        Cube which is used as reference for the t-test calculation. If ``None``,
-        `products` needs to be a :obj:`set` of
-        `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
-        dataset in `products` needs the facet ``reference_for_bias: true``. Do
-        not specify this argument in a recipe.
-    coordinate:
-        Coordinate axis of the input along which to compute the statistic.
-        Default is 'time'.
-    **kwargs:
-        Additional keyword arguments passed to :func:`dask.array.ttest_ind`.
-
-    Returns
-    -------
-    set[PreprocessorFile] | CubeList
-        Output datasets/cubes. Will be a :obj:`set` of
-        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if `products`
-        is also one, a :class:`~iris.cube.CubeList` otherwise.  The cubes
-        contain an attached ancillary variable which is the p-value w.r.t. the
-        chosen t-test related to the bias (in terms of absolute) of cube and
-        reference.
-
-    Raises
-    ------
-    ValueError
-        Not exactly one input datasets contains the facet ``reference_for_bias:
-        true`` if ``reference=None``; ``reference=None`` and the input products
-        are given as iterable of :class:`~iris.cube.Cube` objects;
-        ``bias_type`` is not one of ``'absolute'`` or ``'relative'``.
-
-    """
-    ref_product = None
-    all_cubes_given = all(isinstance(p, Cube) for p in products)
-
-    # Get reference cube if not explicitly given
-    if reference is None:
-        if all_cubes_given:
-            msg = (
-                "A list of Cubes is given to this preprocessor; please "
-                "specify a `reference`"
-            )
-            raise ValueError(msg)
-        (reference, ref_product) = _get_ref(products, "reference_for_bias")
-    else:
-        ref_product = None
-
-    # If input is an Iterable of Cube objects, calculate t-test for each element
-    if all_cubes_given:
-        cubes = [
-            _calculate_t_test(c, reference, coordinate, **kwargs)
-            for c in products
-        ]
-        return CubeList(cubes)
-
-    # Otherwise, iterate over all input products, calculate t-test and adapt
-    # metadata and provenance information accordingly
-    output_products = set()
-    for product in products:
-        if product == ref_product:
-            continue
-        cube = concatenate(product.cubes)
-
-        # Calculate t-test
-        cube = _calculate_t_test(cube, reference, coordinate, **kwargs)
-
-        # Adapt metadata and provenance information
-        product.attributes["units"] = str(cube.units)
-        if ref_product is not None:
-            product.wasderivedfrom(ref_product)
-
-        product.cubes = CubeList([cube])
-        output_products.add(product)
-
-    # Add reference dataset to output (always)
-    if ref_product is not None:
-        output_products.add(ref_product)
-
-    return output_products
-
-
 def bias(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
@@ -342,38 +234,6 @@ def _calculate_bias(
     return cube
 
 
-def _calculate_t_test(
-    cube: Cube,
-    reference: Cube,
-    coordinate: str | AuxCoord | DimCoord,
-    **kwargs: Any,
-) -> Cube:
-    """Calculate the Welch's t-test and attach the p-value."""
-    cube_metadata = cube.metadata
-
-    axis = cube.coord_dims(coordinate)[0]
-    remaining_axes = sorted(set(range(cube.ndim)) - {axis})
-
-    t_test = dask.array.stats.ttest_ind(
-        cube,
-        reference,
-        axis=axis,
-        **kwargs,
-    )
-    shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
-
-    p_value = da.from_delayed(t_test.pvalue, shape, dtype=float)
-
-    cube.add_ancillary_variable(
-        AncillaryVariable(p_value, long_name="p_value"),
-        remaining_axes,
-    )
-
-    cube.metadata = cube_metadata
-
-    return cube
-
-
 MetricType = Literal[
     "rmse",
     "weighted_rmse",
@@ -752,3 +612,143 @@ def _get_emd(
     if np.ma.is_masked(arr) or np.ma.is_masked(ref_arr):
         return np.ma.masked  # this is safe because PMFs will be masked arrays
     return wasserstein_distance(bin_centers, bin_centers, arr, ref_arr)
+
+
+def t_test(
+    products: set[PreprocessorFile] | Iterable[Cube],
+    reference: Cube | None = None,
+    coordinate: str | None = "time",
+    **kwargs: Any,
+) -> set[PreprocessorFile] | CubeList:
+    """Calculate the t-test between dataset and reference dataset.
+
+    The reference dataset needs to be broadcastable to all input `products`.
+    This supports `iris' rich broadcasting abilities
+    <https://scitools-iris.readthedocs.io/en/stable/userguide/cube_maths.
+    html#calculating-a-cube-anomaly>`__. To ensure this, the preprocessors
+    :func:`esmvalcore.preprocessor.regrid` and/or
+    :func:`esmvalcore.preprocessor.regrid_time` might be helpful.
+
+    Notes
+    -----
+    The reference dataset can be specified with the `reference` argument. If
+    `reference` is ``None``, exactly one input dataset in the `products` set
+    needs to have the facet ``reference_for_bias: true`` defined in the recipe.
+    Please do **not** specify the option `reference` when using this
+    preprocessor function in a recipe.
+
+    Parameters
+    ----------
+    products:
+        Input datasets/cubes for which the significance is calculated relative
+        to a reference dataset/cube.
+    reference:
+        Cube which is used as reference for the t-test calculation. If ``None``,
+        `products` needs to be a :obj:`set` of
+        `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
+        dataset in `products` needs the facet ``reference_for_bias: true``. Do
+        not specify this argument in a recipe.
+    coordinate:
+        Coordinate axis of the input along which to compute the statistic.
+        Default is 'time'.
+    **kwargs:
+        Additional keyword arguments passed to :func:`dask.array.ttest_ind`.
+
+    Returns
+    -------
+    set[PreprocessorFile] | CubeList
+        Output datasets/cubes. Will be a :obj:`set` of
+        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if `products`
+        is also one, a :class:`~iris.cube.CubeList` otherwise.  The cubes
+        contain an attached ancillary variable which is the p-value w.r.t. the
+        chosen t-test related to the bias (in terms of absolute) of cube and
+        reference.
+
+    Raises
+    ------
+    ValueError
+        Not exactly one input datasets contains the facet ``reference_for_bias:
+        true`` if ``reference=None``; ``reference=None`` and the input products
+        are given as iterable of :class:`~iris.cube.Cube` objects;
+        ``bias_type`` is not one of ``'absolute'`` or ``'relative'``.
+
+    """
+    ref_product = None
+    all_cubes_given = all(isinstance(p, Cube) for p in products)
+
+    # Get reference cube if not explicitly given
+    if reference is None:
+        if all_cubes_given:
+            msg = (
+                "A list of Cubes is given to this preprocessor; please "
+                "specify a `reference`"
+            )
+            raise ValueError(msg)
+        (reference, ref_product) = _get_ref(products, "reference_for_bias")
+    else:
+        ref_product = None
+
+    # If input is an Iterable of Cube objects, calculate t-test for each element
+    if all_cubes_given:
+        cubes = [
+            _calculate_t_test(c, reference, coordinate, **kwargs)
+            for c in products
+        ]
+        return CubeList(cubes)
+
+    # Otherwise, iterate over all input products, calculate t-test and adapt
+    # metadata and provenance information accordingly
+    output_products = set()
+    for product in products:
+        if product == ref_product:
+            continue
+        cube = concatenate(product.cubes)
+
+        # Calculate t-test
+        cube = _calculate_t_test(cube, reference, coordinate, **kwargs)
+
+        # Adapt metadata and provenance information
+        product.attributes["units"] = str(cube.units)
+        if ref_product is not None:
+            product.wasderivedfrom(ref_product)
+
+        product.cubes = CubeList([cube])
+        output_products.add(product)
+
+    # Add reference dataset to output (always)
+    if ref_product is not None:
+        output_products.add(ref_product)
+
+    return output_products
+
+
+def _calculate_t_test(
+    cube: Cube,
+    reference: Cube,
+    coordinate: str | AuxCoord | DimCoord,
+    **kwargs: Any,
+) -> Cube:
+    """Calculate the Welch's t-test and attach the p-value."""
+    cube_metadata = cube.metadata
+
+    axis = cube.coord_dims(coordinate)[0]
+    remaining_axes = sorted(set(range(cube.ndim)) - {axis})
+
+    t_test = dask.array.stats.ttest_ind(
+        cube,
+        reference,
+        axis=axis,
+        **kwargs,
+    )
+    shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
+
+    p_value = da.from_delayed(t_test.pvalue, shape, dtype=float)
+
+    cube.add_ancillary_variable(
+        AncillaryVariable(p_value, long_name="p_value"),
+        remaining_axes,
+    )
+
+    cube.metadata = cube_metadata
+
+    return cube

From 720faab93424ea5f0b28bb26b6e89d8c7b3ea8c5 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 10:14:20 +0200
Subject: [PATCH 10/21] Add support for lazy and non-lazy data

---
 esmvalcore/preprocessor/_compare_with_refs.py | 84 +++++++++++--------
 1 file changed, 49 insertions(+), 35 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index afc7769414..6be0a48d6b 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -8,14 +8,13 @@
 
 import dask
 import dask.array as da
-import dask.array.stats
 import iris.analysis
 import iris.analysis.stats
 import numpy as np
 from iris.common.metadata import CubeMetadata
-from iris.coords import AncillaryVariable, AuxCoord, CellMethod, DimCoord
+from iris.coords import AncillaryVariable, CellMethod
 from iris.cube import Cube, CubeList
-from scipy.stats import wasserstein_distance
+from scipy.stats import wasserstein_distance, ttest_ind
 
 from esmvalcore.iris_helpers import (
     ignore_iris_vague_metadata_warnings,
@@ -34,7 +33,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from iris.coords import Coord
+    from iris.coords import AuxCoord, Coord, DimCoord
 
     from esmvalcore.preprocessor import PreprocessorFile
 
@@ -168,7 +167,7 @@ def bias(
     return output_products
 
 
-def _get_ref(
+def     _get_ref(
     products: Iterable[PreprocessorFile],
     ref_tag: str,
 ) -> tuple[Cube, PreprocessorFile]:
@@ -617,10 +616,13 @@ def _get_emd(
 def t_test(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
-    coordinate: str | None = "time",
+    coordinate: str | AuxCoord | DimCoord | None = "time",
     **kwargs: Any,
 ) -> set[PreprocessorFile] | CubeList:
-    """Calculate the t-test between dataset and reference dataset.
+    """Perform t-test between dataset and reference dataset.
+
+    This is a test for the null hypothesis that 2 independent samples (dataset
+    and reference dataset) have identical average (expected) values.
 
     The reference dataset needs to be broadcastable to all input `products`.
     This supports `iris' rich broadcasting abilities
@@ -633,8 +635,8 @@ def t_test(
     -----
     The reference dataset can be specified with the `reference` argument. If
     `reference` is ``None``, exactly one input dataset in the `products` set
-    needs to have the facet ``reference_for_bias: true`` defined in the recipe.
-    Please do **not** specify the option `reference` when using this
+    needs to have the facet ``reference_for_t_test: true`` defined in the
+    recipe. Please do **not** specify the option `reference` when using this
     preprocessor function in a recipe.
 
     Parameters
@@ -643,34 +645,35 @@ def t_test(
         Input datasets/cubes for which the significance is calculated relative
         to a reference dataset/cube.
     reference:
-        Cube which is used as reference for the t-test calculation. If ``None``,
-        `products` needs to be a :obj:`set` of
+        Cube which is used as reference for the t-test calculation. If
+        ``None``, `products` needs to be a :obj:`set` of
         `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
-        dataset in `products` needs the facet ``reference_for_bias: true``. Do
-        not specify this argument in a recipe.
+        dataset in `products` needs the facet ``reference_for_t_test: true``.
+        Do not specify this argument in a recipe.
     coordinate:
-        Coordinate axis of the input along which to compute the statistic.
-        Default is 'time'.
+        Coordinate axis of the input along which to compute the statistic. If
+        ``None``, the input will be raveled before computing the statistic.
     **kwargs:
-        Additional keyword arguments passed to :func:`dask.array.ttest_ind`.
+        Additional keyword arguments passed to :func:`scipy.stats.ttest_ind`
+        (not all input data are lazy) or
+        :func:`dask.array.stats.ttest_ind.ttest_ind` (all input data are lazy).
 
     Returns
     -------
-    set[PreprocessorFile] | CubeList
+    :
         Output datasets/cubes. Will be a :obj:`set` of
-        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if `products`
-        is also one, a :class:`~iris.cube.CubeList` otherwise.  The cubes
-        contain an attached ancillary variable which is the p-value w.r.t. the
-        chosen t-test related to the bias (in terms of absolute) of cube and
-        reference.
+        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if
+        `products` is also one, a :class:`~iris.cube.CubeList` otherwise. The
+        cubes contain an attached ancillary variable which is the p-value
+        w.r.t. the chosen t-test.
 
     Raises
     ------
     ValueError
-        Not exactly one input datasets contains the facet ``reference_for_bias:
-        true`` if ``reference=None``; ``reference=None`` and the input products
-        are given as iterable of :class:`~iris.cube.Cube` objects;
-        ``bias_type`` is not one of ``'absolute'`` or ``'relative'``.
+        Not exactly one input datasets contains the facet
+        ``reference_for_t_test: true`` if ``reference=None``;
+        ``reference=None`` and the input products are given as iterable of
+        :class:`~iris.cube.Cube` objectss.
 
     """
     ref_product = None
@@ -684,7 +687,7 @@ def t_test(
                 "specify a `reference`"
             )
             raise ValueError(msg)
-        (reference, ref_product) = _get_ref(products, "reference_for_bias")
+        (reference, ref_product) = _get_ref(products, "reference_for_t_test")
     else:
         ref_product = None
 
@@ -708,7 +711,6 @@ def t_test(
         cube = _calculate_t_test(cube, reference, coordinate, **kwargs)
 
         # Adapt metadata and provenance information
-        product.attributes["units"] = str(cube.units)
         if ref_product is not None:
             product.wasderivedfrom(ref_product)
 
@@ -725,21 +727,33 @@ def t_test(
 def _calculate_t_test(
     cube: Cube,
     reference: Cube,
-    coordinate: str | AuxCoord | DimCoord,
+    coordinate: str | AuxCoord | DimCoord | None,
     **kwargs: Any,
 ) -> Cube:
-    """Calculate the Welch's t-test and attach the p-value."""
+    """Calculate the t-test and attach the p-value as ancillary variable to cube."""
     cube_metadata = cube.metadata
 
-    axis = cube.coord_dims(coordinate)[0]
-    remaining_axes = sorted(set(range(cube.ndim)) - {axis})
+    if coordinate is None:
+        axis = None
+        remaining_axes = list(range(cube.ndim))
+    else:
+        axis = cube.coord_dims(coordinate)[0]
+        remaining_axes = sorted(set(range(cube.ndim)) - {axis})
 
-    t_test = dask.array.stats.ttest_ind(
-        cube,
-        reference,
+    if cube.has_lazy_data() and reference.has_lazy_data():
+        ttest_func = da.stats.ttest_ind
+    else:
+        ttest_func = ttest_ind
+
+    t_test = ttest_func(
+        cube.core_data(),
+        reference.core_data(),
         axis=axis,
         **kwargs,
     )
+
+    print(t_test.pvalue)
+    assert 0
     shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
 
     p_value = da.from_delayed(t_test.pvalue, shape, dtype=float)

From 81928338a19b2fc6d24856ab497a67cd3af076c2 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 11:07:54 +0200
Subject: [PATCH 11/21] pre-commit

---
 esmvalcore/preprocessor/_compare_with_refs.py | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 6be0a48d6b..2035088734 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -14,7 +14,7 @@
 from iris.common.metadata import CubeMetadata
 from iris.coords import AncillaryVariable, CellMethod
 from iris.cube import Cube, CubeList
-from scipy.stats import wasserstein_distance, ttest_ind
+from scipy.stats import ttest_ind, wasserstein_distance
 
 from esmvalcore.iris_helpers import (
     ignore_iris_vague_metadata_warnings,
@@ -167,7 +167,7 @@ def bias(
     return output_products
 
 
-def     _get_ref(
+def _get_ref(
     products: Iterable[PreprocessorFile],
     ref_tag: str,
 ) -> tuple[Cube, PreprocessorFile]:
@@ -226,7 +226,7 @@ def _calculate_bias(
     cube.metadata = cube_metadata
     cube.units = new_units
 
-    # reattach ancillary variables
+    # Reattach ancillary variables
     for ancillary_variable, dims in ancillary_variables_and_dims:
         cube.add_ancillary_variable(ancillary_variable, dims)
 
@@ -673,7 +673,7 @@ def t_test(
         Not exactly one input datasets contains the facet
         ``reference_for_t_test: true`` if ``reference=None``;
         ``reference=None`` and the input products are given as iterable of
-        :class:`~iris.cube.Cube` objectss.
+        :class:`~iris.cube.Cube` objects.
 
     """
     ref_product = None
@@ -717,7 +717,7 @@ def t_test(
         product.cubes = CubeList([cube])
         output_products.add(product)
 
-    # Add reference dataset to output (always)
+    # Add reference dataset to output
     if ref_product is not None:
         output_products.add(ref_product)
 
@@ -731,20 +731,21 @@ def _calculate_t_test(
     **kwargs: Any,
 ) -> Cube:
     """Calculate the t-test and attach the p-value as ancillary variable to cube."""
+    cube = cube.copy()  # do not modify input cube
     cube_metadata = cube.metadata
 
+    remaining_axes: tuple[int, ...]
     if coordinate is None:
         axis = None
-        remaining_axes = list(range(cube.ndim))
+        remaining_axes = ()
     else:
         axis = cube.coord_dims(coordinate)[0]
-        remaining_axes = sorted(set(range(cube.ndim)) - {axis})
+        remaining_axes = tuple(sorted(set(range(cube.ndim)) - {axis}))
 
     if cube.has_lazy_data() and reference.has_lazy_data():
         ttest_func = da.stats.ttest_ind
     else:
         ttest_func = ttest_ind
-
     t_test = ttest_func(
         cube.core_data(),
         reference.core_data(),
@@ -752,17 +753,15 @@ def _calculate_t_test(
         **kwargs,
     )
 
-    print(t_test.pvalue)
-    assert 0
-    shape = tuple([list(cube.shape)[dim] for dim in remaining_axes])
-
-    p_value = da.from_delayed(t_test.pvalue, shape, dtype=float)
-
     cube.add_ancillary_variable(
-        AncillaryVariable(p_value, long_name="p_value"),
+        AncillaryVariable(
+            t_test.pvalue,
+            long_name="p-value",
+            var_name="pvalue",
+            units="1",
+        ),
         remaining_axes,
     )
-
     cube.metadata = cube_metadata
 
     return cube

From 16f3c2a0081a2ca9e191360be0609bcc1aacd79a Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 11:08:05 +0200
Subject: [PATCH 12/21] Add tests for t_test with products

---
 .../test_compare_with_refs.py                 | 174 ++++++++++++++++--
 1 file changed, 158 insertions(+), 16 deletions(-)

diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index d9de9c7d4b..5271305e63 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -1,17 +1,22 @@
 """Unit tests for :mod:`esmvalcore.preprocessor._compare_with_refs`."""
 
 import contextlib
+import re
 
 import dask.array as da
 import iris
 import numpy as np
 import pytest
 from cf_units import Unit
-from iris.coords import CellMeasure, CellMethod
+from iris.coords import AncillaryVariable, CellMeasure, CellMethod
 from iris.cube import Cube, CubeList
 from iris.exceptions import CoordinateNotFoundError
 
-from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric
+from esmvalcore.preprocessor._compare_with_refs import (
+    bias,
+    distance_metric,
+    t_test,
+)
 from tests import PreprocessorFile
 
 
@@ -100,20 +105,36 @@ def ref_cubes():
 ]
 
 
+@pytest.mark.parametrize("use_reference_product", [True, False])
 @pytest.mark.parametrize(("bias_type", "data", "units"), TEST_BIAS)
-def test_bias_products(regular_cubes, ref_cubes, bias_type, data, units):
+def test_bias_products(
+    regular_cubes,
+    ref_cubes,
+    bias_type,
+    data,
+    units,
+    use_reference_product,
+):
     """Test calculation of bias with products."""
-    ref_product = PreprocessorFile(
-        ref_cubes,
-        "REF",
-        {"reference_for_bias": True},
-    )
     products = {
         PreprocessorFile(regular_cubes, "A", {"dataset": "a"}),
         PreprocessorFile(regular_cubes, "B", {"dataset": "b"}),
-        ref_product,
     }
-    out_products = bias(products, bias_type=bias_type)
+
+    if use_reference_product:
+        ref_product = PreprocessorFile(
+            ref_cubes,
+            "REF",
+            {"reference_for_bias": True},
+        )
+        products.add(ref_product)
+        expected_ancestors = {ref_product}
+        reference = None
+    else:
+        expected_ancestors = set()
+        reference = ref_cubes[0]
+
+    out_products = bias(products, reference=reference, bias_type=bias_type)
 
     assert isinstance(out_products, set)
     out_dict = products_set_to_dict(out_products)
@@ -131,8 +152,8 @@ def test_bias_products(regular_cubes, ref_cubes, bias_type, data, units):
     assert out_cube.units == units
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    product_a.wasderivedfrom.assert_called_once()
-    assert product_a.mock_ancestors == {ref_product}
+    assert product_a.wasderivedfrom.call_count == len(expected_ancestors)
+    assert product_a.mock_ancestors == expected_ancestors
 
     product_b = out_dict["B"]
     assert product_b.filename == "B"
@@ -146,8 +167,8 @@ def test_bias_products(regular_cubes, ref_cubes, bias_type, data, units):
     assert out_cube.units == units
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    product_b.wasderivedfrom.assert_called_once()
-    assert product_b.mock_ancestors == {ref_product}
+    assert product_b.wasderivedfrom.call_count == len(expected_ancestors)
+    assert product_b.mock_ancestors == expected_ancestors
 
 
 @pytest.mark.parametrize(("bias_type", "data", "units"), TEST_BIAS)
@@ -382,8 +403,8 @@ def test_invalid_bias_type(regular_cubes, ref_cubes):
         bias(products, bias_type="invalid_bias_type")
 
 
-def test_reference_none_cubes(regular_cubes):
-    """Test reference=None with with cubes."""
+def test_bias_reference_none_cubes(regular_cubes):
+    """Test bias with reference=None and cubes given."""
     msg = (
         "A list of Cubes is given to this preprocessor; please specify a "
         "`reference`"
@@ -932,3 +953,124 @@ def test_distance_metric_no_lon_for_area_weights(regular_cubes, metric, error):
             reference=ref_cube,
             coords=["time", "latitude"],
         )
+
+
+@pytest.mark.parametrize("use_reference_product", [True, False])
+def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # noqa: PLR0915
+    """Test calculation of t_test with products."""
+    products = {
+        PreprocessorFile(regular_cubes, "A", {"dataset": "a"}),
+        PreprocessorFile(regular_cubes, "B", {"dataset": "b"}),
+    }
+
+    if use_reference_product:
+        ref_product = PreprocessorFile(
+            ref_cubes,
+            "REF",
+            {"reference_for_t_test": True},
+        )
+        products.add(ref_product)
+        expected_ancestors = {ref_product}
+        reference = None
+    else:
+        expected_ancestors = set()
+        reference = ref_cubes[0]
+
+    out_products = t_test(products, reference=reference)
+
+    expected_ancillary_variable = AncillaryVariable(
+        np.array(
+            [[1.0, 0.6666667], [0.42264974, 0.46547753]],
+            dtype=np.float32,
+        ),
+        long_name="p-value",
+        var_name="pvalue",
+        units="1",
+    )
+
+    assert isinstance(out_products, set)
+    out_dict = products_set_to_dict(out_products)
+    assert len(out_dict) == len(products)
+
+    product_a = out_dict["A"]
+    assert product_a.filename == "A"
+    assert product_a.attributes == {"dataset": "a"}
+    assert len(product_a.cubes) == 1
+    out_cube = product_a.cubes[0]
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, regular_cubes[0].data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == "K"
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+    assert out_cube.ancillary_variables() == [expected_ancillary_variable]
+    assert product_a.wasderivedfrom.call_count == len(expected_ancestors)
+    assert product_a.mock_ancestors == expected_ancestors
+
+    product_b = out_dict["B"]
+    assert product_b.filename == "B"
+    assert product_b.attributes == {"dataset": "b"}
+    assert len(product_b.cubes) == 1
+    out_cube = product_b.cubes[0]
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, regular_cubes[0].data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == "K"
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+    assert out_cube.ancillary_variables() == [expected_ancillary_variable]
+    assert product_b.wasderivedfrom.call_count == len(expected_ancestors)
+    assert product_b.mock_ancestors == expected_ancestors
+
+    if use_reference_product:
+        product_ref = out_dict["REF"]
+        assert product_ref.filename == "REF"
+        assert product_ref.attributes == {"reference_for_t_test": True}
+        assert len(product_ref.cubes) == 1
+        out_cube = product_ref.cubes[0]
+        assert out_cube.dtype == np.float32
+        assert_allclose(out_cube.data, ref_cubes[0].data)
+        assert out_cube.var_name == "tas"
+        assert out_cube.standard_name == "air_temperature"
+        assert out_cube.units == "K"
+        assert out_cube.dim_coords == ref_cubes[0].dim_coords
+        assert out_cube.aux_coords == ref_cubes[0].aux_coords
+        assert out_cube.ancillary_variables() == []
+        product_ref.wasderivedfrom.assert_not_called()
+        assert product_ref.mock_ancestors == set()
+
+
+def test_t_test_reference_none_cubes(regular_cubes):
+    """Test t_test with reference=None and cubes given."""
+    msg = (
+        r"A list of Cubes is given to this preprocessor; please specify a "
+        r"`reference`"
+    )
+    with pytest.raises(ValueError, match=re.escape(msg)):
+        t_test(regular_cubes)
+
+
+def test_no_reference_for_t_test(regular_cubes, ref_cubes):
+    """Test fail when no reference_for_t_test is given."""
+    products = {
+        PreprocessorFile(regular_cubes, "A", {}),
+        PreprocessorFile(regular_cubes, "B", {}),
+        PreprocessorFile(ref_cubes, "REF", {}),
+    }
+    msg = r"Expected exactly 1 dataset with 'reference_for_t_test: true', found 0"
+    with pytest.raises(ValueError, match=re.escape(msg)):
+        t_test(products)
+
+
+def test_two_references_for_t_test(regular_cubes, ref_cubes):
+    """Test fail when two reference_for_t_test products are given."""
+    products = {
+        PreprocessorFile(regular_cubes, "A", {"reference_for_t_test": False}),
+        PreprocessorFile(ref_cubes, "REF1", {"reference_for_t_test": True}),
+        PreprocessorFile(ref_cubes, "REF2", {"reference_for_t_test": True}),
+    }
+    msg = r"Expected exactly 1 dataset with 'reference_for_t_test: true', found 2"
+    with pytest.raises(ValueError, match=re.escape(msg)):
+        t_test(products)

From 93ee1342c18f7ad0e77135ba411d70f5eae5149d Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 11:16:12 +0200
Subject: [PATCH 13/21] Remove duplicate bias test

---
 .../test_compare_with_refs.py                 | 40 ++++++-------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index 5271305e63..51ab2bdb15 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -105,36 +105,20 @@ def ref_cubes():
 ]
 
 
-@pytest.mark.parametrize("use_reference_product", [True, False])
 @pytest.mark.parametrize(("bias_type", "data", "units"), TEST_BIAS)
-def test_bias_products(
-    regular_cubes,
-    ref_cubes,
-    bias_type,
-    data,
-    units,
-    use_reference_product,
-):
+def test_bias_products(regular_cubes, ref_cubes, bias_type, data, units):
     """Test calculation of bias with products."""
+    ref_product = PreprocessorFile(
+        ref_cubes,
+        "REF",
+        {"reference_for_bias": True},
+    )
     products = {
         PreprocessorFile(regular_cubes, "A", {"dataset": "a"}),
         PreprocessorFile(regular_cubes, "B", {"dataset": "b"}),
+        ref_product,
     }
-
-    if use_reference_product:
-        ref_product = PreprocessorFile(
-            ref_cubes,
-            "REF",
-            {"reference_for_bias": True},
-        )
-        products.add(ref_product)
-        expected_ancestors = {ref_product}
-        reference = None
-    else:
-        expected_ancestors = set()
-        reference = ref_cubes[0]
-
-    out_products = bias(products, reference=reference, bias_type=bias_type)
+    out_products = bias(products, bias_type=bias_type)
 
     assert isinstance(out_products, set)
     out_dict = products_set_to_dict(out_products)
@@ -152,8 +136,8 @@ def test_bias_products(
     assert out_cube.units == units
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert product_a.wasderivedfrom.call_count == len(expected_ancestors)
-    assert product_a.mock_ancestors == expected_ancestors
+    product_a.wasderivedfrom.assert_called_once()
+    assert product_a.mock_ancestors == {ref_product}
 
     product_b = out_dict["B"]
     assert product_b.filename == "B"
@@ -167,8 +151,8 @@ def test_bias_products(
     assert out_cube.units == units
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert product_b.wasderivedfrom.call_count == len(expected_ancestors)
-    assert product_b.mock_ancestors == expected_ancestors
+    product_b.wasderivedfrom.assert_called_once()
+    assert product_b.mock_ancestors == {ref_product}
 
 
 @pytest.mark.parametrize(("bias_type", "data", "units"), TEST_BIAS)

From e5ebef516b41baf1014227080743693ec8689a45 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 11:54:57 +0200
Subject: [PATCH 14/21] Fix lazy t-test calculation

---
 esmvalcore/preprocessor/_compare_with_refs.py | 32 +++++++++++++------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 2035088734..90c7764fe1 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -8,6 +8,7 @@
 
 import dask
 import dask.array as da
+import dask.array.stats
 import iris.analysis
 import iris.analysis.stats
 import numpy as np
@@ -738,24 +739,37 @@ def _calculate_t_test(
     if coordinate is None:
         axis = None
         remaining_axes = ()
+        shape = ()
     else:
         axis = cube.coord_dims(coordinate)[0]
         remaining_axes = tuple(sorted(set(range(cube.ndim)) - {axis}))
+        shape = tuple(cube.shape[a] for a in remaining_axes)
 
     if cube.has_lazy_data() and reference.has_lazy_data():
-        ttest_func = da.stats.ttest_ind
+        t_test = dask.array.stats.ttest_ind(
+            cube.core_data(),
+            reference.core_data(),
+            axis=axis,
+            **kwargs,
+        )
+        # For some reason, t_test.pvalue is not a da.array, but a dask.Delayed
+        p_value_arr = da.from_delayed(t_test.pvalue, shape, cube.dtype)
     else:
-        ttest_func = ttest_ind
-    t_test = ttest_func(
-        cube.core_data(),
-        reference.core_data(),
-        axis=axis,
-        **kwargs,
-    )
+        # Avoid realizing cube.data
+        cube_data = (
+            cube.lazy_data().compute() if cube.has_lazy_data() else cube.data
+        )
+        ref_data = (
+            reference.lazy_data().compute()
+            if reference.has_lazy_data()
+            else reference.data
+        )
+        t_test = ttest_ind(cube_data, ref_data, axis=axis, **kwargs)
+        p_value_arr = t_test.pvalue
 
     cube.add_ancillary_variable(
         AncillaryVariable(
-            t_test.pvalue,
+            p_value_arr,
             long_name="p-value",
             var_name="pvalue",
             units="1",

From 0d1f95cd1a6de336c44a4a7ddc219880b558c9c1 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 11:55:11 +0200
Subject: [PATCH 15/21] Add tests for t-test calculation of lazy and non-lazy
 cubes

---
 .../test_compare_with_refs.py                 | 97 ++++++++++++++++---
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index 51ab2bdb15..4bb9185f1a 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pytest
 from cf_units import Unit
-from iris.coords import AncillaryVariable, CellMeasure, CellMethod
+from iris.coords import CellMeasure, CellMethod
 from iris.cube import Cube, CubeList
 from iris.exceptions import CoordinateNotFoundError
 
@@ -939,6 +939,22 @@ def test_distance_metric_no_lon_for_area_weights(regular_cubes, metric, error):
         )
 
 
+def assert_correct_p_value(cube: Cube, *, is_lazy: bool = False) -> None:
+    assert len(cube.ancillary_variables()) == 1
+    p_value = cube.ancillary_variables()[0]
+    assert p_value.standard_name is None
+    assert p_value.long_name == "p-value"
+    assert p_value.var_name == "pvalue"
+    assert p_value.units == "1"
+    assert p_value.attributes == {}
+    assert p_value.has_lazy_data() is is_lazy
+    assert p_value.dtype == np.float32
+    np.testing.assert_allclose(
+        p_value.data,
+        [[1.0, 0.6666667], [0.42264974, 0.46547753]],
+    )
+
+
 @pytest.mark.parametrize("use_reference_product", [True, False])
 def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # noqa: PLR0915
     """Test calculation of t_test with products."""
@@ -962,16 +978,6 @@ def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # no
 
     out_products = t_test(products, reference=reference)
 
-    expected_ancillary_variable = AncillaryVariable(
-        np.array(
-            [[1.0, 0.6666667], [0.42264974, 0.46547753]],
-            dtype=np.float32,
-        ),
-        long_name="p-value",
-        var_name="pvalue",
-        units="1",
-    )
-
     assert isinstance(out_products, set)
     out_dict = products_set_to_dict(out_products)
     assert len(out_dict) == len(products)
@@ -988,7 +994,7 @@ def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # no
     assert out_cube.units == "K"
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert out_cube.ancillary_variables() == [expected_ancillary_variable]
+    assert_correct_p_value(out_cube)
     assert product_a.wasderivedfrom.call_count == len(expected_ancestors)
     assert product_a.mock_ancestors == expected_ancestors
 
@@ -1004,7 +1010,7 @@ def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # no
     assert out_cube.units == "K"
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert out_cube.ancillary_variables() == [expected_ancillary_variable]
+    assert_correct_p_value(out_cube)
     assert product_b.wasderivedfrom.call_count == len(expected_ancestors)
     assert product_b.mock_ancestors == expected_ancestors
 
@@ -1026,7 +1032,66 @@ def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # no
         assert product_ref.mock_ancestors == set()
 
 
-def test_t_test_reference_none_cubes(regular_cubes):
+@pytest.mark.parametrize("lazy", [True, False])
+def test_t_test_cubes(regular_cubes, ref_cubes, lazy):
+    """Test calculation of t_test with cubes."""
+    cube = regular_cubes[0]
+    ref_cube = ref_cubes[0]
+    if lazy:
+        cube.data = cube.lazy_data()
+        ref_cube.data = ref_cube.lazy_data()
+
+    out_cubes = t_test([cube], ref_cube)
+
+    assert cube.has_lazy_data() is lazy
+    assert ref_cube.has_lazy_data() is lazy
+
+    assert isinstance(out_cubes, CubeList)
+    assert len(out_cubes) == 1
+    out_cube = out_cubes[0]
+
+    assert out_cube.has_lazy_data() is lazy
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, cube.data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == "K"
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+    assert_correct_p_value(out_cube, is_lazy=lazy)
+
+
+@pytest.mark.parametrize("ref_lazy", [True, False])
+def test_t_test_cubes_partly_lazy(regular_cubes, ref_cubes, ref_lazy):
+    """Test calculation of t_test with cubes."""
+    cube = regular_cubes[0]
+    ref_cube = ref_cubes[0]
+    if ref_lazy:
+        ref_cube.data = ref_cube.lazy_data()
+    else:
+        cube.data = cube.lazy_data()
+
+    out_cubes = t_test([cube], ref_cube)
+
+    assert cube.has_lazy_data() is not ref_lazy
+    assert ref_cube.has_lazy_data() is ref_lazy
+
+    assert isinstance(out_cubes, CubeList)
+    assert len(out_cubes) == 1
+    out_cube = out_cubes[0]
+
+    assert out_cube.has_lazy_data() is not ref_lazy
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, cube.data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == "K"
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+    assert_correct_p_value(out_cube)
+
+
+def test_t_test_reference_none_cubes_fail(regular_cubes):
     """Test t_test with reference=None and cubes given."""
     msg = (
         r"A list of Cubes is given to this preprocessor; please specify a "
@@ -1036,7 +1101,7 @@ def test_t_test_reference_none_cubes(regular_cubes):
         t_test(regular_cubes)
 
 
-def test_no_reference_for_t_test(regular_cubes, ref_cubes):
+def test_no_reference_for_t_test_fail(regular_cubes, ref_cubes):
     """Test fail when no reference_for_t_test is given."""
     products = {
         PreprocessorFile(regular_cubes, "A", {}),
@@ -1048,7 +1113,7 @@ def test_no_reference_for_t_test(regular_cubes, ref_cubes):
         t_test(products)
 
 
-def test_two_references_for_t_test(regular_cubes, ref_cubes):
+def test_two_references_for_t_test_fail(regular_cubes, ref_cubes):
     """Test fail when two reference_for_t_test products are given."""
     products = {
         PreprocessorFile(regular_cubes, "A", {"reference_for_t_test": False}),

From e70a9438c015fdd67aedd11c53f6f3681afbcead Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 11:58:30 +0200
Subject: [PATCH 16/21] t_test intrinsically preserves metadata

---
 esmvalcore/preprocessor/_compare_with_refs.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 90c7764fe1..1ba3069509 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -733,7 +733,6 @@ def _calculate_t_test(
 ) -> Cube:
     """Calculate the t-test and attach the p-value as ancillary variable to cube."""
     cube = cube.copy()  # do not modify input cube
-    cube_metadata = cube.metadata
 
     remaining_axes: tuple[int, ...]
     if coordinate is None:
@@ -776,6 +775,5 @@ def _calculate_t_test(
         ),
         remaining_axes,
     )
-    cube.metadata = cube_metadata
 
     return cube

From aedd6a621dfe7bdedff4b87605bf6944a49ec68f Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 13:39:28 +0200
Subject: [PATCH 17/21] Support masked data

---
 esmvalcore/preprocessor/_compare_with_refs.py | 25 ++++--
 .../test_compare_with_refs.py                 | 85 +++++++++++++++++--
 2 files changed, 94 insertions(+), 16 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 1ba3069509..b6735fa43a 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -746,25 +746,36 @@ def _calculate_t_test(
 
     if cube.has_lazy_data() and reference.has_lazy_data():
         t_test = dask.array.stats.ttest_ind(
-            cube.core_data(),
-            reference.core_data(),
+            da.ma.filled(cube.core_data(), np.nan),
+            da.ma.filled(reference.core_data(), np.nan),
             axis=axis,
             **kwargs,
         )
         # For some reason, t_test.pvalue is not a da.array, but a dask.Delayed
         p_value_arr = da.from_delayed(t_test.pvalue, shape, cube.dtype)
+        p_value_arr = da.ma.masked_invalid(p_value_arr)
     else:
         # Avoid realizing cube.data
-        cube_data = (
-            cube.lazy_data().compute() if cube.has_lazy_data() else cube.data
+        cube_data = np.ma.filled(
+            cube.lazy_data().compute() if cube.has_lazy_data() else cube.data,
+            np.nan,
         )
-        ref_data = (
+        ref_data = np.ma.filled(
             reference.lazy_data().compute()
             if reference.has_lazy_data()
-            else reference.data
+            else reference.data,
+            np.nan,
         )
         t_test = ttest_ind(cube_data, ref_data, axis=axis, **kwargs)
-        p_value_arr = t_test.pvalue
+        p_value_arr = np.ma.masked_invalid(t_test.pvalue)
+
+    # Ensure that float dtype from parents is inherited
+    if (
+        np.issubdtype(cube.dtype, np.floating)
+        and np.issubdtype(reference.dtype, np.floating)
+        and cube.dtype == reference.dtype
+    ):
+        p_value_arr = p_value_arr.astype(cube.dtype)
 
     cube.add_ancillary_variable(
         AncillaryVariable(
diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index 4bb9185f1a..49660d4038 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -1,7 +1,10 @@
 """Unit tests for :mod:`esmvalcore.preprocessor._compare_with_refs`."""
 
+from __future__ import annotations
+
 import contextlib
 import re
+from typing import Any
 
 import dask.array as da
 import iris
@@ -939,7 +942,12 @@ def test_distance_metric_no_lon_for_area_weights(regular_cubes, metric, error):
         )
 
 
-def assert_correct_p_value(cube: Cube, *, is_lazy: bool = False) -> None:
+def assert_correct_p_value(
+    cube: Cube,
+    data: Any,  # noqa: ANN401
+    *,
+    is_lazy: bool = False,
+) -> None:
     assert len(cube.ancillary_variables()) == 1
     p_value = cube.ancillary_variables()[0]
     assert p_value.standard_name is None
@@ -949,10 +957,9 @@ def assert_correct_p_value(cube: Cube, *, is_lazy: bool = False) -> None:
     assert p_value.attributes == {}
     assert p_value.has_lazy_data() is is_lazy
     assert p_value.dtype == np.float32
-    np.testing.assert_allclose(
-        p_value.data,
-        [[1.0, 0.6666667], [0.42264974, 0.46547753]],
-    )
+    if np.ma.is_masked(data):
+        np.testing.assert_equal(p_value.data.mask, data.mask)
+    np.testing.assert_allclose(p_value.data, data)
 
 
 @pytest.mark.parametrize("use_reference_product", [True, False])
@@ -994,7 +1001,10 @@ def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # no
     assert out_cube.units == "K"
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert_correct_p_value(out_cube)
+    assert_correct_p_value(
+        out_cube,
+        [[1.0, 0.6666667], [0.42264974, 0.46547753]],
+    )
     assert product_a.wasderivedfrom.call_count == len(expected_ancestors)
     assert product_a.mock_ancestors == expected_ancestors
 
@@ -1010,7 +1020,10 @@ def test_t_test_products(regular_cubes, ref_cubes, use_reference_product):  # no
     assert out_cube.units == "K"
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert_correct_p_value(out_cube)
+    assert_correct_p_value(
+        out_cube,
+        [[1.0, 0.6666667], [0.42264974, 0.46547753]],
+    )
     assert product_b.wasderivedfrom.call_count == len(expected_ancestors)
     assert product_b.mock_ancestors == expected_ancestors
 
@@ -1058,7 +1071,11 @@ def test_t_test_cubes(regular_cubes, ref_cubes, lazy):
     assert out_cube.units == "K"
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert_correct_p_value(out_cube, is_lazy=lazy)
+    assert_correct_p_value(
+        out_cube,
+        [[1.0, 0.6666667], [0.42264974, 0.46547753]],
+        is_lazy=lazy,
+    )
 
 
 @pytest.mark.parametrize("ref_lazy", [True, False])
@@ -1088,7 +1105,51 @@ def test_t_test_cubes_partly_lazy(regular_cubes, ref_cubes, ref_lazy):
     assert out_cube.units == "K"
     assert out_cube.dim_coords == regular_cubes[0].dim_coords
     assert out_cube.aux_coords == regular_cubes[0].aux_coords
-    assert_correct_p_value(out_cube)
+    assert_correct_p_value(
+        out_cube,
+        [[1.0, 0.6666667], [0.42264974, 0.46547753]],
+    )
+
+
+@pytest.mark.parametrize("lazy", [True, False])
+def test_t_test_cubes_masked(regular_cubes, ref_cubes, lazy):
+    """Test calculation of t_test with cubes (masked data)."""
+    cube = regular_cubes[0]
+    cube.data = np.ma.masked_inside(cube.data, 1.5, 3.5)
+    cube.data = np.ma.masked_greater(cube.data, 6.5)
+    ref_cube = ref_cubes[0]
+    ref_cube.data = np.ma.masked_invalid(
+        np.array(
+            [[[2.0, np.nan], [2.0, 2.0]], [[2.0, 2.0], [2.0, 2.0]]],
+            dtype=np.float32,
+        ),
+    )
+    if lazy:
+        cube.data = cube.lazy_data()
+        ref_cube.data = ref_cube.lazy_data()
+
+    out_cubes = t_test([cube], ref_cube)
+
+    assert cube.has_lazy_data() is lazy
+    assert ref_cube.has_lazy_data() is lazy
+
+    assert isinstance(out_cubes, CubeList)
+    assert len(out_cubes) == 1
+    out_cube = out_cubes[0]
+
+    assert out_cube.has_lazy_data() is lazy
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, cube.data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == "K"
+    assert out_cube.dim_coords == regular_cubes[0].dim_coords
+    assert out_cube.aux_coords == regular_cubes[0].aux_coords
+    assert_correct_p_value(
+        out_cube,
+        np.ma.masked_invalid([[1.0, np.nan], [np.nan, np.nan]]),
+        is_lazy=lazy,
+    )
 
 
 def test_t_test_reference_none_cubes_fail(regular_cubes):
@@ -1123,3 +1184,9 @@ def test_two_references_for_t_test_fail(regular_cubes, ref_cubes):
     msg = r"Expected exactly 1 dataset with 'reference_for_t_test: true', found 2"
     with pytest.raises(ValueError, match=re.escape(msg)):
         t_test(products)
+
+
+# ADD:
+# - masked data
+# - coordinate=None
+# - **kwargs

From 2066b5e94b102c34d8d26be94ca49ea9cbc2b56e Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 14:29:51 +0200
Subject: [PATCH 18/21] dask ttest_ind does not support axis=None, so we will
 not support it either

---
 esmvalcore/preprocessor/_compare_with_refs.py | 24 +++++++++----------
 .../test_compare_with_refs.py                 | 18 ++++++++++----
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index b6735fa43a..69d931727a 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -15,6 +15,7 @@
 from iris.common.metadata import CubeMetadata
 from iris.coords import AncillaryVariable, CellMethod
 from iris.cube import Cube, CubeList
+from iris.exceptions import CoordinateMultiDimError
 from scipy.stats import ttest_ind, wasserstein_distance
 
 from esmvalcore.iris_helpers import (
@@ -617,7 +618,7 @@ def _get_emd(
 def t_test(
     products: set[PreprocessorFile] | Iterable[Cube],
     reference: Cube | None = None,
-    coordinate: str | AuxCoord | DimCoord | None = "time",
+    coordinate: str | AuxCoord | DimCoord = "time",
     **kwargs: Any,
 ) -> set[PreprocessorFile] | CubeList:
     """Perform t-test between dataset and reference dataset.
@@ -652,8 +653,7 @@ def t_test(
         dataset in `products` needs the facet ``reference_for_t_test: true``.
         Do not specify this argument in a recipe.
     coordinate:
-        Coordinate axis of the input along which to compute the statistic. If
-        ``None``, the input will be raveled before computing the statistic.
+        Coordinate axis of the input along which to compute the statistic.
     **kwargs:
         Additional keyword arguments passed to :func:`scipy.stats.ttest_ind`
         (not all input data are lazy) or
@@ -675,6 +675,8 @@ def t_test(
         ``reference_for_t_test: true`` if ``reference=None``;
         ``reference=None`` and the input products are given as iterable of
         :class:`~iris.cube.Cube` objects.
+    CoordinateMultiDimError
+        ``coordinate`` is not 1D.
 
     """
     ref_product = None
@@ -728,21 +730,17 @@ def t_test(
 def _calculate_t_test(
     cube: Cube,
     reference: Cube,
-    coordinate: str | AuxCoord | DimCoord | None,
+    coordinate: str | AuxCoord | DimCoord,
     **kwargs: Any,
 ) -> Cube:
     """Calculate the t-test and attach the p-value as ancillary variable to cube."""
     cube = cube.copy()  # do not modify input cube
 
-    remaining_axes: tuple[int, ...]
-    if coordinate is None:
-        axis = None
-        remaining_axes = ()
-        shape = ()
-    else:
-        axis = cube.coord_dims(coordinate)[0]
-        remaining_axes = tuple(sorted(set(range(cube.ndim)) - {axis}))
-        shape = tuple(cube.shape[a] for a in remaining_axes)
+    if cube.coord(coordinate).ndim > 1:
+        raise CoordinateMultiDimError(cube.coord(coordinate))
+    axis = cube.coord_dims(coordinate)[0]
+    remaining_axes = tuple(sorted(set(range(cube.ndim)) - {axis}))
+    shape = tuple(cube.shape[a] for a in remaining_axes)
 
     if cube.has_lazy_data() and reference.has_lazy_data():
         t_test = dask.array.stats.ttest_ind(
diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index 49660d4038..2c27580bfb 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -11,9 +11,9 @@
 import numpy as np
 import pytest
 from cf_units import Unit
-from iris.coords import CellMeasure, CellMethod
+from iris.coords import AuxCoord, CellMeasure, CellMethod
 from iris.cube import Cube, CubeList
-from iris.exceptions import CoordinateNotFoundError
+from iris.exceptions import CoordinateMultiDimError, CoordinateNotFoundError
 
 from esmvalcore.preprocessor._compare_with_refs import (
     bias,
@@ -957,6 +957,7 @@ def assert_correct_p_value(
     assert p_value.attributes == {}
     assert p_value.has_lazy_data() is is_lazy
     assert p_value.dtype == np.float32
+    print(p_value.data)
     if np.ma.is_masked(data):
         np.testing.assert_equal(p_value.data.mask, data.mask)
     np.testing.assert_allclose(p_value.data, data)
@@ -1152,6 +1153,16 @@ def test_t_test_cubes_masked(regular_cubes, ref_cubes, lazy):
     )
 
 
+def test_t_test_multidim_coord_fail(regular_cubes):
+    """Test t_test with multidimensional coordinate."""
+    cube = regular_cubes[0]
+    x_coord = AuxCoord([[0, 0], [0, 0]], var_name="x")
+    cube.add_aux_coord(x_coord, (0, 1))
+    msg = r"Multi-dimensional coordinate not supported: 'x'"
+    with pytest.raises(CoordinateMultiDimError, match=re.escape(msg)):
+        t_test([cube], reference=cube, coordinate=x_coord)
+
+
 def test_t_test_reference_none_cubes_fail(regular_cubes):
     """Test t_test with reference=None and cubes given."""
     msg = (
@@ -1187,6 +1198,5 @@ def test_two_references_for_t_test_fail(regular_cubes, ref_cubes):
 
 
 # ADD:
-# - masked data
-# - coordinate=None
+# - 1D case
 # - **kwargs

From 1a1bae800f21ec7ca799144828a77ca9ae5d2bc4 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 14:44:56 +0200
Subject: [PATCH 19/21] Add test case for 1D data

---
 .../test_compare_with_refs.py                 | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index 2c27580bfb..1ebaf70994 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -957,7 +957,6 @@ def assert_correct_p_value(
     assert p_value.attributes == {}
     assert p_value.has_lazy_data() is is_lazy
     assert p_value.dtype == np.float32
-    print(p_value.data)
     if np.ma.is_masked(data):
         np.testing.assert_equal(p_value.data.mask, data.mask)
     np.testing.assert_allclose(p_value.data, data)
@@ -1079,6 +1078,40 @@ def test_t_test_cubes(regular_cubes, ref_cubes, lazy):
     )
 
 
+@pytest.mark.parametrize("lazy", [True, False])
+def test_t_test_cubes_1d(regular_cubes, ref_cubes, lazy):
+    """Test calculation of t_test with 1D cubes."""
+    cube = regular_cubes[0][0, :, 0]
+    ref_cube = ref_cubes[0][0, :, 0]
+    if lazy:
+        cube.data = cube.lazy_data()
+        ref_cube.data = ref_cube.lazy_data()
+
+    out_cubes = t_test(
+        [cube],
+        ref_cube,
+        coordinate="latitude",
+        equal_var=False,
+    )
+
+    assert cube.has_lazy_data() is lazy
+    assert ref_cube.has_lazy_data() is lazy
+
+    assert isinstance(out_cubes, CubeList)
+    assert len(out_cubes) == 1
+    out_cube = out_cubes[0]
+
+    assert out_cube.has_lazy_data() is lazy
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, cube.data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == "K"
+    assert out_cube.dim_coords == cube.dim_coords
+    assert out_cube.aux_coords == cube.aux_coords
+    assert_correct_p_value(out_cube, [0.4999999701976776], is_lazy=lazy)
+
+
 @pytest.mark.parametrize("ref_lazy", [True, False])
 def test_t_test_cubes_partly_lazy(regular_cubes, ref_cubes, ref_lazy):
     """Test calculation of t_test with cubes."""
@@ -1195,8 +1228,3 @@ def test_two_references_for_t_test_fail(regular_cubes, ref_cubes):
     msg = r"Expected exactly 1 dataset with 'reference_for_t_test: true', found 2"
     with pytest.raises(ValueError, match=re.escape(msg)):
         t_test(products)
-
-
-# ADD:
-# - 1D case
-# - **kwargs

From bba79ac2a8bd19ec2cc3b0cc3baf48bb6655a6de Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 14:57:29 +0200
Subject: [PATCH 20/21] Make sure that bias also preserves cell measures and
 add corresponding test

---
 esmvalcore/preprocessor/_compare_with_refs.py | 16 ++++----
 .../test_compare_with_refs.py                 | 37 ++++++++++++++++++-
 2 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index 69d931727a..a94fe47402 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -198,19 +198,19 @@ def _get_ref(
     return (reference, ref_product)
 
 
-def _calculate_bias(
-    cube: Cube,
-    reference: Cube,
-    bias_type: BiasType,
-) -> Cube:
+def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube:
     """Calculate bias for a single cube relative to a reference cube."""
     cube_metadata = cube.metadata
 
-    # save ancillary variables as they get removed in the bias calculation
+    # Save ancillary variables and cell measures as they get removed in the
+    # bias calculation
     ancillary_variables_and_dims = [
         (a, cube.ancillary_variable_dims(a))
         for a in cube.ancillary_variables()
     ]
+    cell_measures_and_dims = [
+        (c, cube.cell_measure_dims(c)) for c in cube.cell_measures()
+    ]
 
     if bias_type == "absolute":
         cube = cube - reference
@@ -228,9 +228,11 @@ def _calculate_bias(
     cube.metadata = cube_metadata
     cube.units = new_units
 
-    # Reattach ancillary variables
+    # Reattach ancillary variables and cell measures
     for ancillary_variable, dims in ancillary_variables_and_dims:
         cube.add_ancillary_variable(ancillary_variable, dims)
+    for cell_measure, dims in cell_measures_and_dims:
+        cube.add_cell_measure(cell_measure, dims)
 
     return cube
 
diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
index 1ebaf70994..736058febc 100644
--- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
+++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py
@@ -11,7 +11,7 @@
 import numpy as np
 import pytest
 from cf_units import Unit
-from iris.coords import AuxCoord, CellMeasure, CellMethod
+from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, CellMethod
 from iris.cube import Cube, CubeList
 from iris.exceptions import CoordinateMultiDimError, CoordinateNotFoundError
 
@@ -351,6 +351,41 @@ def test_bias_products_and_ref_cube(
     assert product_a.mock_ancestors == set()
 
 
+@pytest.mark.parametrize(("bias_type", "data", "units"), TEST_BIAS)
+def test_bias_cubes_preserve_metadata(
+    regular_cubes,
+    ref_cubes,
+    bias_type,
+    data,
+    units,
+):
+    """Test calculation of bias with cubes."""
+    cube = regular_cubes[0]
+    ancillary_var = AncillaryVariable([0, 1], var_name="a")
+    cell_measure = CellMeasure([0, 1], var_name="c")
+    cube.add_ancillary_variable(ancillary_var, 0)
+    cube.add_cell_measure(cell_measure, 1)
+    ref_cube = ref_cubes[0]
+
+    out_cubes = bias(regular_cubes, ref_cube, bias_type=bias_type)
+
+    assert isinstance(out_cubes, CubeList)
+    assert len(out_cubes) == 1
+    out_cube = out_cubes[0]
+
+    assert out_cube.dtype == np.float32
+    assert_allclose(out_cube.data, data)
+    assert out_cube.var_name == "tas"
+    assert out_cube.standard_name == "air_temperature"
+    assert out_cube.units == units
+    assert out_cube.dim_coords == cube.dim_coords
+    assert out_cube.aux_coords == cube.aux_coords
+    assert out_cube.ancillary_variables() == [ancillary_var]
+    assert out_cube.ancillary_variable_dims(ancillary_var) == (0,)
+    assert out_cube.cell_measures() == [cell_measure]
+    assert out_cube.cell_measure_dims(cell_measure) == (1,)
+
+
 def test_no_reference_for_bias(regular_cubes, ref_cubes):
     """Test fail when no reference_for_bias is given."""
     products = {

From 04006d715b8cda4e4115e57d9455bae635df32e3 Mon Sep 17 00:00:00 2001
From: Manuel Schlund <manuel.schlund@dlr.de>
Date: Wed, 1 Jul 2026 15:08:27 +0200
Subject: [PATCH 21/21] Improve docs

---
 esmvalcore/preprocessor/_compare_with_refs.py | 36 +++++++++----------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py
index a94fe47402..3583a97961 100644
--- a/esmvalcore/preprocessor/_compare_with_refs.py
+++ b/esmvalcore/preprocessor/_compare_with_refs.py
@@ -628,45 +628,41 @@ def t_test(
     This is a test for the null hypothesis that 2 independent samples (dataset
     and reference dataset) have identical average (expected) values.
 
-    The reference dataset needs to be broadcastable to all input `products`.
-    This supports `iris' rich broadcasting abilities
-    <https://scitools-iris.readthedocs.io/en/stable/userguide/cube_maths.
-    html#calculating-a-cube-anomaly>`__. To ensure this, the preprocessors
-    :func:`esmvalcore.preprocessor.regrid` and/or
-    :func:`esmvalcore.preprocessor.regrid_time` might be helpful.
+    All input datasets need to have the same shape. To ensure this, the
+    preprocessors :func:`esmvalcore.preprocessor.regrid`  might be helpful.
 
     Notes
     -----
-    The reference dataset can be specified with the `reference` argument. If
-    `reference` is ``None``, exactly one input dataset in the `products` set
-    needs to have the facet ``reference_for_t_test: true`` defined in the
-    recipe. Please do **not** specify the option `reference` when using this
+    The reference dataset can be specified with the ``reference`` argument. If
+    ``reference`` is ``None``, exactly one input dataset in the ``products``
+    set needs to have the facet ``reference_for_t_test: true`` defined in the
+    recipe. Please do **not** specify the option ``reference`` when using this
     preprocessor function in a recipe.
 
     Parameters
     ----------
     products:
-        Input datasets/cubes for which the significance is calculated relative
-        to a reference dataset/cube.
+        Input datasets/cubes for which the t-test is performed relative to a
+        reference dataset/cube.
     reference:
         Cube which is used as reference for the t-test calculation. If
         ``None``, `products` needs to be a :obj:`set` of
-        `~esmvalcore.preprocessor.PreprocessorFile` objects and exactly one
-        dataset in `products` needs the facet ``reference_for_t_test: true``.
-        Do not specify this argument in a recipe.
+        :class:`~esmvalcore.preprocessor.PreprocessorFile` objects and exactly
+        one dataset in ``products`` needs the facet ``reference_for_t_test:
+        true``. Do not specify this argument in a recipe.
     coordinate:
         Coordinate axis of the input along which to compute the statistic.
     **kwargs:
         Additional keyword arguments passed to :func:`scipy.stats.ttest_ind`
-        (not all input data are lazy) or
-        :func:`dask.array.stats.ttest_ind.ttest_ind` (all input data are lazy).
+        (not all input data are lazy) or :func:`dask.array.stats.ttest_ind`
+        (all input data are lazy).
 
     Returns
     -------
     :
         Output datasets/cubes. Will be a :obj:`set` of
         :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if
-        `products` is also one, a :class:`~iris.cube.CubeList` otherwise. The
+        ``products`` is also one, a :class:`~iris.cube.CubeList` otherwise. The
         cubes contain an attached ancillary variable which is the p-value
         w.r.t. the chosen t-test.
 
@@ -742,7 +738,7 @@ def _calculate_t_test(
         raise CoordinateMultiDimError(cube.coord(coordinate))
     axis = cube.coord_dims(coordinate)[0]
     remaining_axes = tuple(sorted(set(range(cube.ndim)) - {axis}))
-    shape = tuple(cube.shape[a] for a in remaining_axes)
+    target_shape = tuple(cube.shape[a] for a in remaining_axes)
 
     if cube.has_lazy_data() and reference.has_lazy_data():
         t_test = dask.array.stats.ttest_ind(
@@ -752,7 +748,7 @@ def _calculate_t_test(
             **kwargs,
         )
         # For some reason, t_test.pvalue is not a da.array, but a dask.Delayed
-        p_value_arr = da.from_delayed(t_test.pvalue, shape, cube.dtype)
+        p_value_arr = da.from_delayed(t_test.pvalue, target_shape, cube.dtype)
         p_value_arr = da.ma.masked_invalid(p_value_arr)
     else:
         # Avoid realizing cube.data