diff --git a/esmvalcore/config/configurations/defaults/extra_facets_emac.yml b/esmvalcore/config/configurations/defaults/extra_facets_emac.yml index 169a69860f..8ac3d9d05d 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_emac.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_emac.yml @@ -257,9 +257,12 @@ projects: ta: raw_name: [tm1_cav, tm1_ave, tm1] channel: Amon + # tro3: + # raw_name: [O3_cav, O3_ave, O3] + # channel: Amon tro3: - raw_name: [O3_cav, O3_ave, O3] - channel: Amon + raw_name: [O3_p39_cav, O3_cav, O3_ave, O3] + channel: AERmon ua: raw_name: [um1_cav, um1_ave, um1] channel: Amon diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 798edb36a8..520c67ddb0 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -31,7 +31,11 @@ meridional_statistics, zonal_statistics, ) -from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric +from esmvalcore.preprocessor._compare_with_refs import ( + bias, + distance_metric, + t_test, +) from esmvalcore.preprocessor._concatenate import concatenate from esmvalcore.preprocessor._cycles import amplitude from esmvalcore.preprocessor._dask_progress import _compute_with_progress @@ -217,6 +221,7 @@ # Multi model statistics "multi_model_statistics", # Comparison with reference datasets + "t_test", "bias", "distance_metric", # Remove supplementary variables from cube @@ -260,6 +265,7 @@ "multi_model_statistics", "mask_multimodel", "mask_fillvalues", + "t_test", } diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py index 4e6fc6c180..3583a97961 100644 --- a/esmvalcore/preprocessor/_compare_with_refs.py +++ b/esmvalcore/preprocessor/_compare_with_refs.py @@ -8,13 +8,15 @@ import dask import dask.array as da +import dask.array.stats import iris.analysis import iris.analysis.stats import numpy as np from iris.common.metadata import CubeMetadata -from iris.coords import CellMethod +from iris.coords import AncillaryVariable, CellMethod from iris.cube import Cube, CubeList -from scipy.stats import wasserstein_distance +from iris.exceptions import CoordinateMultiDimError +from scipy.stats import ttest_ind, wasserstein_distance from esmvalcore.iris_helpers import ( ignore_iris_vague_metadata_warnings, @@ -33,7 +35,7 @@ if TYPE_CHECKING: from collections.abc import Iterable - from iris.coords import Coord + from iris.coords import AuxCoord, Coord, DimCoord from esmvalcore.preprocessor import PreprocessorFile @@ -200,6 +202,16 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube: """Calculate bias for a single cube relative to a reference cube.""" cube_metadata = cube.metadata + # Save ancillary variables and cell measures as they get removed in the + # bias calculation + ancillary_variables_and_dims = [ + (a, cube.ancillary_variable_dims(a)) + for a in cube.ancillary_variables() + ] + cell_measures_and_dims = [ + (c, cube.cell_measure_dims(c)) for c in cube.cell_measures() + ] + if bias_type == "absolute": cube = cube - reference new_units = cube.units @@ -216,6 +228,12 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube: cube.metadata = cube_metadata cube.units = new_units + # Reattach ancillary variables and cell measures + for ancillary_variable, dims in ancillary_variables_and_dims: + cube.add_ancillary_variable(ancillary_variable, dims) + for cell_measure, dims in cell_measures_and_dims: + cube.add_cell_measure(cell_measure, dims) + return cube @@ -597,3 +615,172 @@ def _get_emd( if np.ma.is_masked(arr) or np.ma.is_masked(ref_arr): return np.ma.masked # this is safe because PMFs will be masked arrays return wasserstein_distance(bin_centers, bin_centers, arr, ref_arr) + + +def t_test( + products: set[PreprocessorFile] | Iterable[Cube], + reference: Cube | None = None, + coordinate: str | AuxCoord | DimCoord = "time", + **kwargs: Any, +) -> set[PreprocessorFile] | CubeList: + """Perform t-test between dataset and reference dataset. + + This is a test for the null hypothesis that 2 independent samples (dataset + and reference dataset) have identical average (expected) values. + + All input datasets need to have the same shape. To ensure this, the + preprocessors :func:`esmvalcore.preprocessor.regrid` might be helpful. + + Notes + ----- + The reference dataset can be specified with the ``reference`` argument. If + ``reference`` is ``None``, exactly one input dataset in the ``products`` + set needs to have the facet ``reference_for_t_test: true`` defined in the + recipe. Please do **not** specify the option ``reference`` when using this + preprocessor function in a recipe. + + Parameters + ---------- + products: + Input datasets/cubes for which the t-test is performed relative to a + reference dataset/cube. + reference: + Cube which is used as reference for the t-test calculation. If + ``None``, `products` needs to be a :obj:`set` of + :class:`~esmvalcore.preprocessor.PreprocessorFile` objects and exactly + one dataset in ``products`` needs the facet ``reference_for_t_test: + true``. Do not specify this argument in a recipe. + coordinate: + Coordinate axis of the input along which to compute the statistic. + **kwargs: + Additional keyword arguments passed to :func:`scipy.stats.ttest_ind` + (not all input data are lazy) or :func:`dask.array.stats.ttest_ind` + (all input data are lazy). + + Returns + ------- + : + Output datasets/cubes. Will be a :obj:`set` of + :class:`~esmvalcore.preprocessor.PreprocessorFile` objects if + ``products`` is also one, a :class:`~iris.cube.CubeList` otherwise. The + cubes contain an attached ancillary variable which is the p-value + w.r.t. the chosen t-test. + + Raises + ------ + ValueError + Not exactly one input datasets contains the facet + ``reference_for_t_test: true`` if ``reference=None``; + ``reference=None`` and the input products are given as iterable of + :class:`~iris.cube.Cube` objects. + CoordinateMultiDimError + ``coordinate`` is not 1D. + + """ + ref_product = None + all_cubes_given = all(isinstance(p, Cube) for p in products) + + # Get reference cube if not explicitly given + if reference is None: + if all_cubes_given: + msg = ( + "A list of Cubes is given to this preprocessor; please " + "specify a `reference`" + ) + raise ValueError(msg) + (reference, ref_product) = _get_ref(products, "reference_for_t_test") + else: + ref_product = None + + # If input is an Iterable of Cube objects, calculate t-test for each element + if all_cubes_given: + cubes = [ + _calculate_t_test(c, reference, coordinate, **kwargs) + for c in products + ] + return CubeList(cubes) + + # Otherwise, iterate over all input products, calculate t-test and adapt + # metadata and provenance information accordingly + output_products = set() + for product in products: + if product == ref_product: + continue + cube = concatenate(product.cubes) + + # Calculate t-test + cube = _calculate_t_test(cube, reference, coordinate, **kwargs) + + # Adapt metadata and provenance information + if ref_product is not None: + product.wasderivedfrom(ref_product) + + product.cubes = CubeList([cube]) + output_products.add(product) + + # Add reference dataset to output + if ref_product is not None: + output_products.add(ref_product) + + return output_products + + +def _calculate_t_test( + cube: Cube, + reference: Cube, + coordinate: str | AuxCoord | DimCoord, + **kwargs: Any, +) -> Cube: + """Calculate the t-test and attach the p-value as ancillary variable to cube.""" + cube = cube.copy() # do not modify input cube + + if cube.coord(coordinate).ndim > 1: + raise CoordinateMultiDimError(cube.coord(coordinate)) + axis = cube.coord_dims(coordinate)[0] + remaining_axes = tuple(sorted(set(range(cube.ndim)) - {axis})) + target_shape = tuple(cube.shape[a] for a in remaining_axes) + + if cube.has_lazy_data() and reference.has_lazy_data(): + t_test = dask.array.stats.ttest_ind( + da.ma.filled(cube.core_data(), np.nan), + da.ma.filled(reference.core_data(), np.nan), + axis=axis, + **kwargs, + ) + # For some reason, t_test.pvalue is not a da.array, but a dask.Delayed + p_value_arr = da.from_delayed(t_test.pvalue, target_shape, cube.dtype) + p_value_arr = da.ma.masked_invalid(p_value_arr) + else: + # Avoid realizing cube.data + cube_data = np.ma.filled( + cube.lazy_data().compute() if cube.has_lazy_data() else cube.data, + np.nan, + ) + ref_data = np.ma.filled( + reference.lazy_data().compute() + if reference.has_lazy_data() + else reference.data, + np.nan, + ) + t_test = ttest_ind(cube_data, ref_data, axis=axis, **kwargs) + p_value_arr = np.ma.masked_invalid(t_test.pvalue) + + # Ensure that float dtype from parents is inherited + if ( + np.issubdtype(cube.dtype, np.floating) + and np.issubdtype(reference.dtype, np.floating) + and cube.dtype == reference.dtype + ): + p_value_arr = p_value_arr.astype(cube.dtype) + + cube.add_ancillary_variable( + AncillaryVariable( + p_value_arr, + long_name="p-value", + var_name="pvalue", + units="1", + ), + remaining_axes, + ) + + return cube diff --git a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py index d9de9c7d4b..736058febc 100644 --- a/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py +++ b/tests/unit/preprocessor/_compare_with_refs/test_compare_with_refs.py @@ -1,17 +1,25 @@ """Unit tests for :mod:`esmvalcore.preprocessor._compare_with_refs`.""" +from __future__ import annotations + import contextlib +import re +from typing import Any import dask.array as da import iris import numpy as np import pytest from cf_units import Unit -from iris.coords import CellMeasure, CellMethod +from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, CellMethod from iris.cube import Cube, CubeList -from iris.exceptions import CoordinateNotFoundError +from iris.exceptions import CoordinateMultiDimError, CoordinateNotFoundError -from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric +from esmvalcore.preprocessor._compare_with_refs import ( + bias, + distance_metric, + t_test, +) from tests import PreprocessorFile @@ -343,6 +351,41 @@ def test_bias_products_and_ref_cube( assert product_a.mock_ancestors == set() +@pytest.mark.parametrize(("bias_type", "data", "units"), TEST_BIAS) +def test_bias_cubes_preserve_metadata( + regular_cubes, + ref_cubes, + bias_type, + data, + units, +): + """Test calculation of bias with cubes.""" + cube = regular_cubes[0] + ancillary_var = AncillaryVariable([0, 1], var_name="a") + cell_measure = CellMeasure([0, 1], var_name="c") + cube.add_ancillary_variable(ancillary_var, 0) + cube.add_cell_measure(cell_measure, 1) + ref_cube = ref_cubes[0] + + out_cubes = bias(regular_cubes, ref_cube, bias_type=bias_type) + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == units + assert out_cube.dim_coords == cube.dim_coords + assert out_cube.aux_coords == cube.aux_coords + assert out_cube.ancillary_variables() == [ancillary_var] + assert out_cube.ancillary_variable_dims(ancillary_var) == (0,) + assert out_cube.cell_measures() == [cell_measure] + assert out_cube.cell_measure_dims(cell_measure) == (1,) + + def test_no_reference_for_bias(regular_cubes, ref_cubes): """Test fail when no reference_for_bias is given.""" products = { @@ -382,8 +425,8 @@ def test_invalid_bias_type(regular_cubes, ref_cubes): bias(products, bias_type="invalid_bias_type") -def test_reference_none_cubes(regular_cubes): - """Test reference=None with with cubes.""" +def test_bias_reference_none_cubes(regular_cubes): + """Test bias with reference=None and cubes given.""" msg = ( "A list of Cubes is given to this preprocessor; please specify a " "`reference`" @@ -932,3 +975,291 @@ def test_distance_metric_no_lon_for_area_weights(regular_cubes, metric, error): reference=ref_cube, coords=["time", "latitude"], ) + + +def assert_correct_p_value( + cube: Cube, + data: Any, # noqa: ANN401 + *, + is_lazy: bool = False, +) -> None: + assert len(cube.ancillary_variables()) == 1 + p_value = cube.ancillary_variables()[0] + assert p_value.standard_name is None + assert p_value.long_name == "p-value" + assert p_value.var_name == "pvalue" + assert p_value.units == "1" + assert p_value.attributes == {} + assert p_value.has_lazy_data() is is_lazy + assert p_value.dtype == np.float32 + if np.ma.is_masked(data): + np.testing.assert_equal(p_value.data.mask, data.mask) + np.testing.assert_allclose(p_value.data, data) + + +@pytest.mark.parametrize("use_reference_product", [True, False]) +def test_t_test_products(regular_cubes, ref_cubes, use_reference_product): # noqa: PLR0915 + """Test calculation of t_test with products.""" + products = { + PreprocessorFile(regular_cubes, "A", {"dataset": "a"}), + PreprocessorFile(regular_cubes, "B", {"dataset": "b"}), + } + + if use_reference_product: + ref_product = PreprocessorFile( + ref_cubes, + "REF", + {"reference_for_t_test": True}, + ) + products.add(ref_product) + expected_ancestors = {ref_product} + reference = None + else: + expected_ancestors = set() + reference = ref_cubes[0] + + out_products = t_test(products, reference=reference) + + assert isinstance(out_products, set) + out_dict = products_set_to_dict(out_products) + assert len(out_dict) == len(products) + + product_a = out_dict["A"] + assert product_a.filename == "A" + assert product_a.attributes == {"dataset": "a"} + assert len(product_a.cubes) == 1 + out_cube = product_a.cubes[0] + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, regular_cubes[0].data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + assert_correct_p_value( + out_cube, + [[1.0, 0.6666667], [0.42264974, 0.46547753]], + ) + assert product_a.wasderivedfrom.call_count == len(expected_ancestors) + assert product_a.mock_ancestors == expected_ancestors + + product_b = out_dict["B"] + assert product_b.filename == "B" + assert product_b.attributes == {"dataset": "b"} + assert len(product_b.cubes) == 1 + out_cube = product_b.cubes[0] + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, regular_cubes[0].data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + assert_correct_p_value( + out_cube, + [[1.0, 0.6666667], [0.42264974, 0.46547753]], + ) + assert product_b.wasderivedfrom.call_count == len(expected_ancestors) + assert product_b.mock_ancestors == expected_ancestors + + if use_reference_product: + product_ref = out_dict["REF"] + assert product_ref.filename == "REF" + assert product_ref.attributes == {"reference_for_t_test": True} + assert len(product_ref.cubes) == 1 + out_cube = product_ref.cubes[0] + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, ref_cubes[0].data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == ref_cubes[0].dim_coords + assert out_cube.aux_coords == ref_cubes[0].aux_coords + assert out_cube.ancillary_variables() == [] + product_ref.wasderivedfrom.assert_not_called() + assert product_ref.mock_ancestors == set() + + +@pytest.mark.parametrize("lazy", [True, False]) +def test_t_test_cubes(regular_cubes, ref_cubes, lazy): + """Test calculation of t_test with cubes.""" + cube = regular_cubes[0] + ref_cube = ref_cubes[0] + if lazy: + cube.data = cube.lazy_data() + ref_cube.data = ref_cube.lazy_data() + + out_cubes = t_test([cube], ref_cube) + + assert cube.has_lazy_data() is lazy + assert ref_cube.has_lazy_data() is lazy + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.has_lazy_data() is lazy + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, cube.data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + assert_correct_p_value( + out_cube, + [[1.0, 0.6666667], [0.42264974, 0.46547753]], + is_lazy=lazy, + ) + + +@pytest.mark.parametrize("lazy", [True, False]) +def test_t_test_cubes_1d(regular_cubes, ref_cubes, lazy): + """Test calculation of t_test with 1D cubes.""" + cube = regular_cubes[0][0, :, 0] + ref_cube = ref_cubes[0][0, :, 0] + if lazy: + cube.data = cube.lazy_data() + ref_cube.data = ref_cube.lazy_data() + + out_cubes = t_test( + [cube], + ref_cube, + coordinate="latitude", + equal_var=False, + ) + + assert cube.has_lazy_data() is lazy + assert ref_cube.has_lazy_data() is lazy + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.has_lazy_data() is lazy + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, cube.data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == cube.dim_coords + assert out_cube.aux_coords == cube.aux_coords + assert_correct_p_value(out_cube, [0.4999999701976776], is_lazy=lazy) + + +@pytest.mark.parametrize("ref_lazy", [True, False]) +def test_t_test_cubes_partly_lazy(regular_cubes, ref_cubes, ref_lazy): + """Test calculation of t_test with cubes.""" + cube = regular_cubes[0] + ref_cube = ref_cubes[0] + if ref_lazy: + ref_cube.data = ref_cube.lazy_data() + else: + cube.data = cube.lazy_data() + + out_cubes = t_test([cube], ref_cube) + + assert cube.has_lazy_data() is not ref_lazy + assert ref_cube.has_lazy_data() is ref_lazy + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.has_lazy_data() is not ref_lazy + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, cube.data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + assert_correct_p_value( + out_cube, + [[1.0, 0.6666667], [0.42264974, 0.46547753]], + ) + + +@pytest.mark.parametrize("lazy", [True, False]) +def test_t_test_cubes_masked(regular_cubes, ref_cubes, lazy): + """Test calculation of t_test with cubes (masked data).""" + cube = regular_cubes[0] + cube.data = np.ma.masked_inside(cube.data, 1.5, 3.5) + cube.data = np.ma.masked_greater(cube.data, 6.5) + ref_cube = ref_cubes[0] + ref_cube.data = np.ma.masked_invalid( + np.array( + [[[2.0, np.nan], [2.0, 2.0]], [[2.0, 2.0], [2.0, 2.0]]], + dtype=np.float32, + ), + ) + if lazy: + cube.data = cube.lazy_data() + ref_cube.data = ref_cube.lazy_data() + + out_cubes = t_test([cube], ref_cube) + + assert cube.has_lazy_data() is lazy + assert ref_cube.has_lazy_data() is lazy + + assert isinstance(out_cubes, CubeList) + assert len(out_cubes) == 1 + out_cube = out_cubes[0] + + assert out_cube.has_lazy_data() is lazy + assert out_cube.dtype == np.float32 + assert_allclose(out_cube.data, cube.data) + assert out_cube.var_name == "tas" + assert out_cube.standard_name == "air_temperature" + assert out_cube.units == "K" + assert out_cube.dim_coords == regular_cubes[0].dim_coords + assert out_cube.aux_coords == regular_cubes[0].aux_coords + assert_correct_p_value( + out_cube, + np.ma.masked_invalid([[1.0, np.nan], [np.nan, np.nan]]), + is_lazy=lazy, + ) + + +def test_t_test_multidim_coord_fail(regular_cubes): + """Test t_test with multidimensional coordinate.""" + cube = regular_cubes[0] + x_coord = AuxCoord([[0, 0], [0, 0]], var_name="x") + cube.add_aux_coord(x_coord, (0, 1)) + msg = r"Multi-dimensional coordinate not supported: 'x'" + with pytest.raises(CoordinateMultiDimError, match=re.escape(msg)): + t_test([cube], reference=cube, coordinate=x_coord) + + +def test_t_test_reference_none_cubes_fail(regular_cubes): + """Test t_test with reference=None and cubes given.""" + msg = ( + r"A list of Cubes is given to this preprocessor; please specify a " + r"`reference`" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + t_test(regular_cubes) + + +def test_no_reference_for_t_test_fail(regular_cubes, ref_cubes): + """Test fail when no reference_for_t_test is given.""" + products = { + PreprocessorFile(regular_cubes, "A", {}), + PreprocessorFile(regular_cubes, "B", {}), + PreprocessorFile(ref_cubes, "REF", {}), + } + msg = r"Expected exactly 1 dataset with 'reference_for_t_test: true', found 0" + with pytest.raises(ValueError, match=re.escape(msg)): + t_test(products) + + +def test_two_references_for_t_test_fail(regular_cubes, ref_cubes): + """Test fail when two reference_for_t_test products are given.""" + products = { + PreprocessorFile(regular_cubes, "A", {"reference_for_t_test": False}), + PreprocessorFile(ref_cubes, "REF1", {"reference_for_t_test": True}), + PreprocessorFile(ref_cubes, "REF2", {"reference_for_t_test": True}), + } + msg = r"Expected exactly 1 dataset with 'reference_for_t_test: true', found 2" + with pytest.raises(ValueError, match=re.escape(msg)): + t_test(products)