From e61cbec145a3c5a46edf2845e36de1471e1c8d21 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 22 Mar 2022 13:49:50 +0100 Subject: [PATCH 1/5] add _repr_html_ for PandasMultiIndexingAdapter This may greatly speed-up the html repr of Xarray objects with multi-indexes This optimized _repr_html_ is now used for formatting the array detailed view of multi-index coordinates, instead of converting the full index / levels to numpy arrays before formatting it. --- xarray/core/indexing.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c797e6652de..776c61bd739 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -5,6 +5,7 @@ from contextlib import suppress from dataclasses import dataclass, field from datetime import timedelta +from html import escape from typing import ( TYPE_CHECKING, Any, @@ -1513,23 +1514,31 @@ def __repr__(self) -> str: ) return f"{type(self).__name__}{props}" - def _repr_inline_(self, max_width) -> str: - # special implementation to speed-up the repr for big multi-indexes + def _get_array_subset(self, size: int) -> np.ndarray: + # used to speed-up the repr for big multi-indexes + + if self.size > 200 and size < self.size: + n_values = size + indices = np.concatenate([np.arange(0, n_values), np.arange(-n_values, 0)]) + subset = self[OuterIndexer((indices,))] + else: + subset = self + + return np.asarray(subset) + + def _repr_inline_(self, max_width: int) -> str: + from .formatting import format_array_flat + if self.level is None: return "MultiIndex" else: - from .formatting import format_array_flat + return format_array_flat(self._get_array_subset(max_width), max_width) - if self.size > 100 and max_width < self.size: - n_values = max_width - indices = np.concatenate( - [np.arange(0, n_values), np.arange(-n_values, 0)] - ) - subset = self[OuterIndexer((indices,))] - else: - subset = self + def _repr_html_(self) -> str: + 
from .formatting import short_numpy_repr - return format_array_flat(np.asarray(subset), max_width) + array_repr = short_numpy_repr(self._get_array_subset(200)) + return f"
<pre>{escape(array_repr)}</pre>
" def copy(self, deep: bool = True) -> "PandasMultiIndexingAdapter": # see PandasIndexingAdapter.copy From afc3c8a64fea69816db9cf170e419a3acf0b65e9 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 22 Mar 2022 13:59:38 +0100 Subject: [PATCH 2/5] update release notes --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e88faee2a43..2e063e8d44d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -50,6 +50,8 @@ Bug fixes By `Benoît Bovy `_. - Fixed "unhashable type" error trying to read NetCDF file with variable having its 'units' attribute not ``str`` (e.g. ``numpy.ndarray``) (:issue:`6368`). By `Oleh Khoma `_. +- Fixed the poor html repr performance on large multi-indexes (:pull:`5529`). + By `Benoît Bovy `_. Documentation ~~~~~~~~~~~~~ From 3e03e6ea6e0ae30a02f11f48863a7a8323cb9130 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 22 Mar 2022 14:18:45 +0100 Subject: [PATCH 3/5] nit --- xarray/core/indexing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 776c61bd739..ac81f146c88 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1516,10 +1516,8 @@ def __repr__(self) -> str: def _get_array_subset(self, size: int) -> np.ndarray: # used to speed-up the repr for big multi-indexes - if self.size > 200 and size < self.size: - n_values = size - indices = np.concatenate([np.arange(0, n_values), np.arange(-n_values, 0)]) + indices = np.concatenate([np.arange(0, size), np.arange(-size, 0)]) subset = self[OuterIndexer((indices,))] else: subset = self From 49d472b974f2d2281fd8ad0b09bc7fa7e31de270 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 28 Mar 2022 17:58:19 +0200 Subject: [PATCH 4/5] add display_values_threshold --- doc/whats-new.rst | 6 +++++- xarray/core/formatting.py | 6 +++++- xarray/core/indexing.py | 13 ++++++++----- xarray/core/options.py | 6 ++++++ xarray/tests/test_formatting.py | 
6 ++++++ 5 files changed, 30 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2e063e8d44d..ced03a95e99 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,6 +28,10 @@ New Features - Multi-index levels are now accessible through their own, regular coordinates instead of virtual coordinates (:pull:`5692`). By `Benoît Bovy `_. +- Add a ``display_values_threshold`` option to control the total number of array + elements which trigger summarization rather than full repr in (numpy) array + detailed views of the html repr (:pull:`6400`). + By `Benoît Bovy `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -50,7 +54,7 @@ Bug fixes By `Benoît Bovy `_. - Fixed "unhashable type" error trying to read NetCDF file with variable having its 'units' attribute not ``str`` (e.g. ``numpy.ndarray``) (:issue:`6368`). By `Oleh Khoma `_. -- Fixed the poor html repr performance on large multi-indexes (:pull:`5529`). +- Fixed the poor html repr performance on large multi-indexes (:pull:`6400`). By `Benoît Bovy `_. Documentation diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 81617ae38f9..e372e3bdd40 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -520,7 +520,11 @@ def short_numpy_repr(array): # default to lower precision so a full (abbreviated) line can fit on # one line with the default display_width - options = {"precision": 6, "linewidth": OPTIONS["display_width"], "threshold": 200} + options = { + "precision": 6, + "linewidth": OPTIONS["display_width"], + "threshold": OPTIONS["display_values_threshold"], + } if array.ndim < 3: edgeitems = 3 elif array.ndim == 3: diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index ac81f146c88..641d43fb2ac 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -26,6 +26,7 @@ from . 
import duck_array_ops, nputils, utils from .npcompat import DTypeLike +from .options import OPTIONS from .pycompat import ( dask_array_type, dask_version, @@ -1514,10 +1515,12 @@ def __repr__(self) -> str: ) return f"{type(self).__name__}{props}" - def _get_array_subset(self, size: int) -> np.ndarray: + def _get_array_subset(self) -> np.ndarray: # used to speed-up the repr for big multi-indexes - if self.size > 200 and size < self.size: - indices = np.concatenate([np.arange(0, size), np.arange(-size, 0)]) + threshold = max(100, OPTIONS["display_values_threshold"] + 2) + if self.size > threshold: + pos = threshold // 2 + indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)]) subset = self[OuterIndexer((indices,))] else: subset = self @@ -1530,12 +1533,12 @@ def _repr_inline_(self, max_width: int) -> str: if self.level is None: return "MultiIndex" else: - return format_array_flat(self._get_array_subset(max_width), max_width) + return format_array_flat(self._get_array_subset(), max_width) def _repr_html_(self) -> str: from .formatting import short_numpy_repr - array_repr = short_numpy_repr(self._get_array_subset(200)) + array_repr = short_numpy_repr(self._get_array_subset()) return f"
<pre>{escape(array_repr)}</pre>
" def copy(self, deep: bool = True) -> "PandasMultiIndexingAdapter": diff --git a/xarray/core/options.py b/xarray/core/options.py index 0c45e126fe6..399afe90b66 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -15,6 +15,7 @@ class T_Options(TypedDict): cmap_divergent: Union[str, "Colormap"] cmap_sequential: Union[str, "Colormap"] display_max_rows: int + display_values_threshold: int display_style: Literal["text", "html"] display_width: int display_expand_attrs: Literal["default", True, False] @@ -33,6 +34,7 @@ class T_Options(TypedDict): "cmap_divergent": "RdBu_r", "cmap_sequential": "viridis", "display_max_rows": 12, + "display_values_threshold": 200, "display_style": "html", "display_width": 80, "display_expand_attrs": "default", @@ -57,6 +59,7 @@ def _positive_integer(value): _VALIDATORS = { "arithmetic_join": _JOIN_OPTIONS.__contains__, "display_max_rows": _positive_integer, + "display_values_threshold": _positive_integer, "display_style": _DISPLAY_OPTIONS.__contains__, "display_width": _positive_integer, "display_expand_attrs": lambda choice: choice in [True, False, "default"], @@ -154,6 +157,9 @@ class set_options: * ``default`` : to expand unless over a pre-defined limit display_max_rows : int, default: 12 Maximum display rows. + display_values_threshold : int, default: 200 + Total number of array elements which trigger summarization rather + than full repr for variable data views (numpy arrays). display_style : {"text", "html"}, default: "html" Display style to use in jupyter for xarray objects. display_width : int, default: 80 diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 105cec7e850..efdb8a57288 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -479,6 +479,12 @@ def test_short_numpy_repr() -> None: num_lines = formatting.short_numpy_repr(array).count("\n") + 1 assert num_lines < 30 + # threshold option (default: 200) + array = np.arange(100) + assert "..." 
not in formatting.short_numpy_repr(array) + with xr.set_options(display_values_threshold=10): + assert "..." in formatting.short_numpy_repr(array) + def test_large_array_repr_length() -> None: From b8f732c61a86be5d1e8efbf3a906f9a5f69c31fd Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 28 Mar 2022 18:05:55 +0200 Subject: [PATCH 5/5] fix last merge main --- xarray/core/indexing.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 35df801db5b..27bd4954bc4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -27,13 +27,7 @@ from . import duck_array_ops, nputils, utils from .npcompat import DTypeLike from .options import OPTIONS -from .pycompat import ( - dask_array_type, - dask_version, - integer_types, - is_duck_dask_array, - sparse_array_type, -) +from .pycompat import dask_version, integer_types, is_duck_dask_array, sparse_array_type from .types import T_Xarray from .utils import either_dict_or_kwargs, get_valid_numpy_dtype