From f3680469793e1212295eb5ca7089bd9d1121a566 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 3 Dec 2015 17:22:08 -0800 Subject: [PATCH] Rework DataArray internals Fixes GH367 Fixes GH634 The internal data model used by :py:class:`~xray.DataArray` has been rewritten to fix several outstanding issues (:issue:`367`, :issue:`634`, `this stackoverflow report`_). Internally, ``DataArray`` is now implemented in terms of ``._variable`` and ``._coords`` attributes instead of holding variables in a ``Dataset`` object. --- doc/whats-new.rst | 58 ++++++ xray/core/alignment.py | 10 + xray/core/combine.py | 25 ++- xray/core/common.py | 4 +- xray/core/coordinates.py | 91 +++++---- xray/core/dataarray.py | 391 +++++++++++++++++++----------------- xray/core/dataset.py | 220 +++++--------------- xray/core/groupby.py | 24 +-- xray/core/merge.py | 170 ++++++++++++++++ xray/core/variable.py | 22 +- xray/test/__init__.py | 3 +- xray/test/test_backends.py | 4 +- xray/test/test_combine.py | 9 +- xray/test/test_dask.py | 2 +- xray/test/test_dataarray.py | 86 ++++++-- xray/test/test_dataset.py | 19 +- xray/test/test_plot.py | 2 +- xray/test/test_variable.py | 18 +- 18 files changed, 702 insertions(+), 456 deletions(-) create mode 100644 xray/core/merge.py diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4d405c7e4b2..3461433f9a2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -9,6 +9,64 @@ What's New import xray np.random.seed(123456) +v0.7.0 (unreleased) +------------------- + +.. _v0.7.0.breaking: + +Breaking changes +~~~~~~~~~~~~~~~~ + +- The internal data model used by :py:class:`~xray.DataArray` has been + rewritten to fix several outstanding issues (:issue:`367`, :issue:`634`, + `this stackoverflow report`_). Internally, ``DataArray`` is now implemented + in terms of ``._variable`` and ``._coords`` attributes instead of holding + variables in a ``Dataset`` object. 
+ + This refactor ensures that if a DataArray has the + same name as one of its coordinates, the array and the coordinate no longer + share the same data. + + In practice, this means that creating a DataArray with the same ``name`` as + one of its dimensions no longer automatically uses that array to label the + corresponding coordinate. You will now need to provide coordinate labels + explicitly. Here's the old behavior: + + .. ipython:: + :verbatim: + + In [2]: xray.DataArray([4, 5, 6], dims='x', name='x') + Out[2]: + <xray.DataArray 'x' (x: 3)> + array([4, 5, 6]) + Coordinates: + * x (x) int64 4 5 6 + + and the new behavior (compare the values of the ``x`` coordinate): + + .. ipython:: + :verbatim: + + In [2]: xray.DataArray([4, 5, 6], dims='x', name='x') + Out[2]: + <xray.DataArray 'x' (x: 3)> + array([4, 5, 6]) + Coordinates: + * x (x) int64 0 1 2 + +- It is no longer possible to convert a DataArray to a Dataset with + :py:meth:`xray.DataArray.to_dataset` if it is unnamed. This will now + raise ``ValueError``. If the array is unnamed, you need to supply the + ``name`` argument. + +.. _this stackoverflow report: http://stackoverflow.com/questions/33158558/python-xray-extract-first-and-last-time-value-within-each-month-of-a-timeseries + +Bug fixes +~~~~~~~~~ + +- Fixes for several issues found on ``DataArray`` objects with the same name + as one of their coordinates (see :ref:`v0.7.0.breaking` for more details). + v0.6.2 (unreleased) ------------------- diff --git a/xray/core/alignment.py b/xray/core/alignment.py index 9afb04feac9..737fc4494cb 100644 --- a/xray/core/alignment.py +++ b/xray/core/alignment.py @@ -100,6 +100,16 @@ def partial_align(*objects, **kwargs): return tuple(obj.reindex(copy=copy, **joined_indexes) for obj in objects) +def align_variables(variables, join='outer', copy=False): + """Align all DataArrays in the provided dict, leaving other values alone. 
+ """ + alignable = [k for k, v in variables.items() if hasattr(v, 'indexes')] + aligned = align(*[variables[a] for a in alignable], join=join, copy=copy) + new_variables = OrderedDict(variables) + new_variables.update(zip(alignable, aligned)) + return new_variables + + def reindex_variables(variables, indexes, indexers, method=None, tolerance=None, copy=True): """Conform a dictionary of aligned variables onto a new set of variables, diff --git a/xray/core/combine.py b/xray/core/combine.py index b331232c1bd..6946981da57 100644 --- a/xray/core/combine.py +++ b/xray/core/combine.py @@ -4,7 +4,7 @@ from . import utils from .pycompat import iteritems, reduce, OrderedDict, basestring -from .variable import Variable +from .variable import Variable, as_variable, Coordinate def concat(objs, dim=None, data_vars='all', coords='different', @@ -120,8 +120,6 @@ def _calc_concat_dim_coord(dim): Infer the dimension name and 1d coordinate variable (if appropriate) for concatenating along the new dimension. 
""" - from .dataarray import DataArray - if isinstance(dim, basestring): coord = None elif not hasattr(dim, 'dims'): @@ -129,8 +127,11 @@ def _calc_concat_dim_coord(dim): dim_name = getattr(dim, 'name', None) if dim_name is None: dim_name = 'concat_dim' - coord = DataArray(dim, dims=dim_name, name=dim_name) + coord = Coordinate(dim_name, dim) dim = dim_name + elif not hasattr(dim, 'name'): + coord = as_variable(dim).to_coord() + dim, = coord.dims else: coord = dim dim, = coord.dims @@ -207,6 +208,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions): concat_over = _calc_concat_over(datasets, dim, data_vars, coords) def insert_result_variable(k, v): + assert isinstance(v, Variable) if k in datasets[0].coords: result_coord_names.add(k) result_vars[k] = v @@ -267,22 +269,19 @@ def ensure_common_dims(vars): combined = Variable.concat(vars, dim, positions) insert_result_variable(k, combined) - # result._coord_names.update(datasets[0].coords) + result = Dataset(result_vars, attrs=result_attrs) + result = result.set_coords(result_coord_names) if coord is not None: # add concat dimension last to ensure that its in the final Dataset - insert_result_variable(coord.name, coord) - # result[coord.name] = coord - - result = Dataset(result_vars, attrs=result_attrs) - result = result.set_coords(result_coord_names) + result[coord.name] = coord return result def _dataarray_concat(arrays, dim, data_vars, coords, compat, positions): - from .dataarray import DataArray + arrays = list(arrays) if data_vars != 'all': raise ValueError('data_vars is not a valid argument when ' @@ -297,11 +296,11 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat, raise ValueError('array names not identical') else: arr = arr.rename(name) - datasets.append(arr._dataset) + datasets.append(arr._to_temp_dataset()) ds = _dataset_concat(datasets, dim, data_vars, coords, compat, positions) - return DataArray._new_from_dataset_no_copy(ds, name) + return 
arrays[0]._from_temp_dataset(ds, name) def _auto_concat(datasets, dim=None): diff --git a/xray/core/common.py b/xray/core/common.py index 6d031c1d577..d2067fdfa85 100644 --- a/xray/core/common.py +++ b/xray/core/common.py @@ -186,7 +186,7 @@ def assign_coords(self, **kwargs): Dataset.assign """ data = self.copy(deep=False) - results = data._calc_assign_results(kwargs) + results = self._calc_assign_results(kwargs) data.coords.update(results) return data @@ -333,7 +333,7 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None, RESAMPLE_DIM = '__resample_dim__' if isinstance(dim, basestring): dim = self[dim] - group = DataArray(dim, name=RESAMPLE_DIM) + group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM) time_grouper = pd.TimeGrouper(freq=freq, how=how, closed=closed, label=label, base=base) gb = self.groupby_cls(self, group, grouper=time_grouper) diff --git a/xray/core/coordinates.py b/xray/core/coordinates.py index f7906ac78c7..3436b3e8b61 100644 --- a/xray/core/coordinates.py +++ b/xray/core/coordinates.py @@ -2,8 +2,9 @@ from contextlib import contextmanager import pandas as pd -from .pycompat import iteritems, basestring, OrderedDict from . 
import formatting +from .merge import merge_dataarray_coords +from .pycompat import iteritems, basestring, OrderedDict def _coord_merge_finalize(target, other, target_conflicts, other_conflicts, @@ -37,16 +38,12 @@ def _dim_shape(var): class AbstractCoordinates(Mapping): - @property - def _names(self): - return self._dataset._coord_names - def __getitem__(self, key): if (key in self._names or (isinstance(key, basestring) and key.split('.')[0] in self._names)): # allow indexing current coordinates or components - return self._dataset[key] + return self._data[key] else: raise KeyError(key) @@ -55,7 +52,7 @@ def __setitem__(self, key, value): def __iter__(self): # needs to be in the same order as the dataset variables - for k in self._dataset._variables: + for k in self._variables: if k in self._names: yield k @@ -65,30 +62,19 @@ def __len__(self): def __contains__(self, key): return key in self._names - def __delitem__(self, key): - if key in self: - del self._dataset[key] - else: - raise KeyError(key) - def __repr__(self): return formatting.coords_repr(self) @property def dims(self): - return self._dataset.dims - - def to_dataset(self): - """Convert these coordinates into a new Dataset - """ - return self._dataset._copy_listed(self._names) + return self._data.dims def to_index(self, ordered_dims=None): """Convert all index coordinates into a :py:class:`pandas.MultiIndex` """ if ordered_dims is None: ordered_dims = self.dims - indexes = [self._dataset._variables[k].to_index() for k in ordered_dims] + indexes = [self._variables[k].to_index() for k in ordered_dims] return pd.MultiIndex.from_product(indexes, names=list(ordered_dims)) def _merge_validate(self, other): @@ -100,7 +86,7 @@ def _merge_validate(self, other): promote_dims = {} for k in self: if k in other: - self_var = self._dataset._variables[k] + self_var = self._variables[k] other_var = other[k].variable if not self_var.broadcast_equals(other_var): if k in self.dims and k in other.dims: @@ -165,12 +151,31 
@@ class DatasetCoordinates(AbstractCoordinates): objects. """ def __init__(self, dataset): - self._dataset = dataset + self._data = dataset + + @property + def _names(self): + return self._data._coord_names + + @property + def _variables(self): + return self._data._variables + + def to_dataset(self): + """Convert these coordinates into a new Dataset + """ + return self._data._copy_listed(self._names) def update(self, other): - self._dataset.update(other) + self._data.update(other) self._names.update(other.keys()) + def __delitem__(self, key): + if key in self: + del self._data[key] + else: + raise KeyError(key) + class DataArrayCoordinates(AbstractCoordinates): """Dictionary like container for DataArray coordinates. @@ -180,20 +185,38 @@ class DataArrayCoordinates(AbstractCoordinates): objects. """ def __init__(self, dataarray): - self._dataarray = dataarray - self._dataset = dataarray._dataset + self._data = dataarray - def update(self, other): - with self._dataarray._set_new_dataset() as ds: - ds.coords.update(other) - bad_dims = [d for d in ds.dims if d not in self.dims] - if bad_dims: - raise ValueError('DataArray does not include all coordinate ' - 'dimensions: %s' % bad_dims) + @property + def _names(self): + return set(self._data._coords) @property - def dims(self): - return self._dataarray.dims + def _variables(self): + return self._data._coords + + def _to_dataset(self, shallow_copy=True): + from .dataset import Dataset + coords = OrderedDict((k, v.copy(deep=False) if shallow_copy else v) + for k, v in self._data._coords.items()) + dims = dict(zip(self.dims, self._data.shape)) + return Dataset._construct_direct(coords, coord_names=set(self._names), + dims=dims, attrs=None) + + def to_dataset(self): + return self._to_dataset() + + def update(self, other): + new_vars = merge_dataarray_coords( + self._data.indexes, self._data._coords, other) + + self._data._coords = new_vars + + def __delitem__(self, key): + if key in self.dims: + raise ValueError('cannot 
delete a coordinate corresponding to a ' + 'DataArray dimension') + del self._data._coords[key] class Indexes(Mapping): diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py index c905f886917..7c7584d7338 100644 --- a/xray/core/dataarray.py +++ b/xray/core/dataarray.py @@ -11,13 +11,13 @@ from . import groupby from . import ops from . import utils -from . import variable from .alignment import align -from .common import AbstractArray, BaseDataObject +from .common import AbstractArray, BaseDataObject, squeeze from .coordinates import DataArrayCoordinates, Indexes from .dataset import Dataset from .pycompat import iteritems, basestring, OrderedDict, zip -from .variable import as_variable, _as_compatible_data, Coordinate +from .variable import (as_variable, Variable, as_compatible_data, Coordinate, + default_index_coordinate) from .formatting import format_item @@ -31,7 +31,7 @@ def _infer_coords_and_dims(shape, coords, dims): 'data' % (len(coords), len(shape))) if isinstance(dims, basestring): - dims = [dims] + dims = (dims,) if dims is None: dims = ['dim_%s' % n for n in range(len(shape))] @@ -41,24 +41,36 @@ def _infer_coords_and_dims(shape, coords, dims): dims = list(coords.keys()) else: for n, (dim, coord) in enumerate(zip(dims, coords)): - if getattr(coord, 'name', None) is None: - coord = as_variable(coord, key=dim).to_coord() + coord = as_variable(coord, key=dim).to_coord() dims[n] = coord.name + dims = tuple(dims) else: for d in dims: if not isinstance(d, basestring): raise TypeError('dimension %s is not a string' % d) - if coords is not None and not utils.is_dict_like(coords): - # ensure coordinates have the right dimensions - coords = [Coordinate(dim, coord, getattr(coord, 'attrs', {})) - for dim, coord in zip(dims, coords)] - if coords is None: - coords = {} - elif not utils.is_dict_like(coords): - coords = OrderedDict(zip(dims, coords)) + new_coords = OrderedDict() - return coords, dims + if utils.is_dict_like(coords): + for k, v in coords.items(): 
+ new_coords[k] = as_variable(v, key=k, copy=True) + elif coords is not None: + for dim, coord in zip(dims, coords): + var = as_variable(coord, key=dim, copy=True) + var.dims = (dim,) + new_coords[dim] = var + + for dim, size in zip(dims, shape): + if dim not in new_coords: + new_coords[dim] = default_index_coordinate(dim, size) + + for k, v in new_coords.items(): + if any(d not in dims for d in v.dims): + raise ValueError('coordinate %s has dimensions %s, but these ' + 'are not a subset of the DataArray ' + 'dimensions %s' % (k, v.dims, dims)) + + return new_coords, dims class _LocIndexer(object): @@ -86,6 +98,14 @@ def __setitem__(self, key, value): self.data_array[self._remap_key(key)] = value +class _ThisArray(object): + """An instance of this object is used as the key corresponding to the + variable when converting arbitrary DataArray objects to datasets + """ + def __repr__(self): + return '' + + class DataArray(AbstractArray, BaseDataObject): """N-dimensional array with labeled coordinates and dimensions. @@ -127,7 +147,7 @@ class DataArray(AbstractArray, BaseDataObject): groupby_cls = groupby.DataArrayGroupBy def __init__(self, data, coords=None, dims=None, name=None, - attrs=None, encoding=None): + attrs=None, encoding=None, fastpath=False): """ Parameters ---------- @@ -159,67 +179,71 @@ def __init__(self, data, coords=None, dims=None, name=None, 'units' and 'calendar' (the later two only for datetime arrays). Unrecognized keys are ignored. 
""" - # try to fill in arguments from data if they weren't supplied - if coords is None: - coords = getattr(data, 'coords', None) - if isinstance(data, pd.Series): - coords = [data.index] - elif isinstance(data, pd.DataFrame): - coords = [data.index, data.columns] - elif isinstance(data, (pd.Index, variable.Coordinate)): - coords = [data] - elif isinstance(data, pd.Panel): - coords = [data.items, data.major_axis, data.minor_axis] - if dims is None: - dims = getattr(data, 'dims', getattr(coords, 'dims', None)) - if name is None: - name = getattr(data, 'name', None) - if attrs is None: - attrs = getattr(data, 'attrs', None) - if encoding is None: - encoding = getattr(data, 'encoding', None) - - data = _as_compatible_data(data) - coords, dims = _infer_coords_and_dims(data.shape, coords, dims) - dataset = Dataset(coords=coords) - # insert data afterwards in case of redundant coords/data - dataset[name] = (dims, data, attrs, encoding) - - for k, v in iteritems(dataset.coords): - if any(d not in dims for d in v.dims): - raise ValueError('coordinate %s has dimensions %s, but these ' - 'are not a subset of the DataArray ' - 'dimensions %s' % (k, v.dims, dims)) - - # these fully describe a DataArray - self._dataset = dataset + if fastpath: + variable = data + assert dims is None + assert attrs is None + assert encoding is None + else: + # try to fill in arguments from data if they weren't supplied + if coords is None: + coords = getattr(data, 'coords', None) + if isinstance(data, pd.Series): + coords = [data.index] + elif isinstance(data, pd.DataFrame): + coords = [data.index, data.columns] + elif isinstance(data, (pd.Index, Coordinate)): + coords = [data] + elif isinstance(data, pd.Panel): + coords = [data.items, data.major_axis, data.minor_axis] + if dims is None: + dims = getattr(data, 'dims', getattr(coords, 'dims', None)) + if name is None: + name = getattr(data, 'name', None) + if attrs is None: + attrs = getattr(data, 'attrs', None) + if encoding is None: + encoding 
= getattr(data, 'encoding', None) + + data = as_compatible_data(data) + coords, dims = _infer_coords_and_dims(data.shape, coords, dims) + variable = Variable(dims, data, attrs, encoding, fastpath=True) + + # These fully describe a DataArray + self._variable = variable + self._coords = coords self._name = name - @classmethod - def _new_from_dataset(cls, original_dataset, name): - """Private constructor for the benefit of Dataset.__getitem__ (skips - all validation) - """ - dataset = original_dataset._copy_listed([name], keep_attrs=False) - if name not in dataset: - # handle virtual variables - try: - _, name = name.split('.', 1) - except Exception: - raise KeyError(name) - if name not in dataset._dims: - dataset._coord_names.discard(name) - return cls._new_from_dataset_no_copy(dataset, name) + __default = object() - @classmethod - def _new_from_dataset_no_copy(cls, dataset, name): - obj = object.__new__(cls) - obj._dataset = dataset - obj._name = name - return obj + def _replace(self, variable=None, coords=None, name=__default): + if variable is None: + variable = self.variable + if coords is None: + coords = self._coords + if name is self.__default: + name = self.name + return type(self)(variable, coords, name=name, fastpath=True) + + def _replace_maybe_drop_dims(self, variable, name=__default): + if variable.dims == self.dims: + coords = None + else: + allowed_dims = set(variable.dims) + coords = OrderedDict((k, v) for k, v in self._coords.items() + if set(v.dims) <= allowed_dims) + return self._replace(variable, coords, name) + + __this_array = _ThisArray() - def _with_replaced_dataset(self, dataset): - return self._new_from_dataset_no_copy(dataset, self.name) + def _to_temp_dataset(self): + return self._to_dataset_whole(name=self.__this_array, + shallow_copy=False) + + def _from_temp_dataset(self, dataset, name=__default): + variable = dataset._variables.pop(self.__this_array) + coords = dataset._variables + return self._replace(variable, coords, name) def 
_to_dataset_split(self, dim): def subset(dim, label): @@ -233,11 +257,18 @@ def subset(dim, label): del coords[dim] return Dataset(variables, coords, self.attrs) - def _to_dataset_whole(self, name): + def _to_dataset_whole(self, name=None, shallow_copy=True): if name is None: - return self._dataset.copy() - else: - return self.rename(name)._dataset + name = self.name + if name is None: + raise ValueError('unable to convert unnamed DataArray to a ' + 'Dataset without providing an explicit name') + if name in self.coords: + raise ValueError('cannot create a Dataset from a DataArray with ' + 'the same name as one of its coordinates') + dataset = self.coords._to_dataset(shallow_copy=shallow_copy) + dataset[name] = self.variable + return dataset def to_dataset(self, dim=None, name=None): """Convert a DataArray to a Dataset. @@ -277,24 +308,13 @@ def name(self): """ return self._name - @contextlib.contextmanager - def _set_new_dataset(self): - """Context manager to use for modifying _dataset, in a manner that - can be safely rolled back if an error is encountered. 
- """ - ds = self._dataset.copy(deep=False) - yield ds - self._dataset = ds - @name.setter def name(self, value): - with self._set_new_dataset() as ds: - ds.rename({self.name: value}, inplace=True) self._name = value @property def variable(self): - return self._dataset._variables[self.name] + return self._variable @property def dtype(self): @@ -366,7 +386,14 @@ def _item_key_to_dict(self, key): def __getitem__(self, key): if isinstance(key, basestring): - return self.coords[key] + from .dataset import _get_virtual_variable + + try: + var = self._coords[key] + except KeyError: + _, key, var = _get_virtual_variable(self._coords, key) + + return self._replace_maybe_drop_dims(var, name=key) else: # orthogonal array indexing return self.isel(**self._item_key_to_dict(key)) @@ -379,7 +406,7 @@ def __setitem__(self, key, value): self.variable[key] = value def __delitem__(self, key): - del self._dataset[key] + del self.coords[key] @property def _attr_sources(self): @@ -387,7 +414,7 @@ def _attr_sources(self): return [self.coords, self.attrs] def __contains__(self, key): - return key in self._dataset + return key in self._coords @property def loc(self): @@ -449,10 +476,19 @@ def reset_coords(self, names=None, drop=False, inplace=False): raise ValueError('cannot reset coordinates in-place on a ' 'DataArray without ``drop == True``') if names is None: - names = (self._dataset._coord_names - set(self.dims) - - set([self.name])) - ds = self._dataset.reset_coords(names, drop, inplace) - return ds[self.name] if drop else ds + names = set(self.coords) - set(self.dims) + dataset = self.coords.to_dataset().reset_coords(names, drop) + if drop: + if inplace: + self._coords = dataset._variables + else: + return self._replace(coords=dataset._variables) + else: + if self.name is None: + raise ValueError('cannot reset_coords with drop=False ' + 'on an unnamed DataArrray') + dataset[self.name] = self.variable + return dataset def load(self): """Manually trigger loading of this array's data 
from disk or a @@ -463,7 +499,10 @@ def load(self): load data automatically. However, this method can be necessary when working with many file objects on disk. """ - self._dataset.load() + ds = self._to_temp_dataset().load() + new = self._from_temp_dataset(ds) + self._variable = new._variable + self._coords = new._coords return self def load_data(self): # pragma: no cover @@ -479,8 +518,10 @@ def copy(self, deep=True): dataset. Otherwise, a shallow copy is made, so each variable in the new array's dataset is also a variable in this array's dataset. """ - ds = self._dataset.copy(deep=deep) - return self._with_replaced_dataset(ds) + variable = self.variable.copy(deep=deep) + coords = OrderedDict((k, v.copy(deep=deep)) + for k, v in self._coords.items()) + return self._replace(variable, coords) def __copy__(self): return self.copy(deep=False) @@ -524,8 +565,8 @@ def chunk(self, chunks=None): if isinstance(chunks, (list, tuple)): chunks = dict(zip(self.dims, chunks)) - ds = self._dataset.chunk(chunks) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().chunk(chunks) + return self._from_temp_dataset(ds) def isel(self, **indexers): """Return a new DataArray whose dataset is given by integer indexing @@ -536,8 +577,8 @@ def isel(self, **indexers): Dataset.isel DataArray.sel """ - ds = self._dataset.isel(**indexers) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().isel(**indexers) + return self._from_temp_dataset(ds) def sel(self, method=None, tolerance=None, **indexers): """Return a new DataArray whose dataset is given by selecting @@ -559,8 +600,8 @@ def isel_points(self, dim='points', **indexers): -------- Dataset.isel_points """ - ds = self._dataset.isel_points(dim=dim, **indexers) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().isel_points(dim=dim, **indexers) + return self._from_temp_dataset(ds) def sel_points(self, dim='points', method=None, tolerance=None, **indexers): @@ -571,9 +612,9 @@ def 
sel_points(self, dim='points', method=None, tolerance=None, -------- Dataset.sel_points """ - ds = self._dataset.sel_points(dim=dim, method=method, - tolerance=tolerance, **indexers) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().sel_points( + dim=dim, method=method, tolerance=tolerance, **indexers) + return self._from_temp_dataset(ds) def reindex_like(self, other, method=None, tolerance=None, copy=True): """Conform this object onto the indexes of another object, filling @@ -660,9 +701,9 @@ def reindex(self, method=None, tolerance=None, copy=True, **indexers): DataArray.reindex_like align """ - ds = self._dataset.reindex(method=method, tolerance=tolerance, - copy=copy, **indexers) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().reindex( + method=method, tolerance=tolerance, copy=copy, **indexers) + return self._from_temp_dataset(ds) def rename(self, new_name_or_name_dict): """Returns a new DataArray with renamed coordinates and/or a new name. @@ -686,13 +727,12 @@ def rename(self, new_name_or_name_dict): DataArray.swap_dims """ if utils.is_dict_like(new_name_or_name_dict): - name_dict = new_name_or_name_dict - new_name = name_dict.get(self.name, self.name) + name_dict = new_name_or_name_dict.copy() + name = name_dict.pop(self.name, self.name) + dataset = self._to_temp_dataset().rename(name_dict) + return self._from_temp_dataset(dataset, name) else: - new_name = new_name_or_name_dict - name_dict = {self.name: new_name} - renamed_dataset = self._dataset.rename(name_dict) - return renamed_dataset[new_name] + return self._replace(name=new_name_or_name_dict) def swap_dims(self, dims_dict): """Returns a new DataArray with swapped dimensions. 
@@ -717,8 +757,8 @@ def swap_dims(self, dims_dict): DataArray.rename Dataset.swap_dims """ - ds = self._dataset.swap_dims(dims_dict) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().swap_dims(dims_dict) + return self._from_temp_dataset(ds) def transpose(self, *dims): """Return a new DataArray object with transposed dimensions. @@ -744,9 +784,8 @@ def transpose(self, *dims): numpy.transpose Dataset.transpose """ - ds = self._dataset.copy() - ds[self.name] = self.variable.transpose(*dims) - return self._with_replaced_dataset(ds) + variable = self.variable.transpose(*dims) + return self._replace(variable) def squeeze(self, dim=None): """Return a new DataArray object with squeezed data. @@ -773,8 +812,7 @@ def squeeze(self, dim=None): -------- numpy.squeeze """ - ds = self._dataset.squeeze(dim) - return self._with_replaced_dataset(ds) + return squeeze(self, dict(zip(self.dims, self.shape)), dim) def drop(self, labels, dim=None): """Drop coordinates or index labels from this DataArray. @@ -793,10 +831,8 @@ def drop(self, labels, dim=None): """ if utils.is_scalar(labels): labels = [labels] - if dim is None and self.name in labels: - raise ValueError('cannot drop this DataArray from itself') - ds = self._dataset.drop(labels, dim) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().drop(labels, dim) + return self._from_temp_dataset(ds) def dropna(self, dim, how='any', thresh=None): """Returns a new array with dropped labels for missing values along @@ -817,8 +853,8 @@ def dropna(self, dim, how='any', thresh=None): ------- DataArray """ - ds = self._dataset.dropna(dim, how=how, thresh=thresh) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().dropna(dim, how=how, thresh=thresh) + return self._from_temp_dataset(ds) def fillna(self, value): """Fill missing values in this object. 
@@ -874,9 +910,7 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): summarized data and the indicated dimension(s) removed. """ var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs) - ds = self._dataset.drop(set(self.dims) - set(var.dims)) - ds[self.name] = var - return self._with_replaced_dataset(ds) + return self._replace_maybe_drop_dims(var) def to_pandas(self): """Convert this array into a pandas object with the same shape. @@ -905,7 +939,7 @@ def to_pandas(self): 'pandas objects' % self.ndim) return constructor(self.values, *self.indexes.values()) - def to_dataframe(self): + def to_dataframe(self, name=None): """Convert this array and its coordinates into a tidy pandas.DataFrame. The DataFrame is indexed by the Cartesian product of index coordinates @@ -913,9 +947,24 @@ def to_dataframe(self): Other coordinates are included as columns in the DataFrame. """ - # TODO: add a 'name' parameter + if name is None: + name = self.name + if name is None: + raise ValueError('cannot convert an unnamed DataArray to a ' + 'DataFrame: use the ``name`` parameter') + dims = OrderedDict(zip(self.dims, self.shape)) - return self._dataset._to_dataframe(dims) + # By using a unique name, we can convert a DataArray into a DataFrame + # even if it shares a name with one of its coordinates. + # I would normally use unique_name = object() but that results in a + # dataframe with columns in the wrong order, for reasons I have not + # been able to debug (possibly a pandas bug?). + unique_name = '__unique_name_identifier_z98xfz98xugfg73ho__' + ds = self._to_dataset_whole(name=unique_name) + df = ds._to_dataframe(dims) + df.columns = [name if c == unique_name else c + for c in df.columns] + return df def to_series(self): """Convert this array into a pandas.Series. @@ -953,9 +1002,10 @@ def from_series(cls, series): method. 
""" # TODO: add a 'name' parameter - df = pd.DataFrame({series.name: series}) + name = series.name + df = pd.DataFrame({name: series}) ds = Dataset.from_dataframe(df) - return cls._new_from_dataset_no_copy(ds, series.name) + return ds[name] def to_cdms2(self): """Convert this array into a cdms2.Variable @@ -1030,36 +1080,17 @@ def identical(self, other): __default_name = object() def _result_name(self, other=None): - - if self.name in self.dims: - # these names match dimension, so if we preserve them we will also - # rename indexes - return None - - if other is None: - # shortcut - return self.name - - other_name = getattr(other, 'name', self.__default_name) - other_dims = getattr(other, 'dims', ()) - - if other_name in other_dims: - # same trouble as above - return None - # use the same naming heuristics as pandas: # https://github.com/ContinuumIO/blaze/issues/458#issuecomment-51936356 + other_name = getattr(other, 'name', self.__default_name) if other_name is self.__default_name or other_name == self.name: return self.name - - return None + else: + return None def __array_wrap__(self, obj, context=None): new_var = self.variable.__array_wrap__(obj, context) - ds = self.coords.to_dataset() - name = self._result_name() - ds[name] = new_var - return self._new_from_dataset_no_copy(ds, name) + return self._replace(new_var) @staticmethod def _unary_op(f): @@ -1081,15 +1112,16 @@ def func(self, other): if empty_indexes: raise ValueError('no overlapping labels for some ' 'dimensions: %s' % empty_indexes) - other_coords = getattr(other, 'coords', None) other_variable = getattr(other, 'variable', other) - ds = self.coords.merge(other_coords) - name = self._result_name(other) - ds[name] = (f(self.variable, other_variable) + other_coords = getattr(other, 'coords', None) + + variable = (f(self.variable, other_variable) if not reflexive else f(other_variable, self.variable)) - result = self._new_from_dataset_no_copy(ds, name) - return result + coords = 
self.coords.merge(other_coords)._variables + name = self._result_name(other) + + return self._replace(variable, coords, name) return func @staticmethod @@ -1108,7 +1140,7 @@ def func(self, other): @property def plot(self): - ''' + """ Access plotting functions >>> d = DataArray([[1, 2], [3, 4]]) @@ -1120,11 +1152,11 @@ def plot(self): DataArray methods >>> d.plot.imshow() # equivalent to xray.plot.imshow(d) - ''' + """ return _PlotMethods(self) def _title_for_slice(self, truncate=50): - ''' + """ If the dataarray has 1 dimensional coordinates or comes from a slice we can show that info in the title @@ -1138,7 +1170,7 @@ def _title_for_slice(self, truncate=50): title : string Can be used for plot titles - ''' + """ one_dims = [] for dim, coord in iteritems(self.coords): if coord.size == 1: @@ -1186,8 +1218,8 @@ def diff(self, dim, n=1, label='upper'): * x (x) int64 3 4 """ - ds = self._dataset.diff(n=n, dim=dim, label=label) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().diff(n=n, dim=dim, label=label) + return self._from_temp_dataset(ds) def shift(self, **shifts): """Shift this array by an offset along one or more dimensions. @@ -1223,9 +1255,8 @@ def shift(self, **shifts): Coordinates: * x (x) int64 0 1 2 """ - ds = self._dataset.copy() - ds[self.name] = self.variable.shift(**shifts) - return self._with_replaced_dataset(ds) + variable = self.variable.shift(**shifts) + return self._replace(variable) def roll(self, **shifts): """Roll this array by an offset along one or more dimensions. 
@@ -1258,16 +1289,16 @@ def roll(self, **shifts): Coordinates: * x (x) int64 2 0 1 """ - ds = self._dataset.roll(**shifts) - return self._with_replaced_dataset(ds) + ds = self._to_temp_dataset().roll(**shifts) + return self._from_temp_dataset(ds) @property def real(self): - return self._with_replaced_dataset(self._dataset.real) + return self._replace(self.variable.real) @property def imag(self): - return self._with_replaced_dataset(self._dataset.imag) + return self._replace(self.variable.imag) # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xray/core/dataset.py b/xray/core/dataset.py index 9e873358ca8..964b298c1be 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -14,11 +14,13 @@ from . import alignment from . import formatting from .. import conventions -from .alignment import align, partial_align +from .alignment import align, align_variables from .coordinates import DatasetCoordinates, Indexes from .common import ImplementsDatasetReduce, BaseDataObject +from .merge import merge_datasets, expand_variables from .utils import Frozen, SortedKeysDict, ChainMap, maybe_wrap_array, hashable -from .variable import as_variable, Variable, Coordinate, broadcast_variables +from .variable import (as_variable, Variable, Coordinate, broadcast_variables, + default_index_coordinate) from .pycompat import (iteritems, basestring, OrderedDict, dask_array_type) from .combine import concat @@ -61,92 +63,6 @@ def _get_virtual_variable(variables, key): return ref_name, var_name, Variable(ref_var.dims, data) -def _as_dataset_variable(name, var): - """Prepare a variable for adding it to a Dataset - """ - try: - var = as_variable(var, key=name) - except TypeError: - raise TypeError('Dataset variables must be an array or a tuple of ' - 'the form (dims, data[, attrs, encoding])') - if name in var.dims: - # convert the into an Index - if var.ndim != 1: - raise ValueError('the 
variable %r has the same name as one of its ' - 'dimensions %r, but it is not 1-dimensional and ' - 'thus it is not a valid index' % (name, var.dims)) - var = var.to_coord() - return var - - -def _align_variables(variables, join='outer'): - """Align all DataArrays in the provided dict, leaving other values alone. - """ - alignable = [k for k, v in variables.items() if hasattr(v, 'indexes')] - aligned = align(*[variables[a] for a in alignable], - join=join, copy=False) - new_variables = OrderedDict(variables) - new_variables.update(zip(alignable, aligned)) - return new_variables - - -def _expand_variables(raw_variables, old_variables=None, compat='identical'): - """Expand a dictionary of variables. - - Returns a dictionary of Variable objects suitable for inserting into a - Dataset._variables dictionary. - - This includes converting tuples (dims, data) into Variable objects, - converting coordinate variables into Coordinate objects and expanding - DataArray objects into Variables plus coordinates. - - Raises ValueError if any conflicting values are found, between any of the - new or old variables. 
- """ - if old_variables is None: - old_variables = {} - new_variables = OrderedDict() - new_coord_names = set() - variables = ChainMap(new_variables, old_variables) - - def maybe_promote_or_replace(name, var): - existing_var = variables[name] - if name not in existing_var.dims: - if name in var.dims: - variables[name] = var - else: - common_dims = OrderedDict(zip(existing_var.dims, - existing_var.shape)) - common_dims.update(zip(var.dims, var.shape)) - variables[name] = existing_var.expand_dims(common_dims) - new_coord_names.update(var.dims) - - def add_variable(name, var): - var = _as_dataset_variable(name, var) - if name not in variables: - variables[name] = var - new_coord_names.update(variables[name].dims) - else: - if not getattr(variables[name], compat)(var): - raise ValueError('conflicting value for variable %s:\n' - 'first value: %r\nsecond value: %r' - % (name, variables[name], var)) - if compat == 'broadcast_equals': - maybe_promote_or_replace(name, var) - - for name, var in iteritems(raw_variables): - if hasattr(var, 'coords'): - # it's a DataArray - new_coord_names.update(var.coords) - for dim, coord in iteritems(var.coords): - if dim != name: - add_variable(dim, coord.variable) - var = var.variable - add_variable(name, var) - - return new_variables, new_coord_names - - def _calculate_dims(variables): """Calculate the dimensions corresponding to a set of variables. 
@@ -171,40 +87,6 @@ def _calculate_dims(variables): return dims -def _merge_expand(aligned_self, other, overwrite_vars, compat): - possible_conflicts = dict((k, v) for k, v in aligned_self._variables.items() - if k not in overwrite_vars) - new_vars, new_coord_names = _expand_variables(other, possible_conflicts, compat) - replace_vars = aligned_self._variables.copy() - replace_vars.update(new_vars) - return replace_vars, new_vars, new_coord_names - - -def _merge_dataset(self, other, overwrite_vars, compat, join): - aligned_self, other = partial_align(self, other, join=join, copy=False) - - replace_vars, new_vars, new_coord_names = _merge_expand( - aligned_self, other._variables, overwrite_vars, compat) - new_coord_names.update(other._coord_names) - - return replace_vars, new_vars, new_coord_names - - -def _merge_dict(self, other, overwrite_vars, compat, join): - other = _align_variables(other, join='outer') - - alignable = [k for k, v in other.items() if hasattr(v, 'indexes')] - aligned = partial_align(self, *[other[a] for a in alignable], - join=join, copy=False, exclude=overwrite_vars) - - aligned_self = aligned[0] - - other = OrderedDict(other) - other.update(zip(alignable, aligned[1:])) - - return _merge_expand(aligned_self, other, overwrite_vars, compat) - - def _assert_empty(args, msg='%s'): if args: raise ValueError(msg % args) @@ -213,16 +95,17 @@ def _assert_empty(args, msg='%s'): def as_dataset(obj): """Cast the given object to a Dataset. - Handles DataArrays, Datasets and dictionaries of variables. A new Dataset - object is only created in the last case. + Handles Datasets, DataArrays and dictionaries of variables. A new Dataset + object is only created if the provided object is not already one. 
""" - obj = getattr(obj, '_dataset', obj) + if hasattr(obj, 'to_dataset'): + obj = obj.to_dataset() if not isinstance(obj, Dataset): obj = Dataset(obj) return obj -class Variables(Mapping): +class DataVariables(Mapping): def __init__(self, dataset): self._dataset = dataset @@ -332,11 +215,7 @@ def _add_missing_coords_inplace(self): """ for dim, size in iteritems(self.dims): if dim not in self._variables: - # This is equivalent to np.arange(size), but - # waits to create the array until its actually accessed. - data = indexing.LazyIntegerRange(size) - coord = Coordinate(dim, data) - self._variables[dim] = coord + self._variables[dim] = default_index_coordinate(dim, size) def _update_vars_and_coords(self, new_variables, new_coord_names=None, needs_copy=True, check_coord_names=True): @@ -375,9 +254,9 @@ def _set_init_vars_and_dims(self, vars, coords, compat): 'redundant variables and coordinates: %s') variables = ChainMap(vars, coords) - aligned = _align_variables(variables) - new_variables, new_coord_names = _expand_variables(aligned, - compat=compat) + aligned = align_variables(variables) + new_variables, new_coord_names = expand_variables(aligned, + compat=compat) new_coord_names.update(coords) self._update_vars_and_coords(new_variables, new_coord_names, @@ -549,7 +428,19 @@ def copy(self, deep=False): return self._construct_direct(variables, self._coord_names.copy(), self._dims.copy(), self._attrs_copy()) - def _copy_listed(self, names, keep_attrs=True): + def _subset_with_all_valid_coords(self, variables, coord_names, attrs): + needed_dims = set() + for v in variables.values(): + needed_dims.update(v.dims) + for k in self._coord_names: + if set(self.variables[k].dims) <= needed_dims: + variables[k] = self._variables[k] + coord_names.add(k) + dims = dict((k, self._dims[k]) for k in needed_dims) + + return self._construct_direct(variables, coord_names, dims, attrs) + + def _copy_listed(self, names): """Create a new Dataset with the listed variables from this 
dataset and the all relevant coordinates. Skips all validation. """ @@ -566,19 +457,26 @@ def _copy_listed(self, names, keep_attrs=True): if ref_name in self._coord_names: coord_names.add(var_name) - needed_dims = set() - for v in variables.values(): - needed_dims.update(v._dims) - for k in self._coord_names: - if set(self._variables[k]._dims) <= needed_dims: - variables[k] = self._variables[k] - coord_names.add(k) + return self._subset_with_all_valid_coords(variables, coord_names, + attrs=self.attrs.copy()) - dims = dict((k, self._dims[k]) for k in needed_dims) + def _construct_dataarray(self, name): + """Construct a DataArray by indexing this dataset + """ + from .dataarray import DataArray - attrs = self.attrs.copy() if keep_attrs else None + try: + variable = self._variables[name] + except KeyError: + _, name, variable = _get_virtual_variable(self._variables, name) - return self._construct_direct(variables, coord_names, dims, attrs) + coords = OrderedDict() + needed_dims = set(variable.dims) + for k in self.coords: + if set(self.variables[k].dims) <= needed_dims: + coords[k] = self.variables[k] + + return DataArray(variable, coords, name=name, fastpath=True) def __copy__(self): return self.copy(deep=False) @@ -617,13 +515,11 @@ def __getitem__(self, key): Indexing with a list of names will return a new ``Dataset`` object. 
""" - from .dataarray import DataArray - if utils.is_dict_like(key): return self.isel(**key) if hashable(key): - return DataArray._new_from_dataset(self, key) + return self._construct_dataarray(key) else: return self._copy_listed(np.asarray(key)) @@ -745,7 +641,7 @@ def coords(self): def data_vars(self): """Dictionary of xray.DataArray objects corresponding to data variables """ - return Variables(self) + return DataVariables(self) @property def vars(self): # pragma: no cover @@ -1334,10 +1230,13 @@ def rename(self, name_dict, inplace=False): Dataset.swap_dims DataArray.rename """ - for k in name_dict: + for k, v in name_dict.items(): if k not in self: raise ValueError("cannot rename %r because it is not a " "variable in this dataset" % k) + if v in self: + raise ValueError('the new name %r already exists' % v) + variables = OrderedDict() coord_names = set() for k, v in iteritems(self._variables): @@ -1472,27 +1371,8 @@ def merge(self, other, inplace=False, overwrite_vars=set(), ValueError If any variables conflict (see ``compat``). 
""" - if compat not in ['broadcast_equals', 'equals', 'identical']: - raise ValueError("compat=%r invalid: must be 'broadcast_equals', " - "'equals' or 'identical'" % compat) - - if isinstance(overwrite_vars, basestring): - overwrite_vars = [overwrite_vars] - overwrite_vars = set(overwrite_vars) - - merge = _merge_dataset if isinstance(other, Dataset) else _merge_dict - - replace_vars, new_vars, new_coord_names = merge( + replace_vars, new_coord_names = merge_datasets( self, other, overwrite_vars, compat=compat, join=join) - - newly_coords = new_coord_names & (set(self) - set(self.coords)) - no_longer_coords = set(self.coords) & (set(new_vars) - new_coord_names) - ambiguous_coords = (newly_coords | no_longer_coords) - overwrite_vars - if ambiguous_coords: - raise ValueError('cannot merge: the following variables are ' - 'coordinates on one dataset but not the other: %s' - % list(ambiguous_coords)) - obj = self if inplace else self.copy() obj._update_vars_and_coords(replace_vars, new_coord_names) return obj diff --git a/xray/core/groupby.py b/xray/core/groupby.py index e2a29b0b823..9c53a2791b9 100644 --- a/xray/core/groupby.py +++ b/xray/core/groupby.py @@ -9,7 +9,7 @@ ) from .pycompat import zip from .utils import peek_at, maybe_wrap_array, safe_cast_to_index -from .variable import Variable, Coordinate +from .variable import as_variable, Variable, Coordinate def unique_value_groups(ar): @@ -110,7 +110,10 @@ def __init__(self, obj, group, squeeze=False, grouper=None): raise ValueError("`group` must have a 'dims' attribute") group_dim, = group.dims - expected_size = as_dataset(obj).dims[group_dim] + try: + expected_size = obj.dims[group_dim] + except TypeError: + expected_size = obj.shape[obj.get_axis_num(group_dim)] if group.size != expected_size: raise ValueError('the group variable\'s length does not ' 'match the length of this variable along its ' @@ -312,19 +315,16 @@ def _iter_grouped_shortcut(self): yield var[{self.group_dim: indices}] def 
_concat_shortcut(self, applied, concat_dim, positions): + # nb. don't worry too much about maintaining this method -- it does + # speed things up, but it's not very interpretable and there are much + # faster alternatives (e.g., doing the grouped aggregation in a + # compiled language) stacked = Variable.concat( applied, concat_dim, positions, shortcut=True) stacked.attrs.update(self.obj.attrs) - - name = self.obj.name - ds = self.obj._dataset.drop(name) - ds[concat_dim.name] = concat_dim - # remove extraneous dimensions - for dim in ds.dims: - if dim not in stacked.dims: - del ds[dim] - ds[name] = stacked - return ds[name] + result = self.obj._replace_maybe_drop_dims(stacked) + result._coords[concat_dim.name] = as_variable(concat_dim, copy=True) + return result def _restore_dim_order(self, stacked): def lookup_order(dimension): diff --git a/xray/core/merge.py b/xray/core/merge.py new file mode 100644 index 00000000000..c1830127fbc --- /dev/null +++ b/xray/core/merge.py @@ -0,0 +1,170 @@ +from .alignment import align, partial_align, align_variables +from .utils import ChainMap +from .variable import as_variable +from .pycompat import (basestring, iteritems, OrderedDict) + + +def _as_dataset_variable(name, var): + """Prepare a variable for adding it to a Dataset + """ + try: + var = as_variable(var, key=name) + except TypeError: + raise TypeError('variables must be given by arrays or a tuple of ' + 'the form (dims, data[, attrs, encoding])') + if name in var.dims: + # convert the variable into an Index + if var.ndim != 1: + raise ValueError('the variable %r has the same name as one of its ' + 'dimensions %r, but it is not 1-dimensional and ' + 'thus it is not a valid index' % (name, var.dims)) + var = var.to_coord() + return var + + +def expand_variables(raw_variables, old_variables=None, compat='identical'): + """Expand a dictionary of variables. + + Returns a dictionary of Variable objects suitable for inserting into a + Dataset._variables dictionary.
+ + This includes converting tuples (dims, data) into Variable objects, + converting coordinate variables into Coordinate objects and expanding + DataArray objects into Variables plus coordinates. + + Raises ValueError if any conflicting values are found, between any of the + new or old variables. + """ + if old_variables is None: + old_variables = {} + new_variables = OrderedDict() + new_coord_names = set() + variables = ChainMap(new_variables, old_variables) + + def maybe_promote_or_replace(name, var): + existing_var = variables[name] + if name not in existing_var.dims: + if name in var.dims: + variables[name] = var + else: + common_dims = OrderedDict(zip(existing_var.dims, + existing_var.shape)) + common_dims.update(zip(var.dims, var.shape)) + variables[name] = existing_var.expand_dims(common_dims) + new_coord_names.update(var.dims) + + def add_variable(name, var): + var = _as_dataset_variable(name, var) + if name not in variables: + variables[name] = var + new_coord_names.update(variables[name].dims) + else: + if not getattr(variables[name], compat)(var): + raise ValueError('conflicting value for variable %s:\n' + 'first value: %r\nsecond value: %r' + % (name, variables[name], var)) + if compat == 'broadcast_equals': + maybe_promote_or_replace(name, var) + + for name, var in iteritems(raw_variables): + if hasattr(var, 'coords'): + # it's a DataArray + new_coord_names.update(var.coords) + for dim, coord in iteritems(var.coords): + if dim != name: + add_variable(dim, coord.variable) + var = var.variable + add_variable(name, var) + + return new_variables, new_coord_names + + +def _merge_expand(variables, other, overwrite_vars, compat): + possible_conflicts = dict((k, v) for k, v in variables.items() + if k not in overwrite_vars) + new_vars, new_coord_names = expand_variables(other, possible_conflicts, compat) + replace_vars = variables.copy() + replace_vars.update(new_vars) + return replace_vars, new_vars, new_coord_names + + +def _merge_dataset_with_dataset(self, 
other, overwrite_vars, compat, join): + aligned_self, other = align(self, other, join=join, copy=False) + + replace_vars, new_vars, new_coord_names = _merge_expand( + aligned_self._variables, other._variables, overwrite_vars, compat) + new_coord_names.update(other._coord_names) + + return replace_vars, new_vars, new_coord_names + + +def _merge_dataset_with_dict(self, other, overwrite_vars, compat, join): + other = align_variables(other, join='outer', copy=False) + + alignable = [k for k, v in other.items() if hasattr(v, 'indexes')] + aligned = partial_align(self, *[other[a] for a in alignable], + join=join, copy=False, exclude=overwrite_vars) + + aligned_self = aligned[0] + + other = OrderedDict(other) + other.update(zip(alignable, aligned[1:])) + + return _merge_expand(aligned_self._variables, other, overwrite_vars, compat) + + +def merge_datasets(dataset, other, overwrite_vars=set(), + compat='broadcast_equals', join='outer'): + """ + Guts of Dataset.merge + """ + from .dataset import Dataset + + if compat not in ['broadcast_equals', 'equals', 'identical']: + raise ValueError("compat=%r invalid: must be 'broadcast_equals', " + "'equals' or 'identical'" % compat) + + if isinstance(overwrite_vars, basestring): + overwrite_vars = [overwrite_vars] + overwrite_vars = set(overwrite_vars) + + if isinstance(other, Dataset): + merge_func = _merge_dataset_with_dataset + else: + merge_func = _merge_dataset_with_dict + + replace_vars, new_vars, new_coord_names = merge_func( + dataset, other, overwrite_vars, compat=compat, join=join) + + newly_coords = new_coord_names & set(dataset.data_vars) + no_longer_coords = set(dataset.coords) & (set(new_vars) - new_coord_names) + ambiguous_coords = (newly_coords | no_longer_coords) - overwrite_vars + if ambiguous_coords: + raise ValueError('cannot merge: the following variables are ' + 'coordinates on one dataset but not the other: %s' + % list(ambiguous_coords)) + + return replace_vars, new_coord_names + + +def 
_reindex_variables_against(variables, indexes, copy=False): + """Reindex all DataArrays in the provided dict, leaving other values alone. + """ + alignable = [k for k, v in variables.items() if hasattr(v, 'indexes')] + aligned = [variables[a].reindex(copy=copy, indexes=indexes) + for a in alignable] + new_variables = OrderedDict(variables) + new_variables.update(zip(alignable, aligned)) + return new_variables + + +def merge_dataarray_coords(indexes, variables, other): + """ + Return the new dictionary of coordinate variables given by merging in + ``other`` to these variables. + """ + other = align_variables(other, join='outer', copy=False) + other = _reindex_variables_against(other, indexes, copy=False) + replace_vars, _, __ = _merge_expand( + variables, other, other, compat='broadcast_equals') + return replace_vars diff --git a/xray/core/variable.py b/xray/core/variable.py index 2b742e3eb60..ba54ac591f5 100644 --- a/xray/core/variable.py +++ b/xray/core/variable.py @@ -11,7 +11,8 @@ from . import ops from . import utils from .pycompat import basestring, OrderedDict, zip, dask_array_type -from .indexing import (PandasIndexAdapter, orthogonally_indexable) +from .indexing import (PandasIndexAdapter, orthogonally_indexable, + LazyIntegerRange) import xray # only for Dataset and DataArray @@ -21,7 +22,7 @@ pass -def as_variable(obj, key=None, strict=True): +def as_variable(obj, key=None, strict=True, copy=False): """Convert an object into an Variable - If the object is already an `Variable`, return it. @@ -56,9 +57,20 @@ def as_variable(obj, key=None, strict=True): obj = Variable(key, obj) else: raise TypeError('cannot infer Variable dimensions') + else: + if copy: + obj = obj.copy(deep=False) return obj +def default_index_coordinate(dim, size): + """ + This is equivalent to np.arange(size), but waits to create the array until + it's actually accessed.
+ """ + return Coordinate(dim, LazyIntegerRange(size)) + + def _maybe_wrap_data(data): """ Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure @@ -72,7 +84,7 @@ def _maybe_wrap_data(data): return data -def _as_compatible_data(data, fastpath=False): +def as_compatible_data(data, fastpath=False): """Prepare and wrap data to put in a Variable. - If data does not have the necessary attributes, convert it to ndarray. @@ -197,7 +209,7 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): Well behaviored code to serialize a Variable should ignore unrecognized encoding items. """ - self._data = _as_compatible_data(data, fastpath=fastpath) + self._data = as_compatible_data(data, fastpath=fastpath) self._dims = self._parse_dimensions(dims) self._attrs = None self._encoding = None @@ -231,7 +243,7 @@ def data(self): @data.setter def data(self, data): - data = _as_compatible_data(data) + data = as_compatible_data(data) if data.shape != self.shape: raise ValueError( "replacement data must match the Variable's shape") diff --git a/xray/test/__init__.py b/xray/test/__init__.py index ad244b34fc2..40f71d09a62 100644 --- a/xray/test/__init__.py +++ b/xray/test/__init__.py @@ -195,7 +195,8 @@ def assertDataArrayEqual(self, ar1, ar2): def assertDataArrayIdentical(self, ar1, ar2): self.assertEqual(ar1.name, ar2.name) - self.assertDatasetIdentical(ar1.to_dataset(), ar2.to_dataset()) + self.assertDatasetIdentical(ar1._to_temp_dataset(), + ar2._to_temp_dataset()) def assertDataArrayAllClose(self, ar1, ar2, rtol=1e-05, atol=1e-08): self.assertVariableAllClose(ar1, ar2, rtol=rtol, atol=atol) diff --git a/xray/test/test_backends.py b/xray/test/test_backends.py index 364094c7d3f..033737409f3 100644 --- a/xray/test/test_backends.py +++ b/xray/test/test_backends.py @@ -121,10 +121,10 @@ def assert_loads(vars=None): if vars is None: vars = expected with self.roundtrip(expected) as actual: - for v in actual.values(): + for v in 
actual.variables.values(): self.assertFalse(v._in_memory) yield actual - for k, v in actual.items(): + for k, v in actual.variables.items(): if k in vars: self.assertTrue(v._in_memory) self.assertDatasetAllClose(expected, actual) diff --git a/xray/test/test_combine.py b/xray/test/test_combine.py index d7dc15c18c6..cbe14cc84ed 100644 --- a/xray/test/test_combine.py +++ b/xray/test/test_combine.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from xray import Dataset, DataArray, auto_combine, concat +from xray import Dataset, DataArray, auto_combine, concat, Variable from xray.core.pycompat import iteritems, OrderedDict from . import TestCase, InaccessibleArray, requires_dask @@ -207,6 +207,13 @@ def test_concat_do_not_promote(self): with self.assertRaises(ValueError): concat(objs, 't', coords='minimal') + def test_concat_dim_is_variable(self): + objs = [Dataset({'x': 0}), Dataset({'x': 1})] + coord = Variable('y', [3, 4]) + expected = Dataset({'x': ('y', [0, 1]), 'y': [3, 4]}) + actual = concat(objs, coord) + self.assertDatasetIdentical(actual, expected) + @requires_dask # only for toolz def test_auto_combine(self): objs = [Dataset({'x': [0]}), Dataset({'x': [1]})] diff --git a/xray/test/test_dask.py b/xray/test/test_dask.py index 20cfef2e3e6..99ddab258bf 100644 --- a/xray/test/test_dask.py +++ b/xray/test/test_dask.py @@ -199,7 +199,7 @@ def test_rechunk(self): def test_new_chunk(self): chunked = self.eager_array.chunk() - self.assertTrue(chunked.data.name.startswith('xray-foo-')) + self.assertTrue(chunked.data.name.startswith('xray-')) def test_lazy_dataset(self): lazy_ds = Dataset({'foo': (('x', 'y'), self.data)}) diff --git a/xray/test/test_dataarray.py b/xray/test/test_dataarray.py index e95df6028e9..6f85c350663 100644 --- a/xray/test/test_dataarray.py +++ b/xray/test/test_dataarray.py @@ -72,7 +72,7 @@ def test_name(self): actual = DataArray(Coordinate('x', [3])) actual.name = 'y' - expected = DataArray(Coordinate('y', [3])) + expected = 
DataArray([3], {'x': [3]}, name='y') self.assertDataArrayIdentical(actual, expected) def test_dims(self): @@ -517,13 +517,20 @@ def test_coords(self): actual = repr(da.coords) self.assertEquals(expected, actual) + with self.assertRaisesRegexp(ValueError, 'cannot delete'): + del da['x'] + + with self.assertRaisesRegexp(ValueError, 'cannot delete'): + del da.coords['x'] + def test_coord_coords(self): orig = DataArray([10, 20], {'x': [1, 2], 'x2': ('x', ['a', 'b']), 'z': 4}, dims='x') actual = orig.coords['x'] - expected = DataArray([1, 2], {'z': 4, 'x2': ('x', ['a', 'b'])}, + expected = DataArray([1, 2], {'z': 4, 'x2': ('x', ['a', 'b']), + 'x': [1, 2]}, dims='x', name='x') self.assertDataArrayIdentical(expected, actual) @@ -532,7 +539,8 @@ def test_coord_coords(self): expected.reset_coords('x2', drop=True), actual) actual.coords['x3'] = ('x', ['a', 'b']) - expected = DataArray([1, 2], {'z': 4, 'x3': ('x', ['a', 'b'])}, + expected = DataArray([1, 2], {'z': 4, 'x3': ('x', ['a', 'b']), + 'x': [1, 2]}, dims='x', name='x') self.assertDataArrayIdentical(expected, actual) @@ -576,7 +584,7 @@ def test_reset_coords(self): with self.assertRaisesRegexp(ValueError, 'cannot reset coord'): data.reset_coords(inplace=True) - with self.assertRaises(KeyError): + with self.assertRaisesRegexp(ValueError, 'cannot be found'): data.reset_coords('foo', drop=True) with self.assertRaisesRegexp(ValueError, 'cannot be found'): data.reset_coords('not_found') @@ -705,7 +713,6 @@ def test_inplace_math_basics(self): self.assertIs(b.variable, v) self.assertArrayEqual(b.values, x) self.assertIs(source_ndarray(b.values), x) - self.assertDatasetIdentical(b._dataset, self.ds) def test_inplace_math_automatic_alignment(self): a = DataArray(range(5), [('x', range(5))]) @@ -726,8 +733,8 @@ def test_math_name(self): self.assertIs((a + a.rename(None)).name, None) self.assertIs((a + a.rename('bar')).name, None) self.assertEqual((a + a).name, 'foo') - self.assertIs((+a['x']).name, None) - self.assertIs((a['x'] 
+ 0).name, None) + self.assertIs((+a['x']).name, 'x') + self.assertIs((a['x'] + 0).name, 'x') self.assertIs((a + a['x']).name, None) def test_math_with_coords(self): @@ -785,12 +792,14 @@ def test_math_with_coords(self): def test_index_math(self): orig = DataArray(range(3), dims='x', name='x') actual = orig + 1 - expected = DataArray(1 + np.arange(3), coords=[('x', range(3))]) + expected = DataArray(1 + np.arange(3), coords=[('x', range(3))], + name='x') self.assertDataArrayIdentical(expected, actual) # regression tests for #254 actual = orig[0] < orig - expected = DataArray([False, True, True], coords=[('x', range(3))]) + expected = DataArray([False, True, True], coords=[('x', range(3))], + name='x') self.assertDataArrayIdentical(expected, actual) actual = orig > orig[0] @@ -855,11 +864,11 @@ def test_drop_coordinates(self): with self.assertRaises(ValueError): arr.drop('not found') - with self.assertRaisesRegexp(ValueError, 'cannot drop'): + with self.assertRaisesRegexp(ValueError, 'cannot be found'): arr.drop(None) renamed = arr.rename('foo') - with self.assertRaisesRegexp(ValueError, 'cannot drop'): + with self.assertRaisesRegexp(ValueError, 'cannot be found'): renamed.drop('foo') def test_drop_index_labels(self): @@ -1100,7 +1109,7 @@ def test_groupby_math(self): actual = array.coords['x'] + grouped self.assertDataArrayIdentical(expected, actual) - ds = array.coords['x'].to_dataset() + ds = array.coords['x'].to_dataset('X') expected = array + ds actual = grouped + ds self.assertDatasetIdentical(expected, actual) @@ -1212,6 +1221,15 @@ def test_resample_first(self): expected = DataArray([np.nan, 4, 8], [('time', times[::4])]) self.assertDataArrayIdentical(expected, actual) + # regression test for http://stackoverflow.com/questions/33158558/ + array = Dataset({'time': times})['time'] + actual = array.resample('1D', dim='time', how='last') + expected_times = pd.to_datetime(['2000-01-01T18', '2000-01-02T18', + '2000-01-03T06']) + expected =
DataArray(expected_times, [('time', times[::4])], + name='time') + self.assertDataArrayIdentical(expected, actual) + def test_resample_skipna(self): times = pd.date_range('2000-01-01', freq='6H', periods=10) array = DataArray(np.ones(10), [('time', times)]) @@ -1305,9 +1323,9 @@ def test_to_pandas(self): def test_to_dataframe(self): # regression test for #260 arr = DataArray(np.random.randn(3, 4), - [('B', [1, 2, 3]), ('A', list('cdef'))]) + [('B', [1, 2, 3]), ('A', list('cdef'))], name='foo') expected = arr.to_series() - actual = arr.to_dataframe()[None] + actual = arr.to_dataframe()['foo'] self.assertArrayEqual(expected.values, actual.values) self.assertArrayEqual(expected.name, actual.name) self.assertArrayEqual(expected.index.values, actual.index.values) @@ -1316,12 +1334,29 @@ def test_to_dataframe(self): arr.coords['C'] = ('B', [-1, -2, -3]) expected = arr.to_series().to_frame() expected['C'] = [-1] * 4 + [-2] * 4 + [-3] * 4 - expected.columns = [None, 'C'] + expected = expected[['C', 'foo']] actual = arr.to_dataframe() self.assertArrayEqual(expected.values, actual.values) self.assertArrayEqual(expected.columns.values, actual.columns.values) self.assertArrayEqual(expected.index.values, actual.index.values) + arr.name = None # unnamed + with self.assertRaisesRegexp(ValueError, 'unnamed'): + arr.to_dataframe() + + def test_to_pandas_name_matches_coordinate(self): + # coordinate with same name as array + arr = DataArray([1, 2, 3], dims='x', name='x') + series = arr.to_series() + self.assertArrayEqual([1, 2, 3], series.values) + self.assertArrayEqual([0, 1, 2], series.index.values) + self.assertEqual('x', series.name) + self.assertEqual('x', series.index.name) + + frame = arr.to_dataframe() + expected = series.to_frame() + self.assertTrue(expected.equals(frame)) + def test_to_and_from_series(self): expected = self.dv.to_dataframe()['foo'] actual = self.dv.to_series() @@ -1401,10 +1436,8 @@ def test_to_and_from_cdms2(self): def test_to_dataset_whole(self): unnamed 
= DataArray([1, 2], dims='x') - actual = unnamed.to_dataset() - expected = Dataset({None: ('x', [1, 2])}) - self.assertDatasetIdentical(expected, actual) - self.assertIsNot(unnamed._dataset, actual) + with self.assertRaisesRegexp(ValueError, 'unable to convert unnamed'): + unnamed.to_dataset() actual = unnamed.to_dataset(name='foo') expected = Dataset({'foo': ('x', [1, 2])}) @@ -1431,8 +1464,8 @@ def test_to_dataset_split(self): with self.assertRaises(TypeError): array.to_dataset('x', name='foo') - roundtriped = actual.to_array(dim='x') - self.assertDataArrayIdentical(array, roundtriped) + roundtripped = actual.to_array(dim='x') + self.assertDataArrayIdentical(array, roundtripped) array = DataArray([1, 2, 3], dims='x') expected = Dataset(OrderedDict([('0', 1), ('1', 2), ('2', 3)])) @@ -1443,7 +1476,8 @@ def test__title_for_slice(self): array = DataArray(np.ones((4, 3, 2)), dims=['a', 'b', 'c']) self.assertEqual('', array._title_for_slice()) self.assertEqual('c = 0', array.isel(c=0)._title_for_slice()) - self.assertEqual('b = 1, c = 0', array.isel(b=1, c=0)._title_for_slice()) + title = array.isel(b=1, c=0)._title_for_slice() + self.assertTrue('b = 1, c = 0' == title or 'c = 0, b = 1' == title) a2 = DataArray(np.ones((4, 1)), dims=['a', 'b']) self.assertEqual('b = [0]', a2._title_for_slice()) @@ -1467,6 +1501,14 @@ def test_dataarray_diff_n1(self): ['x', 'y']) self.assertDataArrayEqual(expected, actual) + def test_coordinate_diff(self): + # regression test for GH634 + arr = DataArray(range(0, 20, 2), dims=['lon'], coords=[range(10)]) + lon = arr.coords['lon'] + expected = DataArray([1] * 9, dims=['lon'], coords=[range(1, 10)], + name='lon') + actual = lon.diff('lon') + def test_shift(self): arr = DataArray([1, 2, 3], dims='x') actual = arr.shift(x=1) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 68989aed1c5..761486464cf 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -124,7 +124,7 @@ def test_constructor(self): 
with self.assertRaisesRegexp(ValueError, "variable 'x' has the same name"): Dataset({'a': x1, 'x': z}) - with self.assertRaisesRegexp(TypeError, 'must be an array or'): + with self.assertRaisesRegexp(TypeError, 'must be given by arrays or'): Dataset({'x': (1, 2, 3, 4, 5, 6, 7)}) with self.assertRaisesRegexp(ValueError, 'already exists as a scalar'): Dataset({'x': 0, 'y': ('x', [1, 2, 3])}) @@ -1034,6 +1034,9 @@ def test_rename(self): with self.assertRaisesRegexp(ValueError, "cannot rename 'not_a_var'"): data.rename({'not_a_var': 'nada'}) + with self.assertRaisesRegexp(ValueError, "'var1' already exists"): + data.rename({'var2': 'var1'}) + # verify that we can rename a variable without accessing the data var1 = data['var1'] data['var1'] = (var1.dims, InaccessibleArray(var1.values)) @@ -1205,8 +1208,10 @@ def test_getitem(self): self.assertDatasetEqual(expected, actual) actual = data['numbers'] - expected = DataArray(data['numbers'].variable, [data['dim3']], - name='numbers') + expected = DataArray(data['numbers'].variable, + {'dim3': data['dim3'], + 'numbers': data['numbers']}, + dims='dim3', name='numbers') self.assertDataArrayIdentical(expected, actual) actual = data[dict(dim1=0)] @@ -1243,6 +1248,14 @@ def test_virtual_variables(self): ds = Dataset({'t': ('x', pd.date_range('2000-01-01', periods=3))}) self.assertTrue((ds['t.year'] == 2000).all()) + def test_virtual_variable_same_name(self): + # regression test for GH367 + times = pd.date_range('2000-01-01', freq='H', periods=5) + data = Dataset({'time': times}) + actual = data['time.time'] + expected = DataArray(times.time, {'time': times}, name='time') + self.assertDataArrayIdentical(actual, expected) + def test_time_season(self): ds = Dataset({'t': pd.date_range('2000-01-01', periods=12, freq='M')}) expected = ['DJF'] * 2 + ['MAM'] * 3 + ['JJA'] * 3 + ['SON'] * 3 + ['DJF'] diff --git a/xray/test/test_plot.py b/xray/test/test_plot.py index 6f85f910cf4..f3e50865e22 100644 --- a/xray/test/test_plot.py +++ 
b/xray/test/test_plot.py @@ -593,7 +593,7 @@ def test_default_title(self): a.coords['d'] = u'foo' self.plotfunc(a.isel(c=1)) title = plt.gca().get_title() - self.assertEqual('c = 1, d = foo', title) + self.assertTrue('c = 1, d = foo' == title or 'd = foo, c = 1' == title) def test_colorbar_label(self): self.darray.name = 'testvar' diff --git a/xray/test/test_variable.py b/xray/test/test_variable.py index 44a7d60e451..79534fe7748 100644 --- a/xray/test/test_variable.py +++ b/xray/test/test_variable.py @@ -9,7 +9,7 @@ from xray import Variable, Dataset, DataArray from xray.core import indexing -from xray.core.variable import (Coordinate, as_variable, _as_compatible_data) +from xray.core.variable import (Coordinate, as_variable, as_compatible_data) from xray.core.indexing import PandasIndexAdapter, LazilyIndexedArray from xray.core.pycompat import PY3, OrderedDict @@ -919,11 +919,11 @@ def test_unchanged_types(self): pd.date_range('2000-01-01', periods=3).values]: x = t(data) self.assertIs(source_ndarray(x), - source_ndarray(_as_compatible_data(x))) + source_ndarray(as_compatible_data(x))) def test_converted_types(self): for input_array in [[[0, 1, 2]], pd.DataFrame([[0, 1, 2]])]: - actual = _as_compatible_data(input_array) + actual = as_compatible_data(input_array) self.assertArrayEqual(np.asarray(input_array), actual) self.assertEqual(np.ndarray, type(actual)) self.assertEqual(np.asarray(input_array).dtype, actual.dtype) @@ -931,39 +931,39 @@ def test_converted_types(self): def test_masked_array(self): original = np.ma.MaskedArray(np.arange(5)) expected = np.arange(5) - actual = _as_compatible_data(original) + actual = as_compatible_data(original) self.assertArrayEqual(expected, actual) self.assertEqual(np.dtype(int), actual.dtype) original = np.ma.MaskedArray(np.arange(5), mask=4 * [False] + [True]) expected = np.arange(5.0) expected[-1] = np.nan - actual = _as_compatible_data(original) + actual = as_compatible_data(original) self.assertArrayEqual(expected, actual) 
self.assertEqual(np.dtype(float), actual.dtype) def test_datetime(self): expected = np.datetime64('2000-01-01T00Z') - actual = _as_compatible_data(expected) + actual = as_compatible_data(expected) self.assertEqual(expected, actual) self.assertEqual(np.ndarray, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) expected = np.array([np.datetime64('2000-01-01T00Z')]) - actual = _as_compatible_data(expected) + actual = as_compatible_data(expected) self.assertEqual(np.asarray(expected), actual) self.assertEqual(np.ndarray, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) expected = np.array([np.datetime64('2000-01-01T00Z', 'ns')]) - actual = _as_compatible_data(expected) + actual = as_compatible_data(expected) self.assertEqual(np.asarray(expected), actual) self.assertEqual(np.ndarray, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) self.assertIs(expected, source_ndarray(np.asarray(actual))) expected = np.datetime64('2000-01-01T00Z', 'ns') - actual = _as_compatible_data(datetime(2000, 1, 1)) + actual = as_compatible_data(datetime(2000, 1, 1)) self.assertEqual(np.asarray(expected), actual) self.assertEqual(np.ndarray, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype)