diff --git a/diffxpy/api/__init__.py b/diffxpy/api/__init__.py
index a1c1dad..18dd1b4 100644
--- a/diffxpy/api/__init__.py
+++ b/diffxpy/api/__init__.py
@@ -3,6 +3,7 @@
 
 from . import test
 from . import enrich
+from . import fit
 from . import stats
 from . import utils
 from .. import pkg_constants
diff --git a/diffxpy/api/fit.py b/diffxpy/api/fit.py
new file mode 100644
index 0000000..f08ce84
--- /dev/null
+++ b/diffxpy/api/fit.py
@@ -0,0 +1 @@
+from diffxpy.fit import model, residuals, partition
diff --git a/diffxpy/api/utils.py b/diffxpy/api/utils.py
index fb653a7..b2e4ded 100644
--- a/diffxpy/api/utils.py
+++ b/diffxpy/api/utils.py
@@ -1,4 +1,4 @@
 from diffxpy.testing.utils import constraint_matrix_from_string, constraint_matrix_from_dict, \
     constraint_system_from_star
-from diffxpy.testing.utils import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata
+from diffxpy.testing.utils import design_matrix
 from diffxpy.testing.utils import view_coef_names, preview_coef_names
diff --git a/diffxpy/fit/__init__.py b/diffxpy/fit/__init__.py
new file mode 100644
index 0000000..d1ddf78
--- /dev/null
+++ b/diffxpy/fit/__init__.py
@@ -0,0 +1 @@
+from .fit import model, residuals, partition
\ No newline at end of file
diff --git a/diffxpy/fit/external.py b/diffxpy/fit/external.py
new file mode 100644
index 0000000..330ea9c
--- /dev/null
+++ b/diffxpy/fit/external.py
@@ -0,0 +1,3 @@
+from diffxpy.testing.tests import _fit
+from diffxpy.testing.utils import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \
+    constraint_system_from_star
\ No newline at end of file
diff --git a/diffxpy/fit/fit.py b/diffxpy/fit/fit.py
new file mode 100644
index 0000000..995a4d3
--- /dev/null
+++ b/diffxpy/fit/fit.py
@@ -0,0 +1,811 @@
+import anndata
+try:
+    from anndata.base import Raw
+except ImportError:
+    from anndata import Raw
+import logging
+import numpy as np
+import pandas as pd
+import patsy
+import scipy.sparse
+from typing import Union, List, Dict, Callable, Tuple
+
+from batchglm.models.base import _InputDataBase
+from .external import _fit
+from .external import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \
+    constraint_system_from_star
+
+
+def model(
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
+        formula_loc: Union[None, str] = None,
+        formula_scale: Union[None, str] = "~1",
+        as_numeric: Union[List[str], Tuple[str], str] = (),
+        init_a: Union[np.ndarray, str] = "AUTO",
+        init_b: Union[np.ndarray, str] = "AUTO",
+        gene_names: Union[np.ndarray, list] = None,
+        sample_description: Union[None, pd.DataFrame] = None,
+        dmat_loc: Union[patsy.design_info.DesignMatrix] = None,
+        dmat_scale: Union[patsy.design_info.DesignMatrix] = None,
+        constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+        constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+        noise_model: str = "nb",
+        size_factors: Union[np.ndarray, pd.core.series.Series, str] = None,
+        batch_size: int = None,
+        training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+        quick_scale: bool = False,
+        dtype="float64",
+        **kwargs
+):
+    """
+    Fit model via maximum likelihood for each gene.
+
+    :param data: Input data matrix (observations x features) or (cells x genes).
+    :param formula_loc: formula
+        model formula for location and scale parameter models.
+        If not specified, `formula` will be used instead.
+    :param formula_scale: formula
+        model formula for scale parameter model.
+        If not specified, `formula` will be used instead.
+    :param as_numeric:
+        Which columns of sample_description to treat as numeric and
+        not as categorical. This yields columns in the design matrix
+        which do not correspond to one-hot encoded discrete factors.
+        This makes sense for number of genes, time, pseudotime or space
+        for example.
+    :param init_a: (Optional) Low-level initial values for a.
+        Can be:
+
+        - str:
+            * "auto": automatically choose best initialization
+            * "standard": initialize intercept with observed mean
+            * "closed_form": try to initialize with closed form
+        - np.ndarray: direct initialization of 'a'
+    :param init_b: (Optional) Low-level initial values for b
+        Can be:
+
+        - str:
+            * "auto": automatically choose best initialization
+            * "standard": initialize with zeros
+            * "closed_form": try to initialize with closed form
+        - np.ndarray: direct initialization of 'b'
+    :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
+    :param sample_description: optional pandas.DataFrame containing sample annotations
+    :param dmat_loc: Pre-built location model design matrix.
+        This over-rides formula_loc and sample description information given in
+        data or sample_description.
+    :param dmat_scale: Pre-built scale model design matrix.
+        This over-rides formula_scale and sample description information given in
+        data or sample_description.
+    :param constraints_loc: Constraints for location model. Can be one of the following:
+
+            - np.ndarray:
+                Array with constraints in rows and model parameters in columns.
+                Each constraint contains non-zero entries for the a of parameters that
+                has to sum to zero. This constraint is enforced by binding one parameter
+                to the negative sum of the other parameters, effectively representing that
+                parameter as a function of the other parameters. This dependent
+                parameter is indicated by a -1 in this array, the independent parameters
+                of that constraint (which may be dependent at an earlier constraint)
+                are indicated by a 1. You should only use this option
+                together with prebuilt design matrix for the location model, dmat_loc,
+                for example via de.utils.setup_constrained().
+            - dict:
+                Every element of the dictionary corresponds to one set of equality constraints.
+                Each set has to be be an entry of the form {..., x: y, ...}
+                where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                a single equality constraint.
+
+                    E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                        zero. This is applicable if repeats of a an experiment within each condition
+                        are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                for other cases.
+            - list of strings or tuple of strings:
+                String encoded equality constraints.
+
+                    E.g. ["batch1 + batch2 + batch3 = 0"]
+            - None:
+                No constraints are used, this is equivalent to using an identity matrix as a
+                constraint matrix.
+    :param constraints_scale: Constraints for scale model. Can be one of the following:
+
+            - np.ndarray:
+                Array with constraints in rows and model parameters in columns.
+                Each constraint contains non-zero entries for the a of parameters that
+                has to sum to zero. This constraint is enforced by binding one parameter
+                to the negative sum of the other parameters, effectively representing that
+                parameter as a function of the other parameters. This dependent
+                parameter is indicated by a -1 in this array, the independent parameters
+                of that constraint (which may be dependent at an earlier constraint)
+                are indicated by a 1. You should only use this option
+                together with prebuilt design matrix for the scale model, dmat_scale,
+                for example via de.utils.setup_constrained().
+            - dict:
+                Every element of the dictionary corresponds to one set of equality constraints.
+                Each set has to be be an entry of the form {..., x: y, ...}
+                where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                a single equality constraint.
+
+                    E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                        zero. This is applicable if repeats of a an experiment within each condition
+                        are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                for other cases.
+            - list of strings or tuple of strings:
+                String encoded equality constraints.
+
+                    E.g. ["batch1 + batch2 + batch3 = 0"]
+            - None:
+                No constraints are used, this is equivalent to using an identity matrix as a
+                constraint matrix.
+    :param size_factors: 1D array of transformed library size factors for each cell in the
+        same order as in data or string-type column identifier of size-factor containing
+        column in sample description.
+    :param noise_model: str, noise model to use in model-based unit_test. Possible options:
+
+        - 'nb': default
+    :param batch_size: The batch size to use for the estimator.
+    :param training_strategy: {str, function, list} training strategy to use. Can be:
+
+        - str: will use Estimator.TrainingStrategy[training_strategy] to train
+        - function: Can be used to implement custom training function will be called as
+          `training_strategy(estimator)`.
+        - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
+          method arguments.
+    :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate.
+
+        Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
+    :param dtype: Allows specifying the precision which should be used to fit data.
+
+        Should be "float32" for single precision or "float64" for double precision.
+    :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
+    """
+    if len(kwargs) != 0:
+        logging.getLogger("diffxpy").debug("additional kwargs: %s", str(kwargs))
+
+    if (dmat_loc is None and formula_loc is None) or \
+            (dmat_loc is not None and formula_loc is not None):
+        raise ValueError("Supply either dmat_loc or formula_loc.")
+    if (dmat_scale is None and formula_scale is None) or \
+            (dmat_scale is not None and formula_scale != "~1"):
+        raise ValueError("Supply either dmat_scale or formula_scale.")
+
+    # # Parse input data formats:
+    gene_names = parse_gene_names(data, gene_names)
+    if dmat_loc is None and dmat_scale is None:
+        sample_description = parse_sample_description(data, sample_description)
+    size_factors = parse_size_factors(
+        size_factors=size_factors,
+        data=data,
+        sample_description=sample_description
+    )
+
+    design_loc, constraints_loc = constraint_system_from_star(
+        dmat=dmat_loc,
+        sample_description=sample_description,
+        formula=formula_loc,
+        as_numeric=as_numeric,
+        constraints=constraints_loc,
+        return_type="patsy"
+    )
+    design_scale, constraints_scale = constraint_system_from_star(
+        dmat=dmat_scale,
+        sample_description=sample_description,
+        formula=formula_scale,
+        as_numeric=as_numeric,
+        constraints=constraints_scale,
+        return_type="patsy"
+    )
+
+    model = _fit(
+        noise_model=noise_model,
+        data=data,
+        design_loc=design_loc,
+        design_scale=design_scale,
+        constraints_loc=constraints_loc,
+        constraints_scale=constraints_scale,
+        init_a=init_a,
+        init_b=init_b,
+        gene_names=gene_names,
+        size_factors=size_factors,
+        batch_size=batch_size,
+        training_strategy=training_strategy,
+        quick_scale=quick_scale,
+        dtype=dtype
+    )
+    return model
+
+
+def residuals(
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
+        formula_loc: Union[None, str] = None,
+        formula_scale: Union[None, str] = "~1",
+        as_numeric: Union[List[str], Tuple[str], str] = (),
+        init_a: Union[np.ndarray, str] = "AUTO",
+        init_b: Union[np.ndarray, str] = "AUTO",
+        gene_names: Union[np.ndarray, list] = None,
+        sample_description: Union[None, pd.DataFrame] = None,
+        dmat_loc: Union[patsy.design_info.DesignMatrix] = None,
+        dmat_scale: Union[patsy.design_info.DesignMatrix] = None,
+        constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+        constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+        noise_model: str = "nb",
+        size_factors: Union[np.ndarray, pd.core.series.Series, str] = None,
+        batch_size: int = None,
+        training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+        quick_scale: bool = False,
+        dtype="float64",
+        **kwargs
+):
+    """
+    Fits model for each gene and returns residuals.
+
+    :param data: Input data matrix (observations x features) or (cells x genes).
+    :param formula_loc: formula
+        model formula for location and scale parameter models.
+        If not specified, `formula` will be used instead.
+    :param formula_scale: formula
+        model formula for scale parameter model.
+        If not specified, `formula` will be used instead.
+    :param as_numeric:
+        Which columns of sample_description to treat as numeric and
+        not as categorical. This yields columns in the design matrix
+        which do not correspond to one-hot encoded discrete factors.
+        This makes sense for number of genes, time, pseudotime or space
+        for example.
+    :param init_a: (Optional) Low-level initial values for a.
+        Can be:
+
+        - str:
+            * "auto": automatically choose best initialization
+            * "standard": initialize intercept with observed mean
+            * "closed_form": try to initialize with closed form
+        - np.ndarray: direct initialization of 'a'
+    :param init_b: (Optional) Low-level initial values for b
+        Can be:
+
+        - str:
+            * "auto": automatically choose best initialization
+            * "standard": initialize with zeros
+            * "closed_form": try to initialize with closed form
+        - np.ndarray: direct initialization of 'b'
+    :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
+    :param sample_description: optional pandas.DataFrame containing sample annotations
+    :param dmat_loc: Pre-built location model design matrix.
+        This over-rides formula_loc and sample description information given in
+        data or sample_description.
+    :param dmat_scale: Pre-built scale model design matrix.
+        This over-rides formula_scale and sample description information given in
+        data or sample_description.
+    :param constraints_loc: Constraints for location model. Can be one of the following:
+
+            - np.ndarray:
+                Array with constraints in rows and model parameters in columns.
+                Each constraint contains non-zero entries for the a of parameters that
+                has to sum to zero. This constraint is enforced by binding one parameter
+                to the negative sum of the other parameters, effectively representing that
+                parameter as a function of the other parameters. This dependent
+                parameter is indicated by a -1 in this array, the independent parameters
+                of that constraint (which may be dependent at an earlier constraint)
+                are indicated by a 1. You should only use this option
+                together with prebuilt design matrix for the location model, dmat_loc,
+                for example via de.utils.setup_constrained().
+            - dict:
+                Every element of the dictionary corresponds to one set of equality constraints.
+                Each set has to be be an entry of the form {..., x: y, ...}
+                where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                a single equality constraint.
+
+                    E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                        zero. This is applicable if repeats of a an experiment within each condition
+                        are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                for other cases.
+            - list of strings or tuple of strings:
+                String encoded equality constraints.
+
+                    E.g. ["batch1 + batch2 + batch3 = 0"]
+            - None:
+                No constraints are used, this is equivalent to using an identity matrix as a
+                constraint matrix.
+    :param constraints_scale: Constraints for scale model. Can be one of the following:
+
+            - np.ndarray:
+                Array with constraints in rows and model parameters in columns.
+                Each constraint contains non-zero entries for the a of parameters that
+                has to sum to zero. This constraint is enforced by binding one parameter
+                to the negative sum of the other parameters, effectively representing that
+                parameter as a function of the other parameters. This dependent
+                parameter is indicated by a -1 in this array, the independent parameters
+                of that constraint (which may be dependent at an earlier constraint)
+                are indicated by a 1. You should only use this option
+                together with prebuilt design matrix for the scale model, dmat_scale,
+                for example via de.utils.setup_constrained().
+            - dict:
+                Every element of the dictionary corresponds to one set of equality constraints.
+                Each set has to be be an entry of the form {..., x: y, ...}
+                where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                a single equality constraint.
+
+                    E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                        zero. This is applicable if repeats of a an experiment within each condition
+                        are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                for other cases.
+            - list of strings or tuple of strings:
+                String encoded equality constraints.
+
+                    E.g. ["batch1 + batch2 + batch3 = 0"]
+            - None:
+                No constraints are used, this is equivalent to using an identity matrix as a
+                constraint matrix.
+    :param size_factors: 1D array of transformed library size factors for each cell in the
+        same order as in data or string-type column identifier of size-factor containing
+        column in sample description.
+    :param noise_model: str, noise model to use in model-based unit_test. Possible options:
+
+        - 'nb': default
+    :param batch_size: The batch size to use for the estimator.
+    :param training_strategy: {str, function, list} training strategy to use. Can be:
+
+        - str: will use Estimator.TrainingStrategy[training_strategy] to train
+        - function: Can be used to implement custom training function will be called as
+          `training_strategy(estimator)`.
+        - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
+          method arguments.
+    :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate.
+
+        Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
+    :param dtype: Allows specifying the precision which should be used to fit data.
+
+        Should be "float32" for single precision or "float64" for double precision.
+    :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
+    """
+    estim = model(
+        data=data,
+        formula_loc=formula_loc,
+        formula_scale=formula_scale,
+        as_numeric=as_numeric,
+        init_a=init_a,
+        init_b=init_b,
+        gene_names=gene_names,
+        sample_description=sample_description,
+        dmat_loc=dmat_loc,
+        dmat_scale=dmat_scale,
+        constraints_loc=constraints_loc,
+        constraints_scale=constraints_scale,
+        noise_model=noise_model,
+        size_factors=size_factors,
+        batch_size=batch_size,
+        training_strategy=training_strategy,
+        quick_scale=quick_scale,
+        dtype=dtype,
+        ** kwargs
+    )
+    residuals = estim.x - estim.model.location
+    return residuals
+
+
+def partition(
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
+        parts: Union[str, np.ndarray, list],
+        gene_names: Union[np.ndarray, list] = None,
+        sample_description: pd.DataFrame = None,
+        dmat_loc: Union[patsy.design_info.DesignMatrix] = None,
+        dmat_scale: Union[patsy.design_info.DesignMatrix] = None,
+        size_factors: Union[np.ndarray, pd.core.series.Series, str] = None
+):
+    r"""
+    Perform differential expression test for each group. This class handles
+    the partitioning of the data set, the differential test callls and
+    the sumamry of the individual tests into one
+    DifferentialExpressionTestMulti object. All functions the yield
+    DifferentialExpressionTestSingle objects can be performed on each
+    partition.
+
+    Wraps _Partition so that doc strings are nice.
+
+    :param data: Array-like or anndata.Anndata object containing observations.
+        Input data matrix (observations x features) or (cells x genes).
+    :param parts: str, array
+
+            - column in data.obs/sample_description which contains the split of observations into the two groups.
+            - array of length `num_observations` containing group labels
+    :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
+    :param sample_description: optional pandas.DataFrame containing sample annotations.
+    :param dmat_loc: Pre-built location model design matrix.
+        This over-rides formula_loc and sample description information given in
+        data or sample_description.
+    :param dmat_scale: Pre-built scale model design matrix.
+        This over-rides formula_scale and sample description information given in
+        data or sample_description.
+    :param size_factors: 1D array of transformed library size factors for each cell in the
+        same order as in data or string-type column identifier of size-factor containing
+        column in sample description.
+    """
+    return (_Partition(
+        data=data,
+        parts=parts,
+        gene_names=gene_names,
+        sample_description=sample_description,
+        dmat_loc=dmat_loc,
+        dmat_scale=dmat_scale,
+        size_factors=size_factors
+    ))
+
+
+class _Partition:
+    """
+    Perform model fit separately for each group.
+    """
+
+    def __init__(
+            self,
+            data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
+            parts: Union[str, np.ndarray, list],
+            gene_names: Union[np.ndarray, list] = None,
+            sample_description: pd.DataFrame = None,
+            dmat_loc: Union[patsy.design_info.DesignMatrix] = None,
+            dmat_scale: Union[patsy.design_info.DesignMatrix] = None,
+            size_factors: Union[np.ndarray, pd.core.series.Series, str] = None
+    ):
+        """
+        :param data: Array-like or anndata.Anndata object containing observations.
+            Input data matrix (observations x features) or (cells x genes).
+        :param parts: str, array
+
+            - column in data.obs/sample_description which contains the split of observations into the two groups.
+            - array of length `num_observations` containing group labels
+        :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
+        :param sample_description: optional pandas.DataFrame containing sample annotations.
+        :param dmat_loc: Pre-built location model design matrix.
+            This over-rides formula_loc and sample description information given in
+            data or sample_description.
+        :param dmat_scale: Pre-built scale model design matrix.
+            This over-rides formula_scale and sample description information given in
+            data or sample_description.
+        :param size_factors: 1D array of transformed library size factors for each cell in the
+            same order as in data or string-type column identifier of size-factor containing
+            column in sample description.
+        """
+        if isinstance(data, _InputDataBase):
+            self.x = data.x
+        elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
+            self.x = data.X
+        elif isinstance(data, np.ndarray):
+            self.x = data
+        else:
+            raise ValueError("data type %s not recognized" % type(data))
+        self.gene_names = parse_gene_names(data, gene_names)
+        self.sample_description = parse_sample_description(data, sample_description)
+        self.dmat_loc = dmat_loc
+        self.dmat_scale = dmat_scale
+        self.size_factors = size_factors
+        self.partition = parse_grouping(data, sample_description, parts)
+        self.partitions = np.unique(self.partition)
+        self.partition_idx = [np.where(self.partition == x)[0] for x in self.partitions]
+
+    def model(
+            self,
+            formula_loc: Union[None, str] = None,
+            formula_scale: Union[None, str] = "~1",
+            as_numeric: Union[List[str], Tuple[str], str] = (),
+            init_a: Union[np.ndarray, str] = "AUTO",
+            init_b: Union[np.ndarray, str] = "AUTO",
+            constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+            constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+            noise_model: str = "nb",
+            batch_size: int = None,
+            training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+            quick_scale: bool = False,
+            dtype="float64",
+            **kwargs
+    ):
+        """
+        Fit model for each gene and for each group.
+
+        :param formula_loc: formula
+            model formula for location and scale parameter models.
+            If not specified, `formula` will be used instead.
+        :param formula_scale: formula
+            model formula for scale parameter model.
+            If not specified, `formula` will be used instead.
+        :param as_numeric:
+            Which columns of sample_description to treat as numeric and
+            not as categorical. This yields columns in the design matrix
+            which do not correspond to one-hot encoded discrete factors.
+            This makes sense for number of genes, time, pseudotime or space
+            for example.
+        :param init_a: (Optional) Low-level initial values for a.
+            Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "standard": initialize intercept with observed mean
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'a'
+        :param init_b: (Optional) Low-level initial values for b
+            Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "standard": initialize with zeros
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'b'
+        :param constraints_loc: Constraints for location model. Can be one of the following:
+
+                - np.ndarray:
+                    Array with constraints in rows and model parameters in columns.
+                    Each constraint contains non-zero entries for the a of parameters that
+                    has to sum to zero. This constraint is enforced by binding one parameter
+                    to the negative sum of the other parameters, effectively representing that
+                    parameter as a function of the other parameters. This dependent
+                    parameter is indicated by a -1 in this array, the independent parameters
+                    of that constraint (which may be dependent at an earlier constraint)
+                    are indicated by a 1. You should only use this option
+                    together with prebuilt design matrix for the location model, dmat_loc,
+                    for example via de.utils.setup_constrained().
+                - dict:
+                    Every element of the dictionary corresponds to one set of equality constraints.
+                    Each set has to be be an entry of the form {..., x: y, ...}
+                    where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                    and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                    a single equality constraint.
+
+                        E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                            zero. This is applicable if repeats of a an experiment within each condition
+                            are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                    Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                    for other cases.
+                - list of strings or tuple of strings:
+                    String encoded equality constraints.
+
+                        E.g. ["batch1 + batch2 + batch3 = 0"]
+                - None:
+                    No constraints are used, this is equivalent to using an identity matrix as a
+                    constraint matrix.
+        :param constraints_scale: Constraints for scale model. Can be one of the following:
+
+                - np.ndarray:
+                    Array with constraints in rows and model parameters in columns.
+                    Each constraint contains non-zero entries for the a of parameters that
+                    has to sum to zero. This constraint is enforced by binding one parameter
+                    to the negative sum of the other parameters, effectively representing that
+                    parameter as a function of the other parameters. This dependent
+                    parameter is indicated by a -1 in this array, the independent parameters
+                    of that constraint (which may be dependent at an earlier constraint)
+                    are indicated by a 1. You should only use this option
+                    together with prebuilt design matrix for the scale model, dmat_scale,
+                    for example via de.utils.setup_constrained().
+                - dict:
+                    Every element of the dictionary corresponds to one set of equality constraints.
+                    Each set has to be be an entry of the form {..., x: y, ...}
+                    where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                    and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                    a single equality constraint.
+
+                        E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                            zero. This is applicable if repeats of a an experiment within each condition
+                            are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                    Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                    for other cases.
+                - list of strings or tuple of strings:
+                    String encoded equality constraints.
+
+                        E.g. ["batch1 + batch2 + batch3 = 0"]
+                - None:
+                    No constraints are used, this is equivalent to using an identity matrix as a
+                    constraint matrix.
+        :param noise_model: str, noise model to use in model-based unit_test. Possible options:
+
+            - 'nb': default
+        :param batch_size: The batch size to use for the estimator.
+        :param training_strategy: {str, function, list} training strategy to use. Can be:
+
+            - str: will use Estimator.TrainingStrategy[training_strategy] to train
+            - function: Can be used to implement custom training function will be called as
+              `training_strategy(estimator)`.
+            - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
+              method arguments.
+        :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate.
+
+            Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
+        :param dtype: Allows specifying the precision which should be used to fit data.
+
+            Should be "float32" for single precision or "float64" for double precision.
+        :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
+        """
+        estims = []
+        for i, idx in enumerate(self.partition_idx):
+            estims.append(model(
+                data=self.x[idx, :],
+                formula_loc=formula_loc,
+                formula_scale=formula_scale,
+                as_numeric=as_numeric,
+                init_a=init_a,
+                init_b=init_b,
+                gene_names=self.gene_names,
+                sample_description=self.sample_description.iloc[idx, :],
+                dmat_loc=self.dmat_loc[idx, :] if self.dmat_loc is not None else None,
+                dmat_scale=self.dmat_scale[idx, :] if self.dmat_scale is not None else None,
+                constraints_loc=constraints_loc,
+                constraints_scale=constraints_scale,
+                noise_model=noise_model,
+                size_factors=self.size_factors[idx] if self.size_factors is not None else None,
+                batch_size=batch_size,
+                training_strategy=training_strategy,
+                quick_scale=quick_scale,
+                dtype=dtype,
+                **kwargs
+            ))
+        return estims
+
+    def residuals(
+            self,
+            formula_loc: Union[None, str] = None,
+            formula_scale: Union[None, str] = "~1",
+            as_numeric: Union[List[str], Tuple[str], str] = (),
+            init_a: Union[np.ndarray, str] = "AUTO",
+            init_b: Union[np.ndarray, str] = "AUTO",
+            constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+            constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
+            noise_model: str = "nb",
+            batch_size: int = None,
+            training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+            quick_scale: bool = False,
+            dtype="float64",
+            **kwargs
+    ):
+        """
+        Fit model for each gene and for each group and computes residuals.
+
+        :param formula_loc: formula
+            model formula for location and scale parameter models.
+            If not specified, `formula` will be used instead.
+        :param formula_scale: formula
+            model formula for scale parameter model.
+            If not specified, `formula` will be used instead.
+        :param as_numeric:
+            Which columns of sample_description to treat as numeric and
+            not as categorical. This yields columns in the design matrix
+            which do not correspond to one-hot encoded discrete factors.
+            This makes sense for number of genes, time, pseudotime or space
+            for example.
+        :param init_a: (Optional) Low-level initial values for a.
+            Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "standard": initialize intercept with observed mean
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'a'
+        :param init_b: (Optional) Low-level initial values for b
+            Can be:
+
+            - str:
+                * "auto": automatically choose best initialization
+                * "standard": initialize with zeros
+                * "closed_form": try to initialize with closed form
+            - np.ndarray: direct initialization of 'b'
+        :param constraints_loc: Constraints for location model. Can be one of the following:
+
+                - np.ndarray:
+                    Array with constraints in rows and model parameters in columns.
+                    Each constraint contains non-zero entries for the a of parameters that
+                    has to sum to zero. This constraint is enforced by binding one parameter
+                    to the negative sum of the other parameters, effectively representing that
+                    parameter as a function of the other parameters. This dependent
+                    parameter is indicated by a -1 in this array, the independent parameters
+                    of that constraint (which may be dependent at an earlier constraint)
+                    are indicated by a 1. You should only use this option
+                    together with prebuilt design matrix for the location model, dmat_loc,
+                    for example via de.utils.setup_constrained().
+                - dict:
+                    Every element of the dictionary corresponds to one set of equality constraints.
+                    Each set has to be be an entry of the form {..., x: y, ...}
+                    where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                    and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                    a single equality constraint.
+
+                        E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                            zero. This is applicable if repeats of a an experiment within each condition
+                            are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                    Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                    for other cases.
+                - list of strings or tuple of strings:
+                    String encoded equality constraints.
+
+                        E.g. ["batch1 + batch2 + batch3 = 0"]
+                - None:
+                    No constraints are used, this is equivalent to using an identity matrix as a
+                    constraint matrix.
+        :param constraints_scale: Constraints for scale model. Can be one of the following:
+
+                - np.ndarray:
+                    Array with constraints in rows and model parameters in columns.
+                    Each constraint contains non-zero entries for the a of parameters that
+                    has to sum to zero. This constraint is enforced by binding one parameter
+                    to the negative sum of the other parameters, effectively representing that
+                    parameter as a function of the other parameters. This dependent
+                    parameter is indicated by a -1 in this array, the independent parameters
+                    of that constraint (which may be dependent at an earlier constraint)
+                    are indicated by a 1. You should only use this option
+                    together with prebuilt design matrix for the scale model, dmat_scale,
+                    for example via de.utils.setup_constrained().
+                - dict:
+                    Every element of the dictionary corresponds to one set of equality constraints.
+                    Each set has to be be an entry of the form {..., x: y, ...}
+                    where x is the factor to be constrained and y is a factor by which levels of x are grouped
+                    and then constrained. Set y="1" to constrain all levels of x to sum to one,
+                    a single equality constraint.
+
+                        E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to
+                            zero. This is applicable if repeats of a an experiment within each condition
+                            are independent so that the set-up ~1+condition+batch is perfectly confounded.
+
+                    Can only group by non-constrained effects right now, use constraint_matrix_from_string
+                    for other cases.
+                - list of strings or tuple of strings:
+                    String encoded equality constraints.
+
+                        E.g. ["batch1 + batch2 + batch3 = 0"]
+                - None:
+                    No constraints are used, this is equivalent to using an identity matrix as a
+                    constraint matrix.
+        :param noise_model: str, noise model to use in model-based unit_test. Possible options:
+
+            - 'nb': default
+        :param batch_size: The batch size to use for the estimator.
+        :param training_strategy: {str, function, list} training strategy to use. Can be:
+
+            - str: will use Estimator.TrainingStrategy[training_strategy] to train
+            - function: Can be used to implement custom training function will be called as
+              `training_strategy(estimator)`.
+            - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
+              method arguments.
+        :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate.
+
+            Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
+        :param dtype: Allows specifying the precision which should be used to fit data.
+
+            Should be "float32" for single precision or "float64" for double precision.
+        :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
+        """
+        residuals = []
+        for i, idx in enumerate(self.partition_idx):
+            residuals.append(residuals(
+                data=self.x[idx, :],
+                formula_loc=formula_loc,
+                formula_scale=formula_scale,
+                as_numeric=as_numeric,
+                init_a=init_a,
+                init_b=init_b,
+                gene_names=self.gene_names,
+                sample_description=self.sample_description.iloc[idx, :],
+                dmat_loc=self.dmat_loc[idx, :] if self.dmat_loc is not None else None,
+                dmat_scale=self.dmat_scale[idx, :] if self.dmat_scale is not None else None,
+                constraints_loc=constraints_loc,
+                constraints_scale=constraints_scale,
+                noise_model=noise_model,
+                size_factors=self.size_factors[idx] if self.size_factors is not None else None,
+                batch_size=batch_size,
+                training_strategy=training_strategy,
+                quick_scale=quick_scale,
+                dtype=dtype,
+                **kwargs
+            ))
+        return residuals
+
+
diff --git a/diffxpy/models/batch_bfgs/optim.py b/diffxpy/models/batch_bfgs/optim.py
index ef27ec5..98c234e 100644
--- a/diffxpy/models/batch_bfgs/optim.py
+++ b/diffxpy/models/batch_bfgs/optim.py
@@ -16,17 +16,17 @@
 class Estim_BFGS_Model():
 
     def __init__(self, Estim_BFGS, nproc):
-        self._num_observations = Estim_BFGS.X.shape[0]
-        self._num_features = Estim_BFGS.X.shape[1]
+        self._num_observations = Estim_BFGS.x.shape[0]
+        self._num_features = Estim_BFGS.x.shape[1]
         self._features = Estim_BFGS.feature_names
-        self._observations = Estim_BFGS.X.shape[0]
+        self._observations = Estim_BFGS.x.shape[0]
         self._design_loc = Estim_BFGS.design_loc
         self._design_scale = Estim_BFGS.design_scale
         self._loss = xr.DataArray(Estim_BFGS.full_loss(nproc))
         self._log_probs = -self._loss
         self._probs = np.exp(self._log_probs)
         self._mles = xr.DataArray(np.transpose(Estim_BFGS.mles()))
-        self._gradient = xr.DataArray(np.zeros([Estim_BFGS.X.shape[1]]))
+        self._gradient = xr.DataArray(np.zeros([Estim_BFGS.x.shape[1]]))
         self._fisher_inv = xr.DataArray(Estim_BFGS.fisher_inv)
         self._idx_loc = np.arange(0, Estim_BFGS.design_loc.shape[1])
         self._idx_scale = np.arange(Estim_BFGS.design_loc.shape[1],
diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py
index 389f621..399a45d 100644
--- a/diffxpy/pkg_constants.py
+++ b/diffxpy/pkg_constants.py
@@ -1,5 +1,3 @@
-DE_TREAT_ZEROVAR_TT_AS_SIG = True
-
 BATCHGLM_OPTIM_GD = False
 BATCHGLM_OPTIM_ADAM = False
 BATCHGLM_OPTIM_ADAGRAD = False
diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py
index 9f60d26..5b8f8fe 100644
--- a/diffxpy/stats/stats.py
+++ b/diffxpy/stats/stats.py
@@ -1,9 +1,8 @@
-from typing import Union
-
 import numpy as np
 import numpy.linalg
+import scipy.sparse
 import scipy.stats
-import xarray as xr
+from typing import Union
 
 
 def likelihood_ratio_test(
@@ -34,42 +33,40 @@ def likelihood_ratio_test(
     delta_df = df_full - df_reduced
     # Compute the deviance test statistic.
     delta_dev = 2 * (ll_full - ll_reduced)
-    # Compute the p-values based on the deviance and its expection based on the chi-square distribution.
+    # Compute the p-values based on the deviance and its expectation based on the chi-square distribution.
     pvals = 1 - scipy.stats.chi2(delta_df).cdf(delta_dev)
     return pvals
 
 
 def mann_whitney_u_test(
-        x0: np.ndarray,
-        x1: np.ndarray,
+        x0: Union[np.ndarray, scipy.sparse.csr_matrix],
+        x1: Union[np.ndarray, scipy.sparse.csr_matrix]
 ):
     """
-    Perform Wilcoxon rank sum test (Mann-Whitney U test) along second axis
-    (for each gene).
+    Perform Wilcoxon rank sum test (Mann-Whitney U test) along second axis, ie. for each gene.
 
     The Wilcoxon rank sum test is a non-parameteric test
     to compare two groups of observations.
 
-    :param x0: np.array (observations x genes)
+    :param x0: (observations x genes)
         Observations in first group by gene
-    :param x1:  np.array (observations x genes)
+    :param x1: (observations x genes)
         Observations in second group by gene.
     """
     axis = 1
     if np.any(np.ndim(x0) != np.ndim(x1)):
-        raise ValueError('stats.wilcoxon(): number of dimensions is not allowed to differ between x0 and x1!')
+        raise ValueError('number of dimensions is not allowed to differ between x0 and x1')
     # Reshape into 2D array if only one test is performed.
     if np.ndim(x0) == 1:
         x0 = x0.reshape([x0.shape[0], 1])
         x1 = x1.reshape([x1.shape[0], 1])
     if np.any(x0.shape[axis] != x1.shape[axis]):
-        raise ValueError(
-            'stats.wilcoxon(): the first axis (number of tests) is not allowed to differ between x0 and x1!')
+        raise ValueError('the first axis (number of tests) is not allowed to differ between x0 and x1')
 
     pvals = np.asarray([
         scipy.stats.mannwhitneyu(
-            x=x0[:, i].flatten(),
-            y=x1[:, i].flatten(),
+            x=np.asarray(x0[:, i].todense()).flatten() if isinstance(x0, scipy.sparse.csr_matrix) else x0[:, i],
+            y=np.asarray(x1[:, i].todense()).flatten() if isinstance(x0, scipy.sparse.csr_matrix) else x1[:, i],
             use_continuity=True,
             alternative="two-sided"
         ).pvalue for i in range(x0.shape[1])
@@ -252,6 +249,8 @@ def wald_test_chisq(
     if theta_mle.shape[0] != theta_covar.shape[1]:
         raise ValueError(
             'stats.wald_test(): theta_mle and theta_covar have to contain the same number of parameters')
+    if len(theta_covar.shape) != 3:
+        raise ValueError('stats.wald_test(): theta_covar should have 3 dimensions but has %i' % len(theta_covar.shape))
     if theta_mle.shape[1] != theta_covar.shape[0]:
         raise ValueError('stats.wald_test(): theta_mle and theta_covar have to contain the same number of genes')
     if theta_covar.shape[1] != theta_covar.shape[2]:
@@ -264,8 +263,6 @@ def wald_test_chisq(
 
     theta_diff = theta_mle - theta0
     # Convert to nd.array to avoid gufunc error.
-    if isinstance(theta_diff, xr.DataArray):
-        theta_diff = theta_diff.values
     wald_statistic = np.array([
         np.matmul(
             np.matmul(
diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py
index 412a15d..94c6f99 100644
--- a/diffxpy/testing/det.py
+++ b/diffxpy/testing/det.py
@@ -3,20 +3,19 @@
 from typing import Union, Dict, Tuple, List, Set
 import pandas as pd
 from random import sample
+import scipy.sparse
 
 import numpy as np
-import xarray as xr
 import patsy
 
-from .utils import split_X, dmat_unique
+from .utils import split_x, dmat_unique
 
 try:
     import anndata
 except ImportError:
     anndata = None
 
-from batchglm.xarray_sparse.base import SparseXArrayDataArray
-from batchglm.models.glm_nb import Model as GeneralizedLinearModel
+from batchglm.models.base import _EstimatorBase, _InputDataBase
 
 from ..stats import stats
 from . import correction
@@ -25,84 +24,6 @@
 logger = logging.getLogger("diffxpy")
 
 
-class _Estimation(GeneralizedLinearModel, metaclass=abc.ABCMeta):
-    """
-    Dummy class specifying all needed methods / parameters necessary for a model
-    fitted for DifferentialExpressionTest.
-    Useful for type hinting.
-    """
-
-    @property
-    @abc.abstractmethod
-    def X(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def design_loc(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def design_scale(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def constraints_loc(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def constraints_scale(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def num_observations(self) -> int:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def num_features(self) -> int:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def features(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def observations(self) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def log_likelihood(self, **kwargs) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def loss(self, **kwargs) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def gradients(self, **kwargs) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def hessians(self, **kwargs) -> np.ndarray:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def fisher_inv(self, **kwargs) -> np.ndarray:
-        pass
-
-
 class _DifferentialExpressionTest(metaclass=abc.ABCMeta):
     """
     Dummy class specifying all needed methods / parameters necessary for DifferentialExpressionTest.
@@ -132,7 +53,7 @@ def gene_ids(self) -> np.ndarray:
 
     @property
     @abc.abstractmethod
-    def X(self):
+    def x(self):
         pass
 
     @abc.abstractmethod
@@ -169,24 +90,22 @@ def _correction(self, method) -> np.ndarray:
 
     def _ave(self):
         """
-        Returns a xr.DataArray containing the mean expression by gene
+        Returns the mean expression by gene.
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
         pass
 
     @property
     def log_likelihood(self):
         if self._log_likelihood is None:
-            self._log_likelihood = self._ll().compute()
+            self._log_likelihood = self._ll()
         return self._log_likelihood
 
     @property
     def mean(self):
         if self._mean is None:
             self._mean = self._ave()
-            if isinstance(self._mean, xr.DataArray):  # Could also be np.ndarray coming out of XArraySparseDataArray
-                self._mean = self._mean.compute()
         return self._mean
 
     @property
@@ -340,10 +259,7 @@ def plot_volcano(
 
         logfc = np.reshape(self.log2_fold_change(), -1)
         # Clipping throws errors if not performed in actual data format (ndarray or DataArray):
-        if isinstance(logfc, xr.DataArray):
-            logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold)
-        else:
-            logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc)
+        logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc)
 
         fig, ax = plt.subplots()
 
@@ -389,6 +305,7 @@ def plot_volcano(
             plt.show()
 
         plt.close(fig)
+        plt.ion()
 
         if return_axs:
             return ax
@@ -450,10 +367,7 @@ def plot_ma(
 
         logfc = np.reshape(self.log2_fold_change(), -1)
         # Clipping throws errors if not performed in actual data format (ndarray or DataArray):
-        if isinstance(logfc, xr.DataArray):
-            logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold)
-        else:
-            logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc)
+        logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc)
 
         fig, ax = plt.subplots()
 
@@ -554,17 +468,17 @@ class DifferentialExpressionTestLRT(_DifferentialExpressionTestSingle):
 
     sample_description: pd.DataFrame
     full_design_loc_info: patsy.design_info
-    full_estim: _Estimation
+    full_estim: _EstimatorBase
     reduced_design_loc_info: patsy.design_info
-    reduced_estim: _Estimation
+    reduced_estim: _EstimatorBase
 
     def __init__(
             self,
             sample_description: pd.DataFrame,
             full_design_loc_info: patsy.design_info,
-            full_estim,
+            full_estim: _EstimatorBase,
             reduced_design_loc_info: patsy.design_info,
-            reduced_estim
+            reduced_estim: _EstimatorBase
     ):
         super().__init__()
         self.sample_description = sample_description
@@ -575,19 +489,19 @@ def __init__(
 
     @property
     def gene_ids(self) -> np.ndarray:
-        return np.asarray(self.full_estim.features)
+        return np.asarray(self.full_estim.input_data.features)
 
     @property
-    def X(self):
-        return self.full_estim.X
+    def x(self):
+        return self.full_estim.x
 
     @property
     def reduced_model_gradient(self):
-        return self.reduced_estim.gradients
+        return self.reduced_estim.jacobian
 
     @property
     def full_model_gradient(self):
-        return self.full_estim.gradients
+        return self.full_estim.jacobian
 
     def _test(self):
         if np.any(self.full_estim.log_likelihood < self.reduced_estim.log_likelihood):
@@ -596,27 +510,29 @@ def _test(self):
         return stats.likelihood_ratio_test(
             ll_full=self.full_estim.log_likelihood,
             ll_reduced=self.reduced_estim.log_likelihood,
-            df_full=self.full_estim.constraints_loc.shape[1] + self.full_estim.constraints_scale.shape[1],
-            df_reduced=self.reduced_estim.constraints_loc.shape[1] + self.reduced_estim.constraints_scale.shape[1],
+            df_full=self.full_estim.input_data.constraints_loc.shape[1] +
+                    self.full_estim.input_data.constraints_scale.shape[1],
+            df_reduced=self.reduced_estim.input_data.constraints_loc.shape[1] +
+                       self.reduced_estim.input_data.constraints_scale.shape[1],
         )
 
     def _ave(self):
         """
-        Returns a xr.DataArray containing the mean expression by gene
+        Returns the mean expression by gene
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
 
-        return np.mean(self.full_estim.X, axis=0)
+        return np.asarray(np.mean(self.full_estim.x, axis=0)).flatten()
 
     def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e):
         """
-        Returns a xr.DataArray containing the locations for the different categories of the factors
+        Returns the locations for the different categories of the factors
 
         :param factors: the factors to select.
             E.g. `condition` or `batch` if formula would be `~ 1 + batch + condition`
         :param base: the log base to use; default is the natural logarithm
-        :return: xr.DataArray
+        :return: np.ndarray
         """
 
         if not (isinstance(factors, list) or isinstance(factors, tuple) or isinstance(factors, set)):
@@ -626,7 +542,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e):
 
         di = self.full_design_loc_info
         sample_description = self.sample_description[[f.name() for f in di.subset(factors).factor_infos]]
-        dmat = self.full_estim.design_loc
+        dmat = self.full_estim.input_data.design_loc
 
         # make rows unique
         dmat, sample_description = dmat_unique(dmat, sample_description)
@@ -645,19 +561,11 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e):
         # make the design matrix + sample description unique again
         dmat, sample_description = dmat_unique(dmat, sample_description)
 
-        locations = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc))
+        locations = self.full_estim.model.inverse_link_loc(np.matmul(dmat, self.full_estim.model.a))
         locations = np.log(locations) / np.log(base)
 
         dist = np.expand_dims(locations, axis=0)
         dist = np.transpose(dist, [1, 0, 2]) - dist
-        dist = xr.DataArray(dist, dims=("minuend", "subtrahend", "gene"))
-        # retval = xr.Dataset({"logFC": retval})
-
-        dist.coords["gene"] = self.gene_ids
-
-        for col in sample_description:
-            dist.coords["minuend_" + col] = (("minuend",), sample_description[col])
-            dist.coords["subtrahend_" + col] = (("subtrahend",), sample_description[col])
 
         # # If this is a pairwise comparison, return only one fold change per gene
         # if dist.shape[:2] == (2, 2):
@@ -695,7 +603,7 @@ def log_fold_change(self, base=np.e, return_type="vector"):
                 return None
             else:
                 dists = self._log_fold_change(factors=factors, base=base)
-                return dists[1, 0].values
+                return dists[1, 0]
         else:
             dists = self._log_fold_change(factors=factors, base=base)
             return dists
@@ -709,12 +617,12 @@ def locations(self):
 
         di = self.full_design_loc_info
         sample_description = self.sample_description[[f.name() for f in di.factor_infos]]
-        dmat = self.full_estim.design_loc
+        dmat = self.full_estim.input_data.design_loc
 
         dmat, sample_description = dmat_unique(dmat, sample_description)
 
-        retval = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc))
-        retval = pd.DataFrame(retval, columns=self.full_estim.features)
+        retval = self.full_estim.model.inverse_link_loc(np.matmul(dmat, self.full_estim.model.a))
+        retval = pd.DataFrame(retval, columns=self.full_estim.input_data.features)
         for col in sample_description:
             retval[col] = sample_description[col]
 
@@ -731,12 +639,12 @@ def scales(self):
 
         di = self.full_design_loc_info
         sample_description = self.sample_description[[f.name() for f in di.factor_infos]]
-        dmat = self.full_estim.design_scale
+        dmat = self.full_estim.input_data.design_scale
 
         dmat, sample_description = dmat_unique(dmat, sample_description)
 
         retval = self.full_estim.inverse_link_scale(dmat.doc(self.full_estim.par_link_scale))
-        retval = pd.DataFrame(retval, columns=self.full_estim.features)
+        retval = pd.DataFrame(retval, columns=self.full_estim.input_data.features)
         for col in sample_description:
             retval[col] = sample_description[col]
 
@@ -781,7 +689,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle):
     Single wald test per gene.
     """
 
-    model_estim: _Estimation
+    model_estim: _EstimatorBase
     sample_description: pd.DataFrame
     coef_loc_totest: np.ndarray
     theta_mle: np.ndarray
@@ -791,7 +699,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle):
 
     def __init__(
             self,
-            model_estim: _Estimation,
+            model_estim: _EstimatorBase,
             col_indices: np.ndarray,
             noise_model: str,
             sample_description: pd.DataFrame
@@ -809,28 +717,32 @@ def __init__(
         self._store_ols = None
 
         try:
-            if model_estim._error_codes is not None:
-                self._error_codes = model_estim._error_codes
+            if self.model_estim.error_codes is not None:
+                self._error_codes = self.model_estim.error_codes
+            else:
+                self._error_codes = None
         except Exception as e:
             self._error_codes = None
 
         try:
-            if model_estim._niter is not None:
-                self._niter = model_estim._niter
+            if self.model_estim.niter is not None:
+                self._niter = self.model_estim.niter
+            else:
+                self._niter = None
         except Exception as e:
             self._niter = None
 
     @property
     def gene_ids(self) -> np.ndarray:
-        return np.asarray(self.model_estim.features)
+        return np.asarray(self.model_estim.input_data.features)
 
     @property
-    def X(self):
-        return self.model_estim.X
+    def x(self):
+        return self.model_estim.x
 
     @property
     def model_gradient(self):
-        return self.model_estim.gradients
+        return self.model_estim.jacobian
 
     def log_fold_change(self, base=np.e, **kwargs):
         """
@@ -854,35 +766,34 @@ def log_fold_change(self, base=np.e, **kwargs):
 
     def _ll(self):
         """
-        Returns a xr.DataArray containing the log likelihood of each gene
+        Returns the log likelihood of each gene
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
         return self.model_estim.log_likelihood
 
     def _ave(self):
         """
-        Returns a xr.DataArray containing the mean expression by gene
+        Returns the mean expression by gene
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
-        return self.X.mean(axis=0)
+        return np.asarray(self.x.mean(axis=0)).flatten()
 
     def _test(self):
         """
-        Returns a xr.DataArray containing the p-value for differential expression for each gene
+        Returns the p-value for differential expression for each gene
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
         # Check whether single- or multiple parameters are tested.
         # For a single parameter, the wald statistic distribution is approximated
         # with a normal distribution, for multiple parameters, a chi-square distribution is used.
         self.theta_mle = self.model_estim.a_var[self.coef_loc_totest]
         if len(self.coef_loc_totest) == 1:
-            self.theta_mle = self.theta_mle[0]  # Make xarray one dimensional for stats.wald_test.
-            self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]].values
-            self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd,
-                                         where=self.theta_sd < np.nextafter(0, np.inf))
+            self.theta_mle = self.theta_mle[0]
+            self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]]
+            self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf))
             self.theta_sd = np.sqrt(self.theta_sd)
             return stats.wald_test(
                 theta_mle=self.theta_mle,
@@ -891,12 +802,11 @@ def _test(self):
             )
         else:
             self.theta_sd = np.diagonal(self.model_estim.fisher_inv, axis1=-2, axis2=-1).copy()
-            self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd,
-                                         where=self.theta_sd < np.nextafter(0, np.inf))
+            self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf))
             self.theta_sd = np.sqrt(self.theta_sd)
             return stats.wald_test_chisq(
                 theta_mle=self.theta_mle,
-                theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, self.coef_loc_totest],
+                theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, :][:, :, self.coef_loc_totest],
                 theta0=0
             )
 
@@ -944,12 +854,19 @@ def summary(
     def plot_vs_ttest(
             self,
             log10=False,
+            show: bool = True,
+            save: Union[str, None] = None,
+            suffix: str = "_plot_vs_ttest.png",
             return_axs: bool = False
     ):
         """
         Normalizes data by size factors if any were used in model.
 
         :param log10:
+        :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot.
+        :param save: Path+file name stem to save plots to.
+            File will be save+suffix. Does not save if save is None.
+        :param suffix: Suffix for file name to save plot to. Also use this to set the file type.
         :param return_axs: Whether to return axis objects.
 
         :return:
@@ -958,12 +875,17 @@ def plot_vs_ttest(
         import seaborn as sns
         from .tests import t_test
 
-        grouping = np.asarray(self.model_estim.design_loc[:, self.coef_loc_totest])
+        plt.ioff()
+
+        grouping = np.asarray(self.model_estim.input_data.design_loc[:, self.coef_loc_totest])
         # Normalize by size factors that were used in regression.
-        sf = np.broadcast_to(np.expand_dims(self.model_estim.size_factors, axis=1),
-                             shape=self.model_estim.X.shape)
+        if self.model_estim.input_data.size_factors is not None:
+            sf = np.broadcast_to(np.expand_dims(self.model_estim.input_data.size_factors, axis=1),
+                                 shape=self.model_estim.x.shape)
+        else:
+            sf = np.ones(shape=(self.model_estim.x.shape[0], 1))
         ttest = t_test(
-            data=self.model_estim.X.multiply(1 / sf, copy=True),
+            data=self.model_estim.x / sf,
             grouping=grouping,
             gene_names=self.gene_ids,
         )
@@ -980,6 +902,16 @@ def plot_vs_ttest(
 
         ax.set(xlabel="t-test", ylabel='wald test')
 
+        # Save, show and return figure.
+        if save is not None:
+            plt.savefig(save + suffix)
+
+        if show:
+            plt.show()
+
+        plt.close(fig)
+        plt.ion()
+
         if return_axs:
             return ax
         else:
@@ -1019,11 +951,13 @@ def plot_comparison_ols_coef(
         import matplotlib.pyplot as plt
         from matplotlib import gridspec
         from matplotlib import rcParams
-        from batchglm.api.models.glm_norm import Estimator, InputData
+        from batchglm.api.models.glm_norm import Estimator, InputDataGLM
+
+        plt.ioff()
 
         # Run OLS model fit to have comparison coefficients.
         if self._store_ols is None:
-            input_data_ols = InputData.new(
+            input_data_ols = InputDataGLM(
                 data=self.model_estim.input_data.data,
                 design_loc=self.model_estim.input_data.design_loc,
                 design_scale=self.model_estim.input_data.design_scale[:, [0]],
@@ -1048,10 +982,10 @@ def plot_comparison_ols_coef(
         # Prepare parameter summary of both model fits.
         par_loc = self.model_estim.input_data.data.coords["design_loc_params"].values
 
-        a_var_ols = store_ols.a_var.values
+        a_var_ols = store_ols.a_var
         a_var_ols[1:, :] = (a_var_ols[1:, :] + a_var_ols[[0], :]) / a_var_ols[[0], :]
 
-        a_var_user = self.model_estim.a_var.values
+        a_var_user = self.model_estim.a_var
         # Translate coefficients from both fits to be multiplicative in identity space.
         if self.noise_model == "nb":
             a_var_user = np.exp(a_var_user)  # self.model_estim.inverse_link_loc(a_var_user)
@@ -1156,11 +1090,13 @@ def plot_comparison_ols_pred(
         import matplotlib.pyplot as plt
         from matplotlib import gridspec
         from matplotlib import rcParams
-        from batchglm.api.models.glm_norm import Estimator, InputData
+        from batchglm.api.models.glm_norm import Estimator, InputDataGLM
+
+        plt.ioff()
 
         # Run OLS model fit to have comparison coefficients.
         if self._store_ols is None:
-            input_data_ols = InputData.new(
+            input_data_ols = InputDataGLM(
                 data=self.model_estim.input_data.data,
                 design_loc=self.model_estim.input_data.design_loc,
                 design_scale=self.model_estim.input_data.design_scale[:, [0]],
@@ -1203,19 +1139,16 @@ def plot_comparison_ols_pred(
 
         pred_n_cells = sample(
             population=list(np.arange(0, self.model_estim.X.shape[0])),
-            k=np.min([20, self.model_estim.design_loc.shape[0]])
+            k=np.min([20, self.model_estim.input_data.design_loc.shape[0]])
         )
 
-        if isinstance(self.model_estim.X, SparseXArrayDataArray):
-            x = np.asarray(self.model_estim.X.X[pred_n_cells, :].todense()).flatten()
-        else:
-            x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten()
+        x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten()
 
-        y_user = self.model_estim.inverse_link_loc(
-            np.matmul(self.model_estim.design_loc[pred_n_cells, :].values, self.model_estim.a_var.values).flatten()
+        y_user = self.model_estim.model.inverse_link_loc(
+            np.matmul(self.model_estim.input_data.design_loc[pred_n_cells, :], self.model_estim.a_var).flatten()
         )
         y_ols = store_ols.inverse_link_loc(
-            np.matmul(store_ols.design_loc[pred_n_cells, :].values, store_ols.a_var.values).flatten()
+            np.matmul(store_ols.design_loc[pred_n_cells, :], store_ols.a_var).flatten()
         )
         if log1p_transform:
             x = np.log(x+1)
@@ -1309,15 +1242,15 @@ def _assemble_gene_fits(
         :param log1p_transform: Whether to log transform observations
             before estimating the distribution with boxplot. Model estimates are adjusted accordingly.
         :param incl_fits: Whether to include fits in plot.
-        :return summaries_genes: List with data frame for seabron in it.
+        :return summaries_genes: List with data frame for seaborn in it.
         """
 
         summaries_genes = []
         for i, g in enumerate(gene_names):
-            assert g in self.model_estim.features, "gene %g not found" % g
-            g_idx = self.model_estim.features.tolist().index(g)
+            assert g in self.model_estim.input_data.features, "gene %g not found" % g
+            g_idx = self.model_estim.input_data.features.index(g)
             # Raw data for boxplot:
-            y = self.model_estim.X[:, g_idx]
+            y = self.model_estim.x[:, g_idx]
             # Model fits:
             loc = self.model_estim.location[:, g_idx]
             scale = self.model_estim.scale[:, g_idx]
@@ -1613,16 +1546,20 @@ def __init__(
             is_sig_zerovar: bool = True
     ):
         super().__init__()
-        self._X = data
+        if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
+            data = data.X
+        elif isinstance(data, _InputDataBase):
+            data = data.x
+        self._x = data
         self.sample_description = sample_description
         self.grouping = grouping
         self._gene_names = np.asarray(gene_names)
 
-        x0, x1 = split_X(data, grouping)
+        x0, x1 = split_x(data, grouping)
 
         # Only compute p-values for genes with non-zero observations and non-zero group-wise variance.
-        mean_x0 = x0.mean(axis=0).astype(dtype=np.float)
-        mean_x1 = x1.mean(axis=0).astype(dtype=np.float)
+        mean_x0 = np.asarray(np.mean(x0, axis=0)).flatten().astype(dtype=np.float)
+        mean_x1 = np.asarray(np.mean(x1, axis=0)).flatten().astype(dtype=np.float)
         # Avoid unnecessary mean computation:
         self._mean = np.average(
             a=np.vstack([mean_x0, mean_x1]),
@@ -1632,8 +1569,13 @@ def __init__(
             returned=False
         )
         self._ave_nonzero = self._mean != 0  # omit all-zero features
-        var_x0 = np.asarray(x0.var(axis=0)).flatten().astype(dtype=np.float)
-        var_x1 = np.asarray(x1.var(axis=0)).flatten().astype(dtype=np.float)
+        if isinstance(x0, scipy.sparse.csr_matrix):
+            # Efficient analytic expression of variance without densification.
+            var_x0 = np.asarray(np.mean(x0.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x0)
+            var_x1 = np.asarray(np.mean(x1.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x1)
+        else:
+            var_x0 = np.asarray(np.var(x0, axis=0)).flatten().astype(dtype=np.float)
+            var_x1 = np.asarray(np.var(x1, axis=0)).flatten().astype(dtype=np.float)
         self._var_geq_zero = np.logical_or(
             var_x0 > 0,
             var_x1 > 0
@@ -1652,6 +1594,10 @@ def __init__(
             np.logical_and(mean_x0 == mean_x1, self._mean > 0),
             np.logical_not(self._var_geq_zero)
         ))[0]] = 1.0
+        # Depening on user choice via is_sig_zerovar:
+        # Set p-value to 0 if LFC was non-zero and variances are zero,
+        # this causes division by zero in the test statistic. This
+        # is a highly significant result if one believes the variance estimate.
         if is_sig_zerovar:
             pval[np.where(np.logical_and(
                 mean_x0 != mean_x1,
@@ -1664,36 +1610,14 @@ def __init__(
             self._logfc = mean_x1 - mean_x0
         else:
             self._logfc = np.log(mean_x1) - np.log(mean_x0)
-        # Return 0 if LFC was non-zero and variances are zero,
-        # this causes division by zero in the test statistic. This
-        # is a highly significant result if one believes the variance estimate.
-        # This is the default which can be changed and can be changed
-        # via DIFFXPY_TREAT_ZEROVAR_TT_AS_SIG.
-        pval[np.where(np.logical_and(np.logical_and(
-            np.logical_not(self._var_geq_zero),
-            self._ave_nonzero),
-            np.abs(self._logfc) < np.nextafter(0, 1)
-        ))] = 0
-        if pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG:
-            pval[np.where(np.logical_and(np.logical_and(
-                np.logical_not(self._var_geq_zero),
-                self._ave_nonzero),
-                np.abs(self._logfc) >= np.nextafter(0, 1)
-            ))] = 1
-        else:
-            pval[np.where(np.logical_and(np.logical_and(
-                np.logical_not(self._var_geq_zero),
-                self._ave_nonzero),
-                np.abs(self._logfc) >= np.nextafter(0, 1)
-            ))] = 0
 
     @property
     def gene_ids(self) -> np.ndarray:
         return self._gene_names
 
     @property
-    def X(self):
-        return self._X
+    def x(self):
+        return self._x
 
     def log_fold_change(self, base=np.e, **kwargs):
         """
@@ -1747,15 +1671,19 @@ def __init__(
             is_sig_zerovar: bool = True
     ):
         super().__init__()
-        self._X = data
+        if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
+            data = data.X
+        elif isinstance(data, _InputDataBase):
+            data = data.x
+        self._x = data
         self.sample_description = sample_description
         self.grouping = grouping
         self._gene_names = np.asarray(gene_names)
 
-        x0, x1 = split_X(data, grouping)
+        x0, x1 = split_x(data, grouping)
 
-        mean_x0 = x0.mean(axis=0).astype(dtype=np.float)
-        mean_x1 = x1.mean(axis=0).astype(dtype=np.float)
+        mean_x0 = np.asarray(np.mean(x0, axis=0)).flatten().astype(dtype=np.float)
+        mean_x1 = np.asarray(np.mean(x1, axis=0)).flatten().astype(dtype=np.float)
         # Avoid unnecessary mean computation:
         self._mean = np.average(
             a=np.vstack([mean_x0, mean_x1]),
@@ -1764,30 +1692,33 @@ def __init__(
             axis=0,
             returned=False
         )
-        var_x0 = np.asarray(x0.var(axis=0)).flatten().astype(dtype=np.float)
-        var_x1 = np.asarray(x1.var(axis=0)).flatten().astype(dtype=np.float)
+        if isinstance(x0, scipy.sparse.csr_matrix):
+            # Efficient analytic expression of variance without densification.
+            var_x0 = np.asarray(np.mean(x0.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x0)
+            var_x1 = np.asarray(np.mean(x1.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x1)
+        else:
+            var_x0 = np.asarray(np.var(x0, axis=0)).flatten().astype(dtype=np.float)
+            var_x1 = np.asarray(np.var(x1, axis=0)).flatten().astype(dtype=np.float)
         self._var_geq_zero = np.logical_or(
             var_x0 > 0,
             var_x1 > 0
         )
         idx_run = np.where(np.logical_and(self._mean != 0, self._var_geq_zero))[0]
 
-        # TODO: can this be done on sparse?
+        # TODO: can this be done directly on sparse?
         pval = np.zeros([data.shape[1]]) + np.nan
-        if isinstance(x0, xr.DataArray):
-            pval[idx_run] = stats.mann_whitney_u_test(
-                x0=x0.data[:, idx_run],
-                x1=x1.data[:, idx_run]
-            )
-        else:
-            pval[idx_run] = stats.mann_whitney_u_test(
-                x0=x0.X[:, idx_run].toarray(),
-                x1=x1.X[:, idx_run].toarray()
-            )
+        pval[idx_run] = stats.mann_whitney_u_test(
+            x0=x0[:, idx_run],
+            x1=x1[:, idx_run]
+        )
         pval[np.where(np.logical_and(
             np.logical_and(mean_x0 == mean_x1, self._mean > 0),
             np.logical_not(self._var_geq_zero)
         ))[0]] = 1.0
+        # Depening on user choice via is_sig_zerovar:
+        # Set p-value to 0 if LFC was non-zero and variances are zero,
+        # this causes division by zero in the test statistic. This
+        # is a highly significant result if one believes the variance estimate.
         if is_sig_zerovar:
             pval[np.where(np.logical_and(
                 mean_x0 != mean_x1,
@@ -1806,8 +1737,8 @@ def gene_ids(self) -> np.ndarray:
         return self._gene_names
 
     @property
-    def X(self):
-        return self._X
+    def x(self):
+        return self._x
 
     def log_fold_change(self, base=np.e, **kwargs):
         """
@@ -1855,7 +1786,7 @@ def plot_vs_ttest(self, log10=False):
 
         grouping = self.grouping
         ttest = t_test(
-            data=self.X,
+            data=self.x,
             grouping=grouping,
             gene_names=self.gene_ids,
         )
@@ -1967,7 +1898,7 @@ def gene_ids(self) -> np.ndarray:
         return self._gene_ids
 
     @property
-    def X(self):
+    def x(self):
         return None
 
     @property
@@ -2159,11 +2090,17 @@ class DifferentialExpressionTestZTest(_DifferentialExpressionTestMulti):
     Pairwise unit_test between more than 2 groups per gene.
     """
 
-    model_estim: _Estimation
+    model_estim: _EstimatorBase
     theta_mle: np.ndarray
     theta_sd: np.ndarray
 
-    def __init__(self, model_estim: _Estimation, grouping, groups, correction_type: str):
+    def __init__(
+            self,
+            model_estim: _EstimatorBase,
+            grouping,
+            groups,
+            correction_type: str
+    ):
         super().__init__(correction_type=correction_type)
         self.model_estim = model_estim
         self.grouping = grouping
@@ -2182,7 +2119,7 @@ def __init__(self, model_estim: _Estimation, grouping, groups, correction_type:
 
     def _test(self, **kwargs):
         groups = self.groups
-        num_features = self.model_estim.X.shape[1]
+        num_features = self.model_estim.x.shape[1]
 
         pvals = np.tile(np.NaN, [len(groups), len(groups), num_features])
         pvals[np.eye(pvals.shape[0]).astype(bool)] = 1
@@ -2202,28 +2139,28 @@ def _test(self, **kwargs):
 
     @property
     def gene_ids(self) -> np.ndarray:
-        return np.asarray(self.model_estim.features)
+        return np.asarray(self.model_estim.input_data.features)
 
     @property
-    def X(self):
-        return self.model_estim.X
+    def x(self):
+        return self.model_estim.x
 
     @property
     def log_likelihood(self):
-        return np.sum(self.model_estim.log_probs(), axis=0)
+        return self.model_estim.log_likelihood
 
     @property
     def model_gradient(self):
-        return self.model_estim.gradients
+        return self.model_estim.jacobian
 
     def _ave(self):
         """
-        Returns a xr.DataArray containing the mean expression by gene
+        Returns the mean expression by gene
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
 
-        return np.mean(self.model_estim.X, axis=0)
+        return np.mean(self.model_estim.x, axis=0)
 
     def log_fold_change(self, base=np.e, **kwargs):
         """
@@ -2231,7 +2168,7 @@ def log_fold_change(self, base=np.e, **kwargs):
         """
         if self._logfc is None:
             groups = self.groups
-            num_features = self.model_estim.X.shape[1]
+            num_features = self.model_estim.x.shape[1]
 
             logfc = np.tile(np.NaN, [len(groups), len(groups), num_features])
             logfc[np.eye(logfc.shape[0]).astype(bool)] = 0
@@ -2356,11 +2293,16 @@ class DifferentialExpressionTestZTestLazy(_DifferentialExpressionTestMulti):
     memory.
     """
 
-    model_estim: _Estimation
+    model_estim: _EstimatorBase
     _theta_mle: np.ndarray
     _theta_sd: np.ndarray
 
-    def __init__(self, model_estim: _Estimation, grouping, groups, correction_type="global"):
+    def __init__(
+            self,
+            model_estim: _EstimatorBase,
+            grouping, groups,
+            correction_type="global"
+    ):
         super().__init__(correction_type=correction_type)
         self.model_estim = model_estim
         self.grouping = grouping
@@ -2370,7 +2312,7 @@ def __init__(self, model_estim: _Estimation, grouping, groups, correction_type="
             self.groups = groups.tolist()
 
         # values of parameter estimates: coefficients x genes array with one coefficient per group
-        self._theta_mle = model_estim.par_link_loc
+        self._theta_mle = model_estim.a_var
         # standard deviation of estimates: coefficients x genes array with one coefficient per group
         # theta_sd = sqrt(diagonal(fisher_inv))
         self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T
@@ -2410,7 +2352,7 @@ def _test(self, **kwargs):
         pass
 
     def _test_pairs(self, groups0, groups1):
-        num_features = self.model_estim.X.shape[1]
+        num_features = self.model_estim.x.shape[1]
 
         pvals = np.tile(np.NaN, [len(groups0), len(groups1), num_features])
 
@@ -2430,28 +2372,28 @@ def _test_pairs(self, groups0, groups1):
 
     @property
     def gene_ids(self) -> np.ndarray:
-        return np.asarray(self.model_estim.features)
+        return np.asarray(self.model_estim.input_data.features)
 
     @property
-    def X(self):
-        return self.model_estim.X
+    def x(self):
+        return self.model_estim.x
 
     @property
     def log_likelihood(self):
-        return np.sum(self.model_estim.log_probs(), axis=0)
+        return self.model_estim.log_likelihood
 
     @property
     def model_gradient(self):
-        return self.model_estim.gradients
+        return self.model_estim.jacobian
 
     def _ave(self):
         """
-        Returns a xr.DataArray containing the mean expression by gene
+        Returns the mean expression by gene
 
-        :return: xr.DataArray
+        :return: np.ndarray
         """
 
-        return np.mean(self.model_estim.X, axis=0)
+        return np.mean(self.model_estim.x, axis=0)
 
     @property
     def pval(self, **kwargs):
@@ -2459,7 +2401,8 @@ def pval(self, **kwargs):
         This function is not available in lazy results evaluation as it would
         require all pairwise tests to be performed.
         """
-        pass
+        raise ValueError("This function is not available in lazy results evaluation as it would "
+                         "require all pairwise tests to be performed.")
 
     @property
     def qval(self, **kwargs):
@@ -2467,7 +2410,8 @@ def qval(self, **kwargs):
         This function is not available in lazy results evaluation as it would
         require all pairwise tests to be performed.
         """
-        pass
+        raise ValueError("This function is not available in lazy results evaluation as it would "
+                         "require all pairwise tests to be performed.")
 
     def log_fold_change(self, base=np.e, **kwargs):
         """
@@ -2499,9 +2443,9 @@ def summary(
         pass
 
     def _check_groups(self, groups0, groups1):
-        if isinstance(groups0, list)==False:
+        if not isinstance(groups0, list):
             groups0 = [groups0]
-        if isinstance(groups1, list)==False:
+        if not isinstance(groups1, list):
             groups1 = [groups1]
         for g in groups0:
             if g not in self.groups:
@@ -2587,7 +2531,7 @@ def log_fold_change_pairs(self, groups0=None, groups1=None, base=np.e):
         logfc = np.zeros(shape=(len(groups0), len(groups1), num_features))
         for i, g0 in enumerate(groups0):
             for j, g1 in enumerate(groups1):
-                logfc[i, j, :] = self._theta_mle[g0, :].values - self._theta_mle[g1, :].values
+                logfc[i, j, :] = self._theta_mle[g0, :] - self._theta_mle[g1, :]
 
         if base == np.e:
             return logfc
@@ -2718,7 +2662,16 @@ class DifferentialExpressionTestVsRest(_DifferentialExpressionTestMulti):
     Tests between between each group and the rest for more than 2 groups per gene.
     """
 
-    def __init__(self, gene_ids, pval, logfc, ave, groups, tests, correction_type: str):
+    def __init__(
+            self,
+            gene_ids,
+            pval,
+            logfc,
+            ave,
+            groups,
+            tests,
+            correction_type: str
+    ):
         super().__init__(correction_type=correction_type)
         self._gene_ids = np.asarray(gene_ids)
         self._pval = pval
@@ -2744,7 +2697,7 @@ def gene_ids(self) -> np.ndarray:
         return self._gene_ids
 
     @property
-    def X(self) -> Union[np.ndarray, None]:
+    def x(self) -> Union[np.ndarray, None]:
         return None
 
     def log_fold_change(self, base=np.e, **kwargs):
@@ -2853,7 +2806,7 @@ def gene_ids(self) -> np.ndarray:
         return self._gene_ids
 
     @property
-    def X(self) -> np.ndarray:
+    def x(self) -> np.ndarray:
         return None
 
     def log_fold_change(self, base=np.e, **kwargs):
@@ -2903,7 +2856,7 @@ def summary(self, qval_thres=None, fc_upper_thres=None,
 
 class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle):
     _de_test: _DifferentialExpressionTestSingle
-    _model_estim: _Estimation
+    _model_estim: _EstimatorBase
     _size_factors: np.ndarray
     _continuous_coords: np.ndarray
     _spline_coefs: list
@@ -2911,7 +2864,7 @@ class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle):
     def __init__(
             self,
             de_test: _DifferentialExpressionTestSingle,
-            model_estim: _Estimation,
+            model_estim: _EstimatorBase,
             size_factors: np.ndarray,
             continuous_coords: np.ndarray,
             spline_coefs: list,
@@ -2929,8 +2882,8 @@ def gene_ids(self) -> np.ndarray:
         return self._de_test.gene_ids
 
     @property
-    def X(self):
-        return self._de_test.X
+    def x(self):
+        return self._de_test.x
 
     @property
     def pval(self) -> np.ndarray:
@@ -2948,8 +2901,14 @@ def mean(self) -> np.ndarray:
     def log_likelihood(self) -> np.ndarray:
         return self._de_test.log_likelihood
 
-    def summary(self, nonnumeric=False, qval_thres=None, fc_upper_thres=None,
-                fc_lower_thres=None, mean_thres=None) -> pd.DataFrame:
+    def summary(
+            self,
+            nonnumeric=False,
+            qval_thres=None,
+            fc_upper_thres=None,
+            fc_lower_thres=None,
+            mean_thres=None
+    ) -> pd.DataFrame:
         """
         Summarize differential expression results into an output table.
 
@@ -2987,7 +2946,7 @@ def log_fold_change(self, base=np.e, genes=None, nonnumeric=False):
         :return: Log-fold change of fitted expression value by gene.
         """
         if genes is None:
-            genes = np.asarray(range(self.X.shape[1]))
+            genes = np.asarray(range(self.x.shape[1]))
         else:
             genes = self._idx_genes(genes)
 
@@ -3016,7 +2975,7 @@ def _filter_genes_int(self, genes: list):
         :param genes: List of genes to filter.
         :return: Filtered list of genes
         """
-        genes_found = np.array([x < self.X.shape[1] for x in genes])
+        genes_found = np.array([x < self.x.shape[1] for x in genes])
         if any(genes_found == False):
             logger.info("did not find some genes, omitting")
             genes = genes[genes_found]
@@ -3062,13 +3021,13 @@ def _continuous_model(self, idx, non_numeric=False):
         """
         idx = np.asarray(idx)
         if non_numeric:
-            mu = np.matmul(self._model_estim.design_loc.values,
+            mu = np.matmul(self._model_estim.input_data.design_loc,
                            self._model_estim.par_link_loc[:, idx])
             if self._size_factors is not None:
                 mu = mu + self._size_factors
         else:
             idx_basis = self._spline_par_loc_idx(intercept=True)
-            mu = np.matmul(self._model_estim.design_loc[:, idx_basis].values,
+            mu = np.matmul(self._model_estim.input_data.design_loc[:, idx_basis],
                            self._model_estim.par_link_loc[idx_basis, idx])
 
         mu = np.exp(mu)
@@ -3189,7 +3148,7 @@ def plot_genes(
             ax = plt.subplot(gs[i])
             axs.append(ax)
 
-            y = self.X[:, g]
+            y = self.x[:, g]
             yhat = self._continuous_model(idx=g, non_numeric=non_numeric)
             if log:
                 y = np.log(y + 1)
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
index cba2037..0a87d2f 100644
--- a/diffxpy/testing/tests.py
+++ b/diffxpy/testing/tests.py
@@ -1,20 +1,17 @@
-from typing import Union, List, Dict, Callable, Tuple
-
 import anndata
+try:
+    from anndata.base import Raw
+except ImportError:
+    from anndata import Raw
 import logging
 import numpy as np
 import pandas as pd
 import patsy
 import scipy.sparse
-import xarray as xr
-
-try:
-    from anndata.base import Raw
-except ImportError:
-    from anndata import Raw
+from typing import Union, List, Dict, Callable, Tuple
 
 from batchglm import data as data_utils
-from batchglm.xarray_sparse import SparseXArrayDataSet
+from batchglm.models.base import _EstimatorBase, _InputDataBase
 from diffxpy import pkg_constants
 from diffxpy.models.batch_bfgs.optim import Estim_BFGS
 from .det import DifferentialExpressionTestLRT, DifferentialExpressionTestWald, \
@@ -22,12 +19,9 @@
     DifferentialExpressionTestZTestLazy, DifferentialExpressionTestZTest, DifferentialExpressionTestPairwise, \
     DifferentialExpressionTestVsRest, _DifferentialExpressionTestMulti, DifferentialExpressionTestByPartition, \
     DifferentialExpressionTestWaldCont, DifferentialExpressionTestLRTCont
-from .utils import parse_gene_names, parse_data, parse_sample_description, parse_size_factors, parse_grouping, \
+from .utils import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \
     constraint_system_from_star
 
-# Use this to suppress matrix subclass PendingDepreceationWarnings from numpy:
-np.warnings.filterwarnings("ignore")
-
 
 def _fit(
         noise_model,
@@ -46,7 +40,7 @@ def _fit(
         quick_scale: bool = None,
         close_session=True,
         dtype="float64"
-):
+) -> _EstimatorBase:
     """
     :param noise_model: str, noise model to use in model-based unit_test. Possible options:
 
@@ -133,6 +127,7 @@ def _fit(
     }
 
     if isinstance(training_strategy, str) and training_strategy.lower() == 'bfgs':
+        assert False, "depreceated"
         lib_size = np.zeros(data.shape[0])
         if noise_model == "nb" or noise_model == "negative_binomial":
             estim = Estim_BFGS(X=data, design_loc=design_loc, design_scale=design_scale,
@@ -143,15 +138,13 @@ def _fit(
             raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model)
     else:
         if noise_model == "nb" or noise_model == "negative_binomial":
-            from batchglm.api.models.glm_nb import Estimator, InputData
+            from batchglm.api.models.glm_nb import Estimator, InputDataGLM
         elif noise_model == "norm" or noise_model == "normal":
-            from batchglm.api.models.glm_norm import Estimator, InputData
+            from batchglm.api.models.glm_norm import Estimator, InputDataGLM
         else:
             raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model)
 
-        logging.getLogger("diffxpy").info("Fitting model...")
-        logging.getLogger("diffxpy").debug(" * Assembling input data...")
-        input_data = InputData.new(
+        input_data = InputDataGLM(
             data=data,
             design_loc=design_loc,
             design_scale=design_scale,
@@ -161,7 +154,6 @@ def _fit(
             feature_names=gene_names,
         )
 
-        logging.getLogger("diffxpy").debug(" * Set up Estimator...")
         constructor_args = {}
         if batch_size is not None:
             constructor_args["batch_size"] = batch_size
@@ -179,12 +171,9 @@ def _fit(
             dtype=dtype,
             **constructor_args
         )
-
-        logging.getLogger("diffxpy").debug(" * Initializing Estimator...")
         estim.initialize()
 
-        logging.getLogger("diffxpy").debug(" * Run estimation...")
-        # training:
+        # Training:
         if callable(training_strategy):
             # call training_strategy if it is a function
             training_strategy(estim)
@@ -192,17 +181,13 @@ def _fit(
             estim.train_sequence(training_strategy=training_strategy)
 
         if close_session:
-            logging.getLogger("diffxpy").debug(" * Finalize estimation...")
-            model = estim.finalize()
-        else:
-            model = estim
-        logging.getLogger("diffxpy").debug(" * Model fitting done.")
+            estim.finalize()
 
-    return model
+    return estim
 
 
 def lrt(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         full_formula_loc: str,
         reduced_formula_loc: str,
         full_formula_scale: str = "~1",
@@ -215,7 +200,7 @@ def lrt(
         noise_model="nb",
         size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray] = None,
         batch_size: int = None,
-        training_strategy: Union[str, List[Dict[str, object]], Callable] = "DEFAULT",
+        training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
         quick_scale: bool = False,
         dtype="float64",
         **kwargs
@@ -306,11 +291,10 @@ def lrt(
         as_numeric = [as_numeric]
 
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
     sample_description = parse_sample_description(data, sample_description)
     size_factors = parse_size_factors(
         size_factors=size_factors,
-        data=X,
+        data=data,
         sample_description=sample_description
     )
 
@@ -341,7 +325,7 @@ def lrt(
 
     reduced_model = _fit(
         noise_model=noise_model,
-        data=X,
+        data=data,
         design_loc=reduced_design_loc,
         design_scale=reduced_design_scale,
         constraints_loc=None,
@@ -358,7 +342,7 @@ def lrt(
     )
     full_model = _fit(
         noise_model=noise_model,
-        data=X,
+        data=data,
         design_loc=full_design_loc,
         design_scale=full_design_scale,
         constraints_loc=None,
@@ -387,7 +371,7 @@ def lrt(
 
 
 def wald(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         factor_loc_totest: Union[str, List[str]] = None,
         coef_to_test: Union[str, List[str]] = None,
         formula_loc: Union[None, str] = None,
@@ -397,8 +381,8 @@ def wald(
         init_b: Union[np.ndarray, str] = "AUTO",
         gene_names: Union[np.ndarray, list] = None,
         sample_description: Union[None, pd.DataFrame] = None,
-        dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
-        dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None,
+        dmat_loc: Union[patsy.design_info.DesignMatrix] = None,
+        dmat_scale: Union[patsy.design_info.DesignMatrix] = None,
         constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
         constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None,
         noise_model: str = "nb",
@@ -566,12 +550,11 @@ def wald(
 
     # # Parse input data formats:
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
     if dmat_loc is None and dmat_scale is None:
         sample_description = parse_sample_description(data, sample_description)
     size_factors = parse_size_factors(
         size_factors=size_factors,
-        data=X,
+        data=data,
         sample_description=sample_description
     )
 
@@ -582,7 +565,6 @@ def wald(
         formula=formula_loc,
         as_numeric=as_numeric,
         constraints=constraints_loc,
-        dims=["design_loc_params", "loc_params"],
         return_type="patsy"
     )
     logging.getLogger("diffxpy").debug("building scale model")
@@ -592,7 +574,6 @@ def wald(
         formula=formula_scale,
         as_numeric=as_numeric,
         constraints=constraints_scale,
-        dims=["design_scale_params", "scale_params"],
         return_type="patsy"
     )
 
@@ -637,7 +618,7 @@ def wald(
 
     model = _fit(
         noise_model=noise_model,
-        data=X,
+        data=data,
         design_loc=design_loc,
         design_scale=design_scale,
         constraints_loc=constraints_loc,
@@ -664,19 +645,18 @@ def wald(
 
 
 def t_test(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         grouping,
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
         is_logged: bool = False,
-        is_sig_zerovar: bool = True,
-        dtype="float64"
+        is_sig_zerovar: bool = True
 ):
     """
     Perform Welch's t-test for differential expression
     between two groups on adata object for each gene.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like, or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param grouping: str, array
 
@@ -692,13 +672,10 @@ def t_test(
         the p-value is set to np.nan.
     """
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
-    if isinstance(X, SparseXArrayDataSet):
-        X = X.X
     grouping = parse_grouping(data, sample_description, grouping)
 
     de_test = DifferentialExpressionTestTT(
-        data=X.astype(dtype),
+        data=data,
         sample_description=sample_description,
         grouping=grouping,
         gene_names=gene_names,
@@ -710,19 +687,18 @@ def t_test(
 
 
 def rank_test(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         grouping: Union[str, np.ndarray, list],
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
         is_logged: bool = False,
-        is_sig_zerovar: bool = True,
-        dtype="float64"
+        is_sig_zerovar: bool = True
 ):
     """
     Perform Mann-Whitney rank test (Wilcoxon rank-sum test) for differential expression
     between two groups on adata object for each gene.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like, or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param grouping: str, array
 
@@ -738,13 +714,10 @@ def rank_test(
         the p-value is set to np.nan.
     """
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
-    if isinstance(X, SparseXArrayDataSet):
-        X = X.X
     grouping = parse_grouping(data, sample_description, grouping)
 
     de_test = DifferentialExpressionTestRank(
-        data=X.astype(dtype),
+        data=data,
         sample_description=sample_description,
         grouping=grouping,
         gene_names=gene_names,
@@ -756,7 +729,7 @@ def rank_test(
 
 
 def two_sample(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
         test: str = "t-test",
@@ -802,7 +775,7 @@ def two_sample(
         Doesn't require fitting of generalized linear models.
         Wilcoxon rank sum (Mann-Whitney U) test between both observation groups.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like, or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param grouping: str, array
 
@@ -851,7 +824,6 @@ def two_sample(
                          'The t-test is based on a gaussian noise model and wilcoxon is model free.')
 
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
     grouping = parse_grouping(data, sample_description, grouping)
     sample_description = pd.DataFrame({"grouping": grouping})
 
@@ -867,7 +839,7 @@ def two_sample(
         formula_loc = '~ 1 + grouping'
         formula_scale = '~ 1 + grouping'
         de_test = wald(
-            data=X,
+            data=data,
             factor_loc_totest="grouping",
             as_numeric=as_numeric,
             coef_to_test=None,
@@ -891,7 +863,7 @@ def two_sample(
         reduced_formula_loc = '~ 1'
         reduced_formula_scale = '~ 1'
         de_test = lrt(
-            data=X,
+            data=data,
             full_formula_loc=full_formula_loc,
             reduced_formula_loc=reduced_formula_loc,
             full_formula_scale=full_formula_scale,
@@ -909,7 +881,7 @@ def two_sample(
         )
     elif test.lower() == 't-test' or test.lower() == "t_test" or test.lower() == "ttest":
         de_test = t_test(
-            data=X,
+            data=data,
             gene_names=gene_names,
             grouping=grouping,
             is_sig_zerovar=is_sig_zerovar,
@@ -917,7 +889,7 @@ def two_sample(
         )
     elif test.lower() == 'rank':
         de_test = rank_test(
-            data=X,
+            data=data,
             gene_names=gene_names,
             grouping=grouping,
             is_sig_zerovar=is_sig_zerovar,
@@ -930,7 +902,7 @@ def two_sample(
 
 
 def pairwise(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
         test: str = 'z-test',
@@ -982,7 +954,7 @@ def pairwise(
         Doesn't require fitting of generalized linear models.
         Wilcoxon rank sum (Mann-Whitney U) test between both observation groups.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like, or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param grouping: str, array
 
@@ -1048,7 +1020,6 @@ def pairwise(
     # Do not store all models but only p-value and q-value matrix:
     # genes x groups x groups
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
     sample_description = parse_sample_description(data, sample_description)
     grouping = parse_grouping(data, sample_description, grouping)
     sample_description = pd.DataFrame({"grouping": grouping})
@@ -1061,7 +1032,7 @@ def pairwise(
         )
         model = _fit(
             noise_model=noise_model,
-            data=X,
+            data=data,
             design_loc=dmat,
             design_scale=dmat,
             gene_names=gene_names,
@@ -1089,9 +1060,9 @@ def pairwise(
             )
     else:
         groups = np.unique(grouping)
-        pvals = np.tile(np.NaN, [len(groups), len(groups), X.shape[1]])
+        pvals = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]])
         pvals[np.eye(pvals.shape[0]).astype(bool)] = 0
-        logfc = np.tile(np.NaN, [len(groups), len(groups), X.shape[1]])
+        logfc = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]])
         logfc[np.eye(logfc.shape[0]).astype(bool)] = 0
 
         if keep_full_test_objs:
@@ -1105,7 +1076,7 @@ def pairwise(
 
                 sel = (grouping == g1) | (grouping == g2)
                 de_test_temp = two_sample(
-                    data=X[sel],
+                    data=data[sel],
                     grouping=grouping[sel],
                     as_numeric=as_numeric,
                     test=test,
@@ -1132,7 +1103,7 @@ def pairwise(
             gene_ids=gene_names,
             pval=pvals,
             logfc=logfc,
-            ave=np.mean(X, axis=0),
+            ave=np.mean(data, axis=0),
             groups=groups,
             tests=tests,
             correction_type=pval_correction
@@ -1142,7 +1113,7 @@ def pairwise(
 
 
 def versus_rest(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
         test: str = 'wald',
@@ -1194,7 +1165,7 @@ def versus_rest(
         Doesn't require fitting of generalized linear models.
         Wilcoxon rank sum (Mann-Whitney U) test between both observation groups.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param grouping: str, array
 
@@ -1254,14 +1225,13 @@ def versus_rest(
     # Do not store all models but only p-value and q-value matrix:
     # genes x groups
     gene_names = parse_gene_names(data, gene_names)
-    X = parse_data(data, gene_names)
     sample_description = parse_sample_description(data, sample_description)
     grouping = parse_grouping(data, sample_description, grouping)
     sample_description = pd.DataFrame({"grouping": grouping})
 
     groups = np.unique(grouping)
-    pvals = np.zeros([1, len(groups), X.shape[1]])
-    logfc = np.zeros([1, len(groups), X.shape[1]])
+    pvals = np.zeros([1, len(groups), data.shape[1]])
+    logfc = np.zeros([1, len(groups), data.shape[1]])
 
     if keep_full_test_objs:
         tests = np.tile([None], [1, len(groups)])
@@ -1271,7 +1241,7 @@ def versus_rest(
     for i, g1 in enumerate(groups):
         test_grouping = np.where(grouping == g1, "group", "rest")
         de_test_temp = two_sample(
-            data=X,
+            data=data,
             grouping=test_grouping,
             as_numeric=as_numeric,
             test=test,
@@ -1295,7 +1265,7 @@ def versus_rest(
         gene_ids=gene_names,
         pval=pvals,
         logfc=logfc,
-        ave=np.mean(X, axis=0),
+        ave=np.mean(data, axis=0),
         groups=groups,
         tests=tests,
         correction_type=pval_correction
@@ -1305,7 +1275,7 @@ def versus_rest(
 
 
 def partition(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         parts: Union[str, np.ndarray, list],
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None
@@ -1320,7 +1290,7 @@ def partition(
 
     Wraps _Partition so that doc strings are nice.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param parts: str, array
 
@@ -1348,13 +1318,13 @@ class _Partition:
 
     def __init__(
             self,
-            data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray],
+            data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
             parts: Union[str, np.ndarray, list],
             gene_names: Union[np.ndarray, list] = None,
             sample_description: pd.DataFrame = None
     ):
         """
-        :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+        :param data: Array-like or anndata.Anndata object containing observations.
             Input data matrix (observations x features) or (cells x genes).
         :param parts: str, array
 
@@ -1363,7 +1333,14 @@ def __init__(
         :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
         :param sample_description: optional pandas.DataFrame containing sample annotations
         """
-        self.X = parse_data(data, gene_names)
+        if isinstance(data, _InputDataBase):
+            self.x = data.x
+        elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
+            self.x = data.X
+        elif isinstance(data, np.ndarray):
+            self.x = data
+        else:
+            raise ValueError("data type %s not recognized" % type(data))
         self.gene_names = parse_gene_names(data, gene_names)
         self.sample_description = parse_sample_description(data, sample_description)
         self.partition = parse_grouping(data, sample_description, parts)
@@ -1423,7 +1400,7 @@ def two_sample(
         DETestsSingle = []
         for i, idx in enumerate(self.partition_idx):
             DETestsSingle.append(two_sample(
-                data=self.X[idx, :],
+                data=self.x[idx, :],
                 grouping=grouping,
                 as_numeric=as_numeric,
                 test=test,
@@ -1468,7 +1445,7 @@ def t_test(
         DETestsSingle = []
         for i, idx in enumerate(self.partition_idx):
             DETestsSingle.append(t_test(
-                data=self.X[idx, :],
+                data=self.x[idx, :],
                 grouping=grouping,
                 is_logged=is_logged,
                 gene_names=self.gene_names,
@@ -1479,7 +1456,7 @@ def t_test(
         return DifferentialExpressionTestByPartition(
             partitions=self.partitions,
             tests=DETestsSingle,
-            ave=np.mean(self.X, axis=0),
+            ave=np.mean(self.x, axis=0),
             correction_type="by_test")
 
     def rank_test(
@@ -1505,7 +1482,7 @@ def rank_test(
         DETestsSingle = []
         for i, idx in enumerate(self.partition_idx):
             DETestsSingle.append(rank_test(
-                data=self.X[idx, :],
+                data=self.x[idx, :],
                 grouping=grouping,
                 gene_names=self.gene_names,
                 sample_description=self.sample_description.iloc[idx, :],
@@ -1515,7 +1492,7 @@ def rank_test(
         return DifferentialExpressionTestByPartition(
             partitions=self.partitions,
             tests=DETestsSingle,
-            ave=np.mean(self.X, axis=0),
+            ave=np.mean(self.x, axis=0),
             correction_type="by_test")
 
     def lrt(
@@ -1594,7 +1571,7 @@ def lrt(
         DETestsSingle = []
         for i, idx in enumerate(self.partition_idx):
             DETestsSingle.append(lrt(
-                data=self.X[idx, :],
+                data=self.x[idx, :],
                 reduced_formula_loc=reduced_formula_loc,
                 full_formula_loc=full_formula_loc,
                 reduced_formula_scale=reduced_formula_scale,
@@ -1613,7 +1590,7 @@ def lrt(
         return DifferentialExpressionTestByPartition(
             partitions=self.partitions,
             tests=DETestsSingle,
-            ave=np.mean(self.X, axis=0),
+            ave=np.mean(self.x, axis=0),
             correction_type="by_test")
 
     def wald(
@@ -1694,7 +1671,7 @@ def wald(
         DETestsSingle = []
         for i, idx in enumerate(self.partition_idx):
             DETestsSingle.append(wald(
-                data=self.X[idx, :],
+                data=self.x[idx, :],
                 factor_loc_totest=factor_loc_totest,
                 coef_to_test=coef_to_test,
                 formula_loc=formula_loc,
@@ -1713,12 +1690,12 @@ def wald(
         return DifferentialExpressionTestByPartition(
             partitions=self.partitions,
             tests=DETestsSingle,
-            ave=np.mean(self.X, axis=0),
+            ave=np.mean(self.x, axis=0),
             correction_type="by_test")
 
 
 def continuous_1d(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix],
         continuous: str,
         df: int = 5,
         factor_loc_totest: Union[str, List[str]] = None,
@@ -1759,7 +1736,7 @@ def continuous_1d(
     design matrices are built within this function and the shape of constraint
     matrices depends on the output of this function.
 
-    :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations.
+    :param data: Array-like or anndata.Anndata object containing observations.
         Input data matrix (observations x features) or (cells x genes).
     :param continuous: str
 
@@ -1906,7 +1883,6 @@ def continuous_1d(
     if isinstance(as_numeric, tuple):
         as_numeric = list(as_numeric)
 
-    X = parse_data(data, gene_names)
     gene_names = parse_gene_names(data, gene_names)
     sample_description = parse_sample_description(data, sample_description)
 
@@ -1967,7 +1943,7 @@ def continuous_1d(
         logging.getLogger("diffxpy").debug("formula_scale_new: " + formula_scale_new)
 
         de_test = wald(
-            data=X,
+            data=data,
             factor_loc_totest=factor_loc_totest_new,
             coef_to_test=None,
             formula_loc=formula_loc_new,
@@ -2020,7 +1996,7 @@ def continuous_1d(
         logging.getLogger("diffxpy").debug("reduced_formula_scale: " + reduced_formula_scale)
 
         de_test = lrt(
-            data=X,
+            data=data,
             full_formula_loc=full_formula_loc,
             reduced_formula_loc=reduced_formula_loc,
             full_formula_scale=full_formula_scale,
diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py
index 3c348f1..3eb525e 100644
--- a/diffxpy/testing/utils.py
+++ b/diffxpy/testing/utils.py
@@ -2,9 +2,8 @@
 import numpy as np
 import pandas as pd
 import patsy
-import scipy
+import scipy.sparse
 from typing import List, Tuple, Union
-import xarray as xr
 
 try:
     from anndata.base import Raw
@@ -12,37 +11,30 @@
     from anndata import Raw
 
 from batchglm import data as data_utils
+from batchglm.models.base import _InputDataBase
 # Relay util functions for diffxpy api.
 # design_matrix, preview_coef_names and constraint_system_from_star are redefined here.
 from batchglm.data import constraint_matrix_from_string, constraint_matrix_from_dict
-from batchglm.data import design_matrix_from_xarray, design_matrix_from_anndata
 from batchglm.data import view_coef_names
 
 
-def parse_gene_names(data, gene_names):
+def parse_gene_names(
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
+        gene_names: Union[list, np.ndarray, None]
+):
     if gene_names is None:
         if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, Raw)):
             gene_names = data.var_names
-        elif isinstance(data, xr.DataArray):
-            gene_names = data["features"]
-        elif isinstance(data, xr.Dataset):
-            gene_names = data["features"]
+        elif isinstance(data, _InputDataBase):
+            gene_names = data.features
         else:
             raise ValueError("Missing gene names")
 
     return np.asarray(gene_names)
 
 
-def parse_data(data, gene_names) -> xr.DataArray:
-    X = data_utils.xarray_from_data(data, dims=("observations", "features"))
-    if gene_names is not None:
-        X.coords["features"] = gene_names
-
-    return X
-
-
 def parse_sample_description(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         sample_description: Union[pd.DataFrame, None]
 ) -> pd.DataFrame:
     """
@@ -54,33 +46,32 @@ def parse_sample_description(
     """
     if sample_description is None:
         if anndata is not None and isinstance(data, anndata.AnnData):
-            sample_description = data_utils.sample_description_from_anndata(
-                dataset=data,
-            )
-        elif isinstance(data, xr.Dataset):
-            sample_description = data_utils.sample_description_from_xarray(
-                dataset=data,
-                dim="observations",
-            )
+            sample_description = data.obs
         else:
             raise ValueError(
-                "Please specify `sample_description` or provide `data` as xarray.Dataset or anndata.AnnData " +
+                "Please specify `sample_description` or provide `data` as anndata.AnnData " +
                 "with corresponding sample annotations"
             )
 
     if anndata is not None and isinstance(data, Raw):
         # Raw does not have attribute shape.
         assert data.X.shape[0] == sample_description.shape[0], \
-            "data matrix and sample description must contain same number of cells"
+            "data matrix and sample description must contain same number of cells: %i, %i" % \
+            (data.X.shape[0], sample_description.shape[0])
+    elif isinstance(data, _InputDataBase):
+        assert data.x.shape[0] == sample_description.shape[0], \
+            "data matrix and sample description must contain same number of cells: %i, %i" % \
+            (data.x.shape[0], sample_description.shape[0])
     else:
         assert data.shape[0] == sample_description.shape[0], \
-            "data matrix and sample description must contain same number of cells"
+            "data matrix and sample description must contain same number of cells: %i, %i" % \
+            (data.shape[0], sample_description.shape[0])
     return sample_description
 
 
 def parse_size_factors(
         size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray],
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase],
         sample_description: pd.DataFrame
 ) -> Union[np.ndarray, None]:
     """
@@ -111,7 +102,8 @@ def parse_grouping(data, sample_description, grouping):
     return np.squeeze(np.asarray(grouping))
 
 
-def split_X(data, grouping):
+def split_x(data, grouping):
+    grouping = np.asarray(grouping)
     groups = np.unique(grouping)
     x0 = data[np.where(grouping == groups[0])[0]]
     x1 = data[np.where(grouping == groups[1])[0]]
@@ -126,14 +118,14 @@ def dmat_unique(dmat, sample_description):
 
 
 def design_matrix(
-        data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray,
+        data: Union[anndata.AnnData, Raw, np.ndarray,
                     scipy.sparse.csr_matrix] = None,
         sample_description: Union[None, pd.DataFrame] = None,
         formula: Union[None, str] = None,
         as_numeric: Union[List[str], Tuple[str], str] = (),
         dmat: Union[pd.DataFrame, None] = None,
         return_type: str = "xarray"
-) -> Union[patsy.design_info.DesignMatrix, xr.Dataset, pd.DataFrame]:
+) -> Union[patsy.design_info.DesignMatrix, pd.DataFrame]:
     """ Create a design matrix from some sample description.
 
     This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame.
@@ -155,8 +147,6 @@ def design_matrix(
 
         - "patsy": return plain patsy.design_info.DesignMatrix object
         - "dataframe": return pd.DataFrame with observations as rows and params as columns
-        - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
-            one variable per column
     :param dmat: model design matrix
     """
     if data is None and sample_description is None and dmat is None:
@@ -217,13 +207,12 @@ def preview_coef_names(
 
 
 def constraint_system_from_star(
-        dmat: Union[None, np.ndarray, xr.DataArray, xr.Dataset] = None,
+        dmat: Union[None, np.ndarray] = None,
         sample_description: Union[None, pd.DataFrame] = None,
         formula: Union[None, str] = None,
         as_numeric: Union[List[str], Tuple[str], str] = (),
         constraints: dict = {},
-        dims: Union[Tuple[str, str], List[str]] = (),
-        return_type: str = "xarray",
+        return_type: str = "patsy",
 ) -> Tuple:
     """
     Create a design matrix and a constraint matrix.
@@ -251,17 +240,12 @@ def constraint_system_from_star(
 
         Can only group by non-constrained effects right now, use constraint_matrix_from_string
         for other cases.
-    :param dims: Dimension names of xarray.
-
-        E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"]
     :param return_type: type of the returned value.
 
         - "patsy": return plain patsy.design_info.DesignMatrix object
         - "dataframe": return pd.DataFrame with observations as rows and params as columns
-        - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as
-            one variable per column
         This option is overridden if constraints are supplied as dict.
-    :return: a model design matrix and a constraint matrix formatted as xr.DataArray
+    :return: a model design matrix and a constraint matrix
     """
     if isinstance(as_numeric, str):
         as_numeric = [as_numeric]
@@ -279,6 +263,5 @@ def constraint_system_from_star(
         formula=formula,
         as_categorical=as_categorical,
         constraints=constraints,
-        dims=dims,
         return_type=return_type
     )
diff --git a/diffxpy/unit_test/test_constrained.py b/diffxpy/unit_test/test_constrained.py
index 9ef21a3..78b4851 100644
--- a/diffxpy/unit_test/test_constrained.py
+++ b/diffxpy/unit_test/test_constrained.py
@@ -45,26 +45,22 @@ def test_forfatal_from_string(self):
         # Build constraints:
         constraints_loc = de.utils.constraint_matrix_from_string(
             dmat=dmat_est_loc,
-            constraints=["bio1+bio2=0", "bio3+bio4=0"],
-            dims=["design_loc_params", "loc_params"]
+            constraints=["bio1+bio2=0", "bio3+bio4=0"]
         )
         constraints_scale = de.utils.constraint_matrix_from_string(
             dmat=dmat_est_scale,
-            constraints=["bio1+bio2=0", "bio3+bio4=0"],
-            dims=["design_scale_params", "scale_params"]
+            constraints=["bio1+bio2=0", "bio3+bio4=0"]
         )
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.x,
             dmat_loc=dmat_est_loc,
             dmat_scale=dmat_est_scale,
             constraints_loc=constraints_loc,
             constraints_scale=constraints_scale,
             coef_to_test=["treatment1"]
         )
-        summary = test.summary()
-
-        return True
+        _ = test.summary()
 
     def test_forfatal_from_dict(self):
         """
@@ -102,16 +98,14 @@ def test_forfatal_from_dict(self):
         )
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.x,
             dmat_loc=dmat_loc,
             dmat_scale=dmat_scale,
             constraints_loc=constraints_loc,
             constraints_scale=constraints_scale,
             coef_to_test=["cond[T.cond1]"]
         )
-        summary = test.summary()
-
-        return True
+        _ = test.summary()
 
     def test_null_distribution_wald_constrained(self, n_genes: int = 100):
         """
@@ -155,14 +149,14 @@ def test_null_distribution_wald_constrained(self, n_genes: int = 100):
         )
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.x,
             dmat_loc=dmat_loc,
             dmat_scale=dmat_scale,
             constraints_loc=constraints_loc,
             constraints_scale=constraints_scale,
             coef_to_test=["cond[T.cond1]"]
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
@@ -230,13 +224,12 @@ def test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100):
                          "bio5+bio6=0",
                          "bio7+bio8=0",
                          "tech1+tech2=0",
-                         "tech3+tech4=0"],
-            dims=["design_loc_params", "loc_params"]
+                         "tech3+tech4=0"]
         )
         constraints_scale = None
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.x,
             dmat_loc=dmat_est_loc,
             dmat_scale=dmat_est_scale,
             constraints_loc=constraints_loc,
@@ -298,26 +291,24 @@ def test_null_distribution_wald_multi_constrained_2layer(self, n_genes: int = 50
             dmat=dmat_est_loc,
             constraints=["bio1+bio2=0",
                          "bio3+bio4=0",
-                         "bio5+bio6=0"],
-            dims=["design_loc_params", "loc_params"]
+                         "bio5+bio6=0"]
         )
         constraints_scale = de.utils.constraint_matrix_from_string(
             dmat=dmat_est_scale,
             constraints=["bio1+bio2=0",
                          "bio3+bio4=0",
-                         "bio5+bio6=0"],
-            dims=["design_scale_params", "scale_params"]
+                         "bio5+bio6=0"]
         )
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.x,
             dmat_loc=dmat_est_loc,
             dmat_scale=dmat_est_scale,
             constraints_loc=constraints_loc,
             constraints_scale=constraints_scale,
             coef_to_test=["treatment1", "treatment2"]
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
diff --git a/diffxpy/unit_test/test_continuous.py b/diffxpy/unit_test/test_continuous.py
index 642ece4..57ad345 100644
--- a/diffxpy/unit_test/test_continuous.py
+++ b/diffxpy/unit_test/test_continuous.py
@@ -30,8 +30,8 @@ def test_forfatal_functions(self):
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "pseudotime": np.random.random(size=sim.num_observations),
-            "batch": np.random.randint(2, size=sim.num_observations)
+            "pseudotime": np.random.random(size=sim.nobs),
+            "batch": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.continuous_1d(
@@ -89,7 +89,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100):
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "pseudotime": np.random.random(size=sim.num_observations)
+            "pseudotime": np.random.random(size=sim.nobs)
         })
 
         test = de.test.continuous_1d(
@@ -135,7 +135,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100):
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "pseudotime": np.random.random(size=sim.num_observations)
+            "pseudotime": np.random.random(size=sim.nobs)
         })
 
         test = de.test.continuous_1d(
diff --git a/diffxpy/unit_test/test_correction.py b/diffxpy/unit_test/test_correction.py
index 6941688..cbd602e 100644
--- a/diffxpy/unit_test/test_correction.py
+++ b/diffxpy/unit_test/test_correction.py
@@ -4,7 +4,7 @@
 import scipy.stats as stats
 import logging
 
-from batchglm.api.models.glm_nb import Simulator, Estimator, InputData
+from batchglm.api.models.glm_nb import Simulator, Estimator, InputDataGLM
 import diffxpy.api as de
 
 
@@ -12,5 +12,6 @@
 class TestCorrection(unittest.TestCase):
     pass
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py
index 8381c3d..f4874bb 100644
--- a/diffxpy/unit_test/test_data_types.py
+++ b/diffxpy/unit_test/test_data_types.py
@@ -15,95 +15,81 @@ class TestDataTypesSingle(unittest.TestCase):
     def _test_wald(self, data, sample_description, gene_names=None):
         test = de.test.wald(
             data=data,
-            factor_loc_totest="condition",
-            formula_loc="~ 1 + condition",
             sample_description=sample_description,
             gene_names=gene_names,
-            quick_scale=True,
-            training_strategy="DEFAULT",
-            dtype="float64"
+            factor_loc_totest="condition",
+            formula_loc="~ 1 + condition",
+            noise_model="nb",
+            batch_size=5
         )
-        summary = test.summary()
+        _ = test.summary()
 
     def _test_lrt(self, data, sample_description, gene_names=None):
         test = de.test.lrt(
             data=data,
-            full_formula_loc="~ 1 + condition",
-            reduced_formula_loc="~ 1",
             sample_description=sample_description,
             gene_names=gene_names,
-            quick_scale=True,
-            training_strategy="DEFAULT",
-            dtype="float64"
+            full_formula_loc="~ 1 + condition",
+            reduced_formula_loc="~ 1",
+            noise_model="nb"
         )
-        summary = test.summary()
+        _ = test.summary()
 
     def _test_t_test(self, data, sample_description, gene_names=None):
         test = de.test.t_test(
             data=data,
-            grouping="condition",
             sample_description=sample_description,
-            gene_names=gene_names
+            gene_names=gene_names,
+            grouping="condition"
         )
-        summary = test.summary()
+        _ = test.summary()
 
     def _test_rank(self, data, sample_description, gene_names=None):
         test = de.test.rank_test(
             data=data,
-            grouping="condition",
             sample_description=sample_description,
-            gene_names=gene_names
+            gene_names=gene_names,
+            grouping="condition"
         )
-        summary = test.summary()
+        _ = test.summary()
 
-    def simulate(self, n_cells: int = 20, n_genes: int = 2):
+    def simulate(self, n_cells: int = 200, n_genes: int = 2):
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
         sim.generate_sample_description(num_batches=0, num_conditions=0)
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.input_data.num_observations)
         })
-        return sim.X ,random_sample_description
+        return sim.x, random_sample_description
 
     def _test_numpy(self, sparse):
         data, sample_description = self.simulate()
-        gene_names = data.features
-        data = data.values
+        gene_names = ["gene" + str(i) for i in range(data.shape[1])]
         if sparse:
             data = scipy.sparse.csr_matrix(data)
 
         self._test_wald(data=data, sample_description=sample_description, gene_names=gene_names)
-        #self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names)
+        self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names)
         self._test_t_test(data=data, sample_description=sample_description, gene_names=gene_names)
         self._test_rank(data=data, sample_description=sample_description, gene_names=gene_names)
 
-    def _test_xarray(self):
-        data, sample_description = self.simulate()
-
-        self._test_wald(data=data, sample_description=sample_description)
-        #self._test_lrt(data=data, sample_description=sample_description)
-        self._test_t_test(data=data, sample_description=sample_description)
-        self._test_rank(data=data, sample_description=sample_description)
-
     def _test_anndata(self, sparse):
         data, sample_description = self.simulate()
-        gene_names = [str(x) for x in data.features.values]
-        data = data.values
+        gene_names = ["gene" + str(i) for i in range(data.shape[1])]
         if sparse:
             data = scipy.sparse.csr_matrix(data)
 
         data = anndata.AnnData(data)
         data.var_names = gene_names
         self._test_wald(data=data, sample_description=sample_description)
-        #self._test_lrt(data=data, sample_description=sample_description)
+        self._test_lrt(data=data, sample_description=sample_description)
         self._test_t_test(data=data, sample_description=sample_description)
         self._test_rank(data=data, sample_description=sample_description)
 
     def _test_anndata_raw(self, sparse):
         data, sample_description = self.simulate()
-        gene_names = [str(x) for x in data.features.values]
-        data = data.values
+        gene_names = ["gene" + str(i) for i in range(data.shape[1])]
         if sparse:
             data = scipy.sparse.csr_matrix(data)
 
@@ -111,7 +97,7 @@ def _test_anndata_raw(self, sparse):
         data.var_names = gene_names
         data.raw = data
         self._test_wald(data=data.raw, sample_description=sample_description)
-        #self._test_lrt(data=data.raw, sample_description=sample_description)
+        self._test_lrt(data=data.raw, sample_description=sample_description)
         self._test_t_test(data=data, sample_description=sample_description)
         self._test_rank(data=data, sample_description=sample_description)
 
@@ -125,15 +111,6 @@ def test_numpy(self):
 
         return True
 
-    def test_xarray(self):
-        logging.getLogger("tensorflow").setLevel(logging.ERROR)
-        logging.getLogger("batchglm").setLevel(logging.WARNING)
-        logging.getLogger("diffxpy").setLevel(logging.WARNING)
-
-        self._test_xarray()
-
-        return True
-
     def test_anndata(self):
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
@@ -146,5 +123,6 @@ def test_anndata(self):
 
         return True
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py
index cb19922..c430784 100644
--- a/diffxpy/unit_test/test_extreme_values.py
+++ b/diffxpy/unit_test/test_extreme_values.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 import pandas as pd
-import scipy.stats as stats
 
 from batchglm.api.models.glm_nb import Simulator
 import diffxpy.api as de
@@ -19,25 +18,26 @@ def test_t_test_zero_variance(self):
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         sim = Simulator(num_observations=1000, num_features=10)
         sim.generate_sample_description(num_batches=0, num_conditions=0)
         sim.generate()
-        sim.data.X[:, 0] = 0
-        sim.data.X[:, 1] = 5
+        sim.input_data.x[:, 0] = 0
+        sim.input_data.x[:, 1] = 5
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.t_test(
-            data=sim.X,
-            grouping="condition",
+            data=sim.input_data,
             sample_description=random_sample_description,
+            grouping="condition",
             is_sig_zerovar=True
         )
 
         assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
-            "rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \
+            "t test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \
             (test.pval[0], test.pval[1])
         return True
 
@@ -49,20 +49,21 @@ def test_rank_test_zero_variance(self):
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         sim = Simulator(num_observations=1000, num_features=10)
         sim.generate_sample_description(num_batches=0, num_conditions=0)
         sim.generate()
-        sim.data.X[:, 0] = 0
-        sim.data.X[:, 1] = 5
+        sim.input_data.x[:, 0] = 0
+        sim.input_data.x[:, 1] = 5
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.rank_test(
-            data=sim.X,
-            grouping="condition",
+            data=sim.input_data,
             sample_description=random_sample_description,
+            grouping="condition",
             is_sig_zerovar=True
         )
 
diff --git a/diffxpy/unit_test/test_fit.py b/diffxpy/unit_test/test_fit.py
new file mode 100644
index 0000000..80b177f
--- /dev/null
+++ b/diffxpy/unit_test/test_fit.py
@@ -0,0 +1,286 @@
+import unittest
+import logging
+import numpy as np
+import pandas as pd
+
+import diffxpy.api as de
+
+
+class _TestFit:
+
+    def _test_model_fit(
+            self,
+            n_cells: int,
+            n_genes: int,
+            noise_model: str
+    ):
+        """
+        Test if de.wald() generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed 
+        p-value distribution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        :param noise_model: Noise model to use for data fitting.
+        """
+        if noise_model == "nb":
+            from batchglm.api.models.glm_nb import Simulator
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+        elif noise_model == "norm":
+            from batchglm.api.models.glm_norm import Simulator
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+        else:
+            raise ValueError("noise model %s not recognized" % noise_model)
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate_params(rand_fn_scale=rand_fn_scale)
+        sim.generate_data()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(2, size=sim.nobs),
+            "batch": np.random.randint(2, size=sim.nobs)
+        })
+
+        estim = de.fit.model(
+            data=sim.input_data,
+            sample_description=random_sample_description,
+            formula_loc="~ 1 + condition + batch",
+            noise_model=noise_model
+        )
+        return True
+
+    def _test_model_fit_partition(
+            self,
+            n_cells: int,
+            n_genes: int,
+            noise_model: str
+    ):
+        """
+        Test if de.wald() generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
+        p-value distribution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        :param noise_model: Noise model to use for data fitting.
+        """
+        if noise_model == "nb":
+            from batchglm.api.models.glm_nb import Simulator
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+        elif noise_model == "norm":
+            from batchglm.api.models.glm_norm import Simulator
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+        else:
+            raise ValueError("noise model %s not recognized" % noise_model)
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate_params(rand_fn_scale=rand_fn_scale)
+        sim.generate_data()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(2, size=sim.nobs),
+            "batch": np.random.randint(2, size=sim.nobs)
+        })
+
+        partition = de.fit.partition(
+            data=sim.input_data,
+            sample_description=random_sample_description,
+            parts="condition"
+        )
+        estim = partition.model(
+            formula_loc="~ 1 + batch",
+            noise_model=noise_model
+        )
+        return True
+
+    def _test_residuals_fit(
+            self,
+            n_cells: int,
+            n_genes: int,
+            noise_model: str
+    ):
+        """
+        Test if de.wald() (multivariate mode) generates a uniform p-value distribution
+        if it is given data simulated based on the null model. Returns the p-value
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
+        p-value distribution and a uniform distribution.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        :param noise_model: Noise model to use for data fitting.
+        """
+        if noise_model == "nb":
+            from batchglm.api.models.glm_nb import Simulator
+        elif noise_model == "norm":
+            from batchglm.api.models.glm_norm import Simulator
+        else:
+            raise ValueError("noise model %s not recognized" % noise_model)
+
+        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate()
+
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(2, size=sim.nobs),
+            "batch": np.random.randint(2, size=sim.nobs)
+        })
+
+        res = de.fit.residuals(
+            data=sim.input_data,
+            sample_description=random_sample_description,
+            formula_loc="~ 1 + condition + batch",
+            noise_model=noise_model
+        )
+        return True
+
+
+class TestFitNb(_TestFit, unittest.TestCase):
+    """
+    Negative binomial noise model unit tests that tests whether model fit relay works.
+    """
+
+    def test_model_fit(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 2
+    ):
+        """
+        Test if model for "nb" noise model works.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_model_fit(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="nb"
+        )
+
+    def test_model_fit_partition(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 2
+    ):
+        """
+        Test if partitioned model for "nb" noise model works.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_model_fit_partition(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="nb"
+        )
+
+    def test_residuals_fit(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 2
+    ):
+        """
+        Test if residual fit for "nb" noise model works.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_residuals_fit(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="nb"
+        )
+
+
+class TestFitNorm(_TestFit, unittest.TestCase):
+    """
+    Normal noise model unit tests that tests whether model fit relay works.
+    """
+
+    def test_model_fit(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 2
+    ):
+        """
+        Test if model fit for "norm" noise model works.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_model_fit(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="norm"
+        )
+
+    def test_model_fit_partition(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 2
+    ):
+        """
+        Test if partitioned model fit for "norm" noise model works.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_model_fit_partition(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="norm"
+        )
+
+    def test_residuals_fit(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 2
+    ):
+        """
+        Test if residual fit for "norm" noise model works.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_residuals_fit(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="norm"
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/diffxpy/unit_test/test_numeric.py b/diffxpy/unit_test/test_numeric_covar.py
similarity index 92%
rename from diffxpy/unit_test/test_numeric.py
rename to diffxpy/unit_test/test_numeric_covar.py
index 05e53b6..2c3bedd 100644
--- a/diffxpy/unit_test/test_numeric.py
+++ b/diffxpy/unit_test/test_numeric_covar.py
@@ -7,7 +7,7 @@
 import diffxpy.api as de
 
 
-class TestNumeric(unittest.TestCase):
+class TestNumericCovar(unittest.TestCase):
 
     def test(self):
         """
@@ -25,11 +25,11 @@ def test(self):
         sim.generate_data()
 
         sample_description = sim.sample_description
-        sample_description["numeric1"] = np.random.random(size=sim.num_observations)
-        sample_description["numeric2"] = np.random.random(size=sim.num_observations)
+        sample_description["numeric1"] = np.random.random(size=sim.nobs)
+        sample_description["numeric2"] = np.random.random(size=sim.nobs)
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.input_data,
             sample_description=sample_description,
             formula_loc="~ 1 + condition + numeric1 + numeric2",
             formula_scale="~ 1",
diff --git a/diffxpy/unit_test/test_pairwise.py b/diffxpy/unit_test/test_pairwise.py
index 5b52746..6cc6c59 100644
--- a/diffxpy/unit_test/test_pairwise.py
+++ b/diffxpy/unit_test/test_pairwise.py
@@ -29,11 +29,11 @@ def test_null_distribution_ztest(self, n_cells: int = 2000, n_genes: int = 100,
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.pairwise(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="z-test",
             noise_model="nb",
@@ -69,11 +69,11 @@ def test_null_distribution_z_lazy(self, n_cells: int = 2000, n_genes: int = 100)
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(4, size=sim.num_observations)
+            "condition": np.random.randint(4, size=sim.nobs)
         })
 
         test = de.test.pairwise(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test='z-test',
             lazy=True,
@@ -112,11 +112,11 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.pairwise(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="lrt",
             noise_model="nb",
@@ -151,11 +151,11 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 10000
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.pairwise(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="t-test",
             sample_description=random_sample_description,
@@ -189,11 +189,11 @@ def test_null_distribution_wilcoxon(self, n_cells: int = 2000, n_genes: int = 10
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.pairwise(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="wilcoxon",
             sample_description=random_sample_description,
@@ -239,7 +239,7 @@ def test_ztest_de(self, n_cells: int = 2000, n_genes: int = 500):
         sample_description = sim.sample_description
 
         test = de.test.pairwise(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="z-test",
             noise_model="nb",
diff --git a/diffxpy/unit_test/test_partition.py b/diffxpy/unit_test/test_partition.py
index 98d1009..7f83374 100644
--- a/diffxpy/unit_test/test_partition.py
+++ b/diffxpy/unit_test/test_partition.py
@@ -29,13 +29,13 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200):
         sim.generate()
 
         sample_description = pd.DataFrame({
-            "covar1": np.random.randint(2, size=sim.num_observations),
-            "covar2": np.random.randint(2, size=sim.num_observations)
+            "covar1": np.random.randint(2, size=sim.nobs),
+            "covar2": np.random.randint(2, size=sim.nobs)
         })
         sample_description["cond"] = sim.sample_description["condition"].values
 
         partition = de.test.partition(
-            data=sim.X,
+            data=sim.x,
             parts="cond",
             sample_description=sample_description
         )
@@ -45,7 +45,7 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200):
             training_strategy="DEFAULT",
             dtype="float64"
         )
-        summary = det.summary()
+        _ = det.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue
@@ -74,13 +74,13 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int =
         sim.generate()
 
         sample_description = pd.DataFrame({
-            "covar1": np.random.randint(4, size=sim.num_observations),
-            "covar2": np.random.randint(2, size=sim.num_observations)
+            "covar1": np.random.randint(4, size=sim.nobs),
+            "covar2": np.random.randint(2, size=sim.nobs)
         })
         sample_description["cond"] = sim.sample_description["condition"].values
 
         partition = de.test.partition(
-            data=sim.X,
+            data=sim.x,
             parts="cond",
             sample_description=sample_description
         )
@@ -90,7 +90,7 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int =
             training_strategy="DEFAULT",
             dtype="float64"
         )
-        summary = det.summary()
+        _ = det.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue
@@ -119,13 +119,13 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200):
         sim.generate()
 
         sample_description = pd.DataFrame({
-            "covar1": np.random.randint(2, size=sim.num_observations),
-            "covar2": np.random.randint(2, size=sim.num_observations)
+            "covar1": np.random.randint(2, size=sim.nobs),
+            "covar2": np.random.randint(2, size=sim.nobs)
         })
         sample_description["cond"] = sim.sample_description["condition"].values
 
         partition = de.test.partition(
-            data=sim.X,
+            data=sim.x,
             parts="cond",
             sample_description=sample_description
         )
@@ -137,7 +137,7 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200):
             training_strategy="DEFAULT",
             dtype="float64"
         )
-        summary = det.summary()
+        _ = det.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue
@@ -166,12 +166,12 @@ def test_null_distribution_ttest(self, n_cells: int = 4000, n_genes: int = 200):
         sim.generate()
 
         sample_description = pd.DataFrame({
-            "covar1": np.random.randint(2, size=sim.num_observations)
+            "covar1": np.random.randint(2, size=sim.nobs)
         })
         sample_description["cond"] = sim.sample_description["condition"].values
 
         partition = de.test.partition(
-            data=sim.X,
+            data=sim.x,
             parts="cond",
             sample_description=sample_description
         )
@@ -209,12 +209,12 @@ def test_null_distribution_rank(self, n_cells: int = 4000, n_genes: int = 200):
         sim.generate()
 
         sample_description = pd.DataFrame({
-            "covar1": np.random.randint(2, size=sim.num_observations)
+            "covar1": np.random.randint(2, size=sim.nobs)
         })
         sample_description["cond"] = sim.sample_description["condition"].values
 
         partition = de.test.partition(
-            data=sim.X,
+            data=sim.x,
             parts="cond",
             sample_description=sample_description
         )
diff --git a/diffxpy/unit_test/test_single_de.py b/diffxpy/unit_test/test_single_de.py
index 693e1fc..8edf03f 100644
--- a/diffxpy/unit_test/test_single_de.py
+++ b/diffxpy/unit_test/test_single_de.py
@@ -1,13 +1,11 @@
 import unittest
 import logging
 import numpy as np
-import pandas as pd
-import scipy.stats as stats
 
 import diffxpy.api as de
 
 
-class _TestSingleDE:
+class _TestSingleDe:
 
     def _prepare_data(
             self,
@@ -23,8 +21,12 @@ def _prepare_data(
         """
         if noise_model == "nb":
             from batchglm.api.models.glm_nb import Simulator
+            rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape)
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
         elif noise_model == "norm":
             from batchglm.api.models.glm_norm import Simulator
+            rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape)
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
         else:
             raise ValueError("noise model %s not recognized" % noise_model)
 
@@ -32,22 +34,21 @@ def _prepare_data(
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
         sim.generate_sample_description(num_batches=0, num_conditions=2)
         sim.generate_params(
-            rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1,
-            rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape))
+            rand_fn_loc=rand_fn_loc,
+            rand_fn_scale=rand_fn_scale
         )
-        sim.params["a_var"][1, :num_non_de] = 0
-        sim.params["b_var"][1, :num_non_de] = 0
-        sim.params["isDE"] = ("features",), np.arange(n_genes) >= num_non_de
+        sim.a_var[1, :num_non_de] = 0
+        sim.b_var[1, :num_non_de] = 0
+        self.isDE = np.arange(n_genes) >= num_non_de
         sim.generate_data()
-
         return sim
 
     def _eval(self, sim, test):
-        idx_de = np.where(sim.params["isDE"] == True)[0]
-        idx_nonde = np.where(sim.params["isDE"] == False)[0]
+        idx_de = np.where(self.isDE)[0]
+        idx_nonde = np.where(np.logical_not(self.isDE))[0]
 
-        frac_de_of_non_de = np.sum(test.qval[idx_nonde] < 0.05) / len(idx_nonde)
-        frac_de_of_de = np.sum(test.qval[idx_de] < 0.05) / len(idx_de)
+        frac_de_of_non_de = np.mean(test.qval[idx_nonde] < 0.05)
+        frac_de_of_de = np.mean(test.qval[idx_de] < 0.05)
 
         logging.getLogger("diffxpy").info(
             'fraction of non-DE genes with q-value < 0.05: %.1f%%' %
@@ -82,10 +83,9 @@ def _test_rank_de(
         )
 
         test = de.test.rank_test(
-            data=sim.X,
-            grouping="condition",
+            data=sim.input_data,
             sample_description=sim.sample_description,
-            dtype="float64"
+            grouping="condition"
         )
 
         self._eval(sim=sim, test=test)
@@ -112,10 +112,9 @@ def _test_t_test_de(
         )
 
         test = de.test.t_test(
-            data=sim.X,
+            data=sim.input_data,
             grouping="condition",
-            sample_description=sim.sample_description,
-            dtype="float64"
+            sample_description=sim.sample_description
         )
 
         self._eval(sim=sim, test=test)
@@ -144,10 +143,10 @@ def _test_wald_de(
         )
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.input_data,
+            sample_description=sim.sample_description,
             factor_loc_totest="condition",
             formula_loc="~ 1 + condition",
-            sample_description=sim.sample_description,
             noise_model=noise_model,
             training_strategy="DEFAULT",
             dtype="float64"
@@ -179,12 +178,12 @@ def _test_lrt_de(
         )
 
         test = de.test.lrt(
-            data=sim.X,
+            data=sim.input_data,
+            sample_description=sim.sample_description,
             full_formula_loc="~ 1 + condition",
             full_formula_scale="~ 1",
             reduced_formula_loc="~ 1",
             reduced_formula_scale="~ 1",
-            sample_description=sim.sample_description,
             noise_model=noise_model,
             training_strategy="DEFAULT",
             dtype="float64"
@@ -195,7 +194,7 @@ def _test_lrt_de(
         return True
 
 
-class TestSingleDE_STANDARD(_TestSingleDE, unittest.TestCase):
+class TestSingleDeStandard(_TestSingleDe, unittest.TestCase):
     """
     Noise model-independent tests unit tests that tests false positive and false negative rates.
     """
@@ -209,6 +208,11 @@ def test_ttest_de(
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
         """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
         return self._test_t_test_de(
             n_cells=n_cells,
             n_genes=n_genes
@@ -223,13 +227,18 @@ def test_rank_de(
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
         """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
         return self._test_rank_de(
             n_cells=n_cells,
             n_genes=n_genes
         )
 
 
-class TestSingleDE_NB(_TestSingleDE, unittest.TestCase):
+class TestSingleDeNb(_TestSingleDe, unittest.TestCase):
     """
     Negative binomial noise model unit tests that tests false positive and false negative rates.
     """
@@ -243,6 +252,11 @@ def test_wald_de_nb(
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
         """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
         return self._test_wald_de(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -258,6 +272,11 @@ def test_lrt_de_nb(
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
         """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
         return self._test_lrt_de(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -265,7 +284,7 @@ def test_lrt_de_nb(
         )
 
 
-class TestSingleDE_NORM(_TestSingleDE, unittest.TestCase):
+class TestSingleDeNorm(_TestSingleDe, unittest.TestCase):
     """
     Normal noise model unit tests that tests false positive and false negative rates.
     """
@@ -279,6 +298,11 @@ def test_wald_de_norm(
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
         """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
         return self._test_wald_de(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -294,6 +318,11 @@ def test_lrt_de_norm(
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
         """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
         return self._test_lrt_de(
             n_cells=n_cells,
             n_genes=n_genes,
diff --git a/diffxpy/unit_test/test_single_external_libs.py b/diffxpy/unit_test/test_single_external_libs.py
index 0590c2a..b1d4883 100644
--- a/diffxpy/unit_test/test_single_external_libs.py
+++ b/diffxpy/unit_test/test_single_external_libs.py
@@ -1,7 +1,6 @@
 import unittest
 import logging
 import numpy as np
-import pandas as pd
 import scipy.stats as stats
 
 from batchglm.api.models.glm_nb import Simulator
@@ -18,10 +17,7 @@ def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100):
         """
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
         sim.generate_sample_description(num_batches=0, num_conditions=2)
-        sim.generate_params(
-            rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1,
-            rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape))
-        )
+        sim.generate_params()
         sim.generate_data()
 
         return sim
@@ -45,8 +41,8 @@ def _eval(self, test, ref_pvals):
             'mean absolute log p-value deviation: %f' %
             float(mean_dev)
         )
-        assert max_dev < 1e-3, "maximum deviation too large"
-        assert max_log_dev < 1e-1, "maximum deviation in log space too large"
+        assert max_dev < 1e-3, "maximum deviation too large: %f" % max_dev
+        assert max_log_dev < 1e-1, "maximum deviation in log space too large: %f" % max_log_dev
 
     def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100):
         """
@@ -59,23 +55,25 @@ def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100):
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.INFO)
 
+        np.random.seed(1)
         sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
-
         test = de.test.t_test(
-            data=sim.X,
+            data=sim.input_data,
             grouping="condition",
-            sample_description=sim.sample_description,
-            dtype="float64"
+            sample_description=sim.sample_description
         )
 
         # Run scipy t-tests as a reference.
         conds = np.unique(sim.sample_description["condition"].values)
         ind_a = np.where(sim.sample_description["condition"] == conds[0])[0]
         ind_b = np.where(sim.sample_description["condition"] == conds[1])[0]
-        scipy_pvals = stats.ttest_ind(a=sim.X[ind_a, :], b=sim.X[ind_b, :], axis=0, equal_var=False).pvalue
-
+        scipy_pvals = stats.ttest_ind(
+            a=sim.x[ind_a, :],
+            b=sim.x[ind_b, :],
+            axis=0,
+            equal_var=False
+        ).pvalue
         self._eval(test=test, ref_pvals=scipy_pvals)
-
         return True
 
     def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100):
@@ -89,13 +87,12 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100):
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.INFO)
 
+        np.random.seed(1)
         sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes)
-
         test = de.test.rank_test(
-            data=sim.X,
+            data=sim.input_data,
             grouping="condition",
-            sample_description=sim.sample_description,
-            dtype="float64"
+            sample_description=sim.sample_description
         )
 
         # Run scipy t-tests as a reference.
@@ -103,13 +100,15 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100):
         ind_a = np.where(sim.sample_description["condition"] == conds[0])[0]
         ind_b = np.where(sim.sample_description["condition"] == conds[1])[0]
         scipy_pvals = np.array([
-            stats.mannwhitneyu(x=sim.X[ind_a, i], y=sim.X[ind_b, i],
-                               use_continuity=True, alternative="two-sided").pvalue
-            for i in range(sim.X.shape[1])
-            ])
-
+            stats.mannwhitneyu(
+                x=sim.x[ind_a, i],
+                y=sim.x[ind_b, i],
+                use_continuity=True,
+                alternative="two-sided"
+            ).pvalue
+            for i in range(sim.x.shape[1])
+        ])
         self._eval(test=test, ref_pvals=scipy_pvals)
-
         return True
 
 
diff --git a/diffxpy/unit_test/test_single_null.py b/diffxpy/unit_test/test_single_null.py
index 34ca47d..3c9834c 100644
--- a/diffxpy/unit_test/test_single_null.py
+++ b/diffxpy/unit_test/test_single_null.py
@@ -27,31 +27,34 @@ def _test_null_distribution_wald(
         """
         if noise_model == "nb":
             from batchglm.api.models.glm_nb import Simulator
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
         elif noise_model == "norm":
             from batchglm.api.models.glm_norm import Simulator
+            rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
         else:
             raise ValueError("noise model %s not recognized" % noise_model)
 
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
         sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        sim.generate_params(rand_fn_scale=rand_fn_scale)
+        sim.generate_data()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations),
-            "batch": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs),
+            "batch": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.input_data,
+            sample_description=random_sample_description,
             factor_loc_totest="condition",
             formula_loc="~ 1 + condition + batch",
-            sample_description=random_sample_description,
             batch_size=500,
             noise_model=noise_model,
             training_strategy="DEFAULT",
             dtype="float64"
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
@@ -89,19 +92,19 @@ def _test_null_distribution_wald_multi(
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(4, size=sim.num_observations)
+            "condition": np.random.randint(4, size=sim.nobs)
         })
 
         test = de.test.wald(
-            data=sim.X,
+            data=sim.input_data,
+            sample_description=random_sample_description,
             factor_loc_totest="condition",
             formula_loc="~ 1 + condition",
-            sample_description=random_sample_description,
             noise_model=noise_model,
             training_strategy="DEFAULT",
             dtype="float64"
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
@@ -139,21 +142,21 @@ def _test_null_distribution_lrt(
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.lrt(
-            data=sim.X,
+            data=sim.input_data,
+            sample_description=random_sample_description,
             full_formula_loc="~ 1 + condition",
             full_formula_scale="~ 1",
             reduced_formula_loc="~ 1",
             reduced_formula_scale="~ 1",
-            sample_description=random_sample_description,
             noise_model=noise_model,
             training_strategy="DEFAULT",
             dtype="float64"
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
@@ -184,17 +187,16 @@ def _test_null_distribution_ttest(
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.t_test(
-            data=sim.X,
-            grouping="condition",
+            data=sim.input_data,
             sample_description=random_sample_description,
-            is_logged=False,
-            dtype="float64"
+            grouping="condition",
+            is_logged=False
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
@@ -225,16 +227,15 @@ def _test_null_distribution_rank(
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.rank_test(
-            data=sim.X,
-            grouping="condition",
+            data=sim.input_data,
             sample_description=random_sample_description,
-            dtype="float64"
+            grouping="condition"
         )
-        summary = test.summary()
+        _ = test.summary()
 
         # Compare p-value distribution under null model against uniform distribution.
         pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
@@ -250,6 +251,7 @@ class TestSingleNullStandard(_TestSingleNull, unittest.TestCase):
     Noise model-independent unit tests that test whether a test generates uniformly
     distributed p-values if data are sampled from the null model.
     """
+
     def test_null_distribution_ttest(
             self,
             n_cells: int = 2000,
@@ -265,6 +267,7 @@ def test_null_distribution_ttest(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_ttest(
             n_cells=n_cells,
             n_genes=n_genes
@@ -285,13 +288,14 @@ def test_null_distribution_rank(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_rank(
             n_cells=n_cells,
             n_genes=n_genes
         )
 
 
-class TestSingleNullNB(_TestSingleNull, unittest.TestCase):
+class TestSingleNullNb(_TestSingleNull, unittest.TestCase):
     """
     Negative binomial noise model unit tests that test whether a test generates uniformly
     distributed p-values if data are sampled from the null model.
@@ -312,6 +316,7 @@ def test_null_distribution_wald_nb(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_wald(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -334,6 +339,7 @@ def test_null_distribution_wald_multi_nb(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_wald_multi(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -355,6 +361,7 @@ def test_null_distribution_lrt_nb(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_lrt(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -362,7 +369,7 @@ def test_null_distribution_lrt_nb(
         )
 
 
-class TestSingleNullNORM(_TestSingleNull, unittest.TestCase):
+class TestSingleNullNorm(_TestSingleNull, unittest.TestCase):
     """
     Normal noise model unit tests that test whether a test generates uniformly
     distributed p-values if data are sampled from the null model.
@@ -382,6 +389,7 @@ def test_null_distribution_wald_norm(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_wald(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -404,6 +412,7 @@ def test_null_distribution_wald_multi_norm(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_wald_multi(
             n_cells=n_cells,
             n_genes=n_genes,
@@ -425,11 +434,13 @@ def test_null_distribution_lrt_norm(
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
+        np.random.seed(1)
         return self._test_null_distribution_lrt(
             n_cells=n_cells,
             n_genes=n_genes,
             noise_model="norm"
         )
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/diffxpy/unit_test/test_vsrest.py b/diffxpy/unit_test/test_vsrest.py
index 28a56cf..9a78505 100644
--- a/diffxpy/unit_test/test_vsrest.py
+++ b/diffxpy/unit_test/test_vsrest.py
@@ -29,11 +29,11 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.versus_rest(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="wald",
             noise_model="nb",
@@ -72,11 +72,11 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100):
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.num_observations)
+            "condition": np.random.randint(2, size=sim.nobs)
         })
 
         test = de.test.versus_rest(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="lrt",
             noise_model="nb",
@@ -115,11 +115,11 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.versus_rest(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="rank",
             sample_description=random_sample_description,
@@ -155,11 +155,11 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100,
         sim.generate()
 
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.num_observations)
+            "condition": np.random.randint(n_groups, size=sim.nobs)
         })
 
         test = de.test.versus_rest(
-            data=sim.X,
+            data=sim.x,
             grouping="condition",
             test="t-test",
             sample_description=random_sample_description,
diff --git a/requirements.txt b/requirements.txt
index 7d2d66e..1b9b636 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ numpy>=1.14.0
 scipy
 pandas
 patsy>=0.5.0
-batchglm>=0.4.0
+batchglm>=0.6.1
 xarray
 statsmodels
 anndata
diff --git a/setup.py b/setup.py
index 2f43452..e8acae9 100644
--- a/setup.py
+++ b/setup.py
@@ -17,12 +17,11 @@
     long_description_content_type="text/markdown",
     packages=find_packages(),
     install_requires=[
-        'numpy>=1.14.0',
-        'scipy',
+        'numpy==1.16.4',
+        'scipy>=1.2.1',
         'pandas',
         'patsy>=0.5.0',
-        'batchglm>=0.6.1',
-        'xarray',
+        'batchglm>=0.6.3',
         'statsmodels',
     ],
     extras_require={