diff --git a/diffxpy/api/__init__.py b/diffxpy/api/__init__.py index a1c1dad..18dd1b4 100644 --- a/diffxpy/api/__init__.py +++ b/diffxpy/api/__init__.py @@ -3,6 +3,7 @@ from . import test from . import enrich +from . import fit from . import stats from . import utils from .. import pkg_constants diff --git a/diffxpy/api/fit.py b/diffxpy/api/fit.py new file mode 100644 index 0000000..f08ce84 --- /dev/null +++ b/diffxpy/api/fit.py @@ -0,0 +1 @@ +from diffxpy.fit import model, residuals, partition diff --git a/diffxpy/api/utils.py b/diffxpy/api/utils.py index fb653a7..b2e4ded 100644 --- a/diffxpy/api/utils.py +++ b/diffxpy/api/utils.py @@ -1,4 +1,4 @@ from diffxpy.testing.utils import constraint_matrix_from_string, constraint_matrix_from_dict, \ constraint_system_from_star -from diffxpy.testing.utils import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata +from diffxpy.testing.utils import design_matrix from diffxpy.testing.utils import view_coef_names, preview_coef_names diff --git a/diffxpy/fit/__init__.py b/diffxpy/fit/__init__.py new file mode 100644 index 0000000..d1ddf78 --- /dev/null +++ b/diffxpy/fit/__init__.py @@ -0,0 +1 @@ +from .fit import model, residuals, partition \ No newline at end of file diff --git a/diffxpy/fit/external.py b/diffxpy/fit/external.py new file mode 100644 index 0000000..330ea9c --- /dev/null +++ b/diffxpy/fit/external.py @@ -0,0 +1,3 @@ +from diffxpy.testing.tests import _fit +from diffxpy.testing.utils import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \ + constraint_system_from_star \ No newline at end of file diff --git a/diffxpy/fit/fit.py b/diffxpy/fit/fit.py new file mode 100644 index 0000000..995a4d3 --- /dev/null +++ b/diffxpy/fit/fit.py @@ -0,0 +1,811 @@ +import anndata +try: + from anndata.base import Raw +except ImportError: + from anndata import Raw +import logging +import numpy as np +import pandas as pd +import patsy +import scipy.sparse +from typing import Union, List, Dict, Callable, Tuple + +from batchglm.models.base import _InputDataBase +from .external import _fit +from .external import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \ + constraint_system_from_star + + +def model( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + gene_names: Union[np.ndarray, list] = None, + sample_description: Union[None, pd.DataFrame] = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None, + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs +): + """ + Fit model via maximum likelihood for each gene. + + :param data: Input data matrix (observations x features) or (cells x genes). + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + if len(kwargs) != 0: + logging.getLogger("diffxpy").debug("additional kwargs: %s", str(kwargs)) + + if (dmat_loc is None and formula_loc is None) or \ + (dmat_loc is not None and formula_loc is not None): + raise ValueError("Supply either dmat_loc or formula_loc.") + if (dmat_scale is None and formula_scale is None) or \ + (dmat_scale is not None and formula_scale != "~1"): + raise ValueError("Supply either dmat_scale or formula_scale.") + + # # Parse input data formats: + gene_names = parse_gene_names(data, gene_names) + if dmat_loc is None and dmat_scale is None: + sample_description = parse_sample_description(data, sample_description) + size_factors = parse_size_factors( + size_factors=size_factors, + data=data, + sample_description=sample_description + ) + + design_loc, constraints_loc = constraint_system_from_star( + dmat=dmat_loc, + sample_description=sample_description, + formula=formula_loc, + as_numeric=as_numeric, + constraints=constraints_loc, + return_type="patsy" + ) + design_scale, constraints_scale = constraint_system_from_star( + dmat=dmat_scale, + sample_description=sample_description, + formula=formula_scale, + as_numeric=as_numeric, + constraints=constraints_scale, + return_type="patsy" + ) + + model = _fit( + noise_model=noise_model, + data=data, + design_loc=design_loc, + design_scale=design_scale, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + init_a=init_a, + init_b=init_b, + gene_names=gene_names, + size_factors=size_factors, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype + ) + return model + + +def residuals( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + gene_names: Union[np.ndarray, list] = None, + sample_description: Union[None, pd.DataFrame] = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None, + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs +): + """ + Fits model for each gene and returns residuals. + + :param data: Input data matrix (observations x features) or (cells x genes). + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + estim = model( + data=data, + formula_loc=formula_loc, + formula_scale=formula_scale, + as_numeric=as_numeric, + init_a=init_a, + init_b=init_b, + gene_names=gene_names, + sample_description=sample_description, + dmat_loc=dmat_loc, + dmat_scale=dmat_scale, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + noise_model=noise_model, + size_factors=size_factors, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype, + ** kwargs + ) + residuals = estim.x - estim.model.location + return residuals + + +def partition( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + parts: Union[str, np.ndarray, list], + gene_names: Union[np.ndarray, list] = None, + sample_description: pd.DataFrame = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None +): + r""" + Perform differential expression test for each group. This class handles + the partitioning of the data set, the differential test callls and + the sumamry of the individual tests into one + DifferentialExpressionTestMulti object. All functions the yield + DifferentialExpressionTestSingle objects can be performed on each + partition. + + Wraps _Partition so that doc strings are nice. + + :param data: Array-like or anndata.Anndata object containing observations. + Input data matrix (observations x features) or (cells x genes). + :param parts: str, array + + - column in data.obs/sample_description which contains the split of observations into the two groups. + - array of length `num_observations` containing group labels + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations. + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + """ + return (_Partition( + data=data, + parts=parts, + gene_names=gene_names, + sample_description=sample_description, + dmat_loc=dmat_loc, + dmat_scale=dmat_scale, + size_factors=size_factors + )) + + +class _Partition: + """ + Perform model fit separately for each group. + """ + + def __init__( + self, + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + parts: Union[str, np.ndarray, list], + gene_names: Union[np.ndarray, list] = None, + sample_description: pd.DataFrame = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None + ): + """ + :param data: Array-like or anndata.Anndata object containing observations. + Input data matrix (observations x features) or (cells x genes). + :param parts: str, array + + - column in data.obs/sample_description which contains the split of observations into the two groups. + - array of length `num_observations` containing group labels + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations. + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + """ + if isinstance(data, _InputDataBase): + self.x = data.x + elif isinstance(data, anndata.AnnData) or isinstance(data, Raw): + self.x = data.X + elif isinstance(data, np.ndarray): + self.x = data + else: + raise ValueError("data type %s not recognized" % type(data)) + self.gene_names = parse_gene_names(data, gene_names) + self.sample_description = parse_sample_description(data, sample_description) + self.dmat_loc = dmat_loc + self.dmat_scale = dmat_scale + self.size_factors = size_factors + self.partition = parse_grouping(data, sample_description, parts) + self.partitions = np.unique(self.partition) + self.partition_idx = [np.where(self.partition == x)[0] for x in self.partitions] + + def model( + self, + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs + ): + """ + Fit model for each gene and for each group. + + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + estims = [] + for i, idx in enumerate(self.partition_idx): + estims.append(model( + data=self.x[idx, :], + formula_loc=formula_loc, + formula_scale=formula_scale, + as_numeric=as_numeric, + init_a=init_a, + init_b=init_b, + gene_names=self.gene_names, + sample_description=self.sample_description.iloc[idx, :], + dmat_loc=self.dmat_loc[idx, :] if self.dmat_loc is not None else None, + dmat_scale=self.dmat_scale[idx, :] if self.dmat_scale is not None else None, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + noise_model=noise_model, + size_factors=self.size_factors[idx] if self.size_factors is not None else None, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype, + **kwargs + )) + return estims + + def residuals( + self, + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs + ): + """ + Fit model for each gene and for each group and computes residuals. + + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + residuals = [] + for i, idx in enumerate(self.partition_idx): + residuals.append(residuals( + data=self.x[idx, :], + formula_loc=formula_loc, + formula_scale=formula_scale, + as_numeric=as_numeric, + init_a=init_a, + init_b=init_b, + gene_names=self.gene_names, + sample_description=self.sample_description.iloc[idx, :], + dmat_loc=self.dmat_loc[idx, :] if self.dmat_loc is not None else None, + dmat_scale=self.dmat_scale[idx, :] if self.dmat_scale is not None else None, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + noise_model=noise_model, + size_factors=self.size_factors[idx] if self.size_factors is not None else None, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype, + **kwargs + )) + return residuals + + diff --git a/diffxpy/models/batch_bfgs/optim.py b/diffxpy/models/batch_bfgs/optim.py index ef27ec5..98c234e 100644 --- a/diffxpy/models/batch_bfgs/optim.py +++ b/diffxpy/models/batch_bfgs/optim.py @@ -16,17 +16,17 @@ class Estim_BFGS_Model(): def __init__(self, Estim_BFGS, nproc): - self._num_observations = Estim_BFGS.X.shape[0] - self._num_features = Estim_BFGS.X.shape[1] + self._num_observations = Estim_BFGS.x.shape[0] + self._num_features = Estim_BFGS.x.shape[1] self._features = Estim_BFGS.feature_names - self._observations = Estim_BFGS.X.shape[0] + self._observations = Estim_BFGS.x.shape[0] self._design_loc = Estim_BFGS.design_loc self._design_scale = Estim_BFGS.design_scale self._loss = xr.DataArray(Estim_BFGS.full_loss(nproc)) self._log_probs = -self._loss self._probs = np.exp(self._log_probs) self._mles = xr.DataArray(np.transpose(Estim_BFGS.mles())) - self._gradient = xr.DataArray(np.zeros([Estim_BFGS.X.shape[1]])) + self._gradient = xr.DataArray(np.zeros([Estim_BFGS.x.shape[1]])) self._fisher_inv = xr.DataArray(Estim_BFGS.fisher_inv) self._idx_loc = np.arange(0, Estim_BFGS.design_loc.shape[1]) self._idx_scale = np.arange(Estim_BFGS.design_loc.shape[1], diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py index 389f621..399a45d 100644 --- a/diffxpy/pkg_constants.py +++ b/diffxpy/pkg_constants.py @@ -1,5 +1,3 @@ -DE_TREAT_ZEROVAR_TT_AS_SIG = True - BATCHGLM_OPTIM_GD = False BATCHGLM_OPTIM_ADAM = False BATCHGLM_OPTIM_ADAGRAD = False diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index 9f60d26..5b8f8fe 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -1,9 +1,8 @@ -from typing import Union - import numpy as np import numpy.linalg +import scipy.sparse import scipy.stats -import xarray as xr +from typing import Union def likelihood_ratio_test( @@ -34,42 +33,40 @@ def likelihood_ratio_test( delta_df = df_full - df_reduced # Compute the deviance test statistic. delta_dev = 2 * (ll_full - ll_reduced) - # Compute the p-values based on the deviance and its expection based on the chi-square distribution. + # Compute the p-values based on the deviance and its expectation based on the chi-square distribution. pvals = 1 - scipy.stats.chi2(delta_df).cdf(delta_dev) return pvals def mann_whitney_u_test( - x0: np.ndarray, - x1: np.ndarray, + x0: Union[np.ndarray, scipy.sparse.csr_matrix], + x1: Union[np.ndarray, scipy.sparse.csr_matrix] ): """ - Perform Wilcoxon rank sum test (Mann-Whitney U test) along second axis - (for each gene). + Perform Wilcoxon rank sum test (Mann-Whitney U test) along second axis, ie. for each gene. The Wilcoxon rank sum test is a non-parameteric test to compare two groups of observations. - :param x0: np.array (observations x genes) + :param x0: (observations x genes) Observations in first group by gene - :param x1: np.array (observations x genes) + :param x1: (observations x genes) Observations in second group by gene. """ axis = 1 if np.any(np.ndim(x0) != np.ndim(x1)): - raise ValueError('stats.wilcoxon(): number of dimensions is not allowed to differ between x0 and x1!') + raise ValueError('number of dimensions is not allowed to differ between x0 and x1') # Reshape into 2D array if only one test is performed. if np.ndim(x0) == 1: x0 = x0.reshape([x0.shape[0], 1]) x1 = x1.reshape([x1.shape[0], 1]) if np.any(x0.shape[axis] != x1.shape[axis]): - raise ValueError( - 'stats.wilcoxon(): the first axis (number of tests) is not allowed to differ between x0 and x1!') + raise ValueError('the first axis (number of tests) is not allowed to differ between x0 and x1') pvals = np.asarray([ scipy.stats.mannwhitneyu( - x=x0[:, i].flatten(), - y=x1[:, i].flatten(), + x=np.asarray(x0[:, i].todense()).flatten() if isinstance(x0, scipy.sparse.csr_matrix) else x0[:, i], + y=np.asarray(x1[:, i].todense()).flatten() if isinstance(x0, scipy.sparse.csr_matrix) else x1[:, i], use_continuity=True, alternative="two-sided" ).pvalue for i in range(x0.shape[1]) @@ -252,6 +249,8 @@ def wald_test_chisq( if theta_mle.shape[0] != theta_covar.shape[1]: raise ValueError( 'stats.wald_test(): theta_mle and theta_covar have to contain the same number of parameters') + if len(theta_covar.shape) != 3: + raise ValueError('stats.wald_test(): theta_covar should have 3 dimensions but has %i' % len(theta_covar.shape)) if theta_mle.shape[1] != theta_covar.shape[0]: raise ValueError('stats.wald_test(): theta_mle and theta_covar have to contain the same number of genes') if theta_covar.shape[1] != theta_covar.shape[2]: @@ -264,8 +263,6 @@ def wald_test_chisq( theta_diff = theta_mle - theta0 # Convert to nd.array to avoid gufunc error. - if isinstance(theta_diff, xr.DataArray): - theta_diff = theta_diff.values wald_statistic = np.array([ np.matmul( np.matmul( diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index 412a15d..94c6f99 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -3,20 +3,19 @@ from typing import Union, Dict, Tuple, List, Set import pandas as pd from random import sample +import scipy.sparse import numpy as np -import xarray as xr import patsy -from .utils import split_X, dmat_unique +from .utils import split_x, dmat_unique try: import anndata except ImportError: anndata = None -from batchglm.xarray_sparse.base import SparseXArrayDataArray -from batchglm.models.glm_nb import Model as GeneralizedLinearModel +from batchglm.models.base import _EstimatorBase, _InputDataBase from ..stats import stats from . import correction @@ -25,84 +24,6 @@ logger = logging.getLogger("diffxpy") -class _Estimation(GeneralizedLinearModel, metaclass=abc.ABCMeta): - """ - Dummy class specifying all needed methods / parameters necessary for a model - fitted for DifferentialExpressionTest. - Useful for type hinting. - """ - - @property - @abc.abstractmethod - def X(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def design_loc(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def design_scale(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def constraints_loc(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def constraints_scale(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def num_observations(self) -> int: - pass - - @property - @abc.abstractmethod - def num_features(self) -> int: - pass - - @property - @abc.abstractmethod - def features(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def observations(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def log_likelihood(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def loss(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def gradients(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def hessians(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def fisher_inv(self, **kwargs) -> np.ndarray: - pass - - class _DifferentialExpressionTest(metaclass=abc.ABCMeta): """ Dummy class specifying all needed methods / parameters necessary for DifferentialExpressionTest. @@ -132,7 +53,7 @@ def gene_ids(self) -> np.ndarray: @property @abc.abstractmethod - def X(self): + def x(self): pass @abc.abstractmethod @@ -169,24 +90,22 @@ def _correction(self, method) -> np.ndarray: def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene. - :return: xr.DataArray + :return: np.ndarray """ pass @property def log_likelihood(self): if self._log_likelihood is None: - self._log_likelihood = self._ll().compute() + self._log_likelihood = self._ll() return self._log_likelihood @property def mean(self): if self._mean is None: self._mean = self._ave() - if isinstance(self._mean, xr.DataArray): # Could also be np.ndarray coming out of XArraySparseDataArray - self._mean = self._mean.compute() return self._mean @property @@ -340,10 +259,7 @@ def plot_volcano( logfc = np.reshape(self.log2_fold_change(), -1) # Clipping throws errors if not performed in actual data format (ndarray or DataArray): - if isinstance(logfc, xr.DataArray): - logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold) - else: - logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) + logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) fig, ax = plt.subplots() @@ -389,6 +305,7 @@ def plot_volcano( plt.show() plt.close(fig) + plt.ion() if return_axs: return ax @@ -450,10 +367,7 @@ def plot_ma( logfc = np.reshape(self.log2_fold_change(), -1) # Clipping throws errors if not performed in actual data format (ndarray or DataArray): - if isinstance(logfc, xr.DataArray): - logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold) - else: - logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) + logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) fig, ax = plt.subplots() @@ -554,17 +468,17 @@ class DifferentialExpressionTestLRT(_DifferentialExpressionTestSingle): sample_description: pd.DataFrame full_design_loc_info: patsy.design_info - full_estim: _Estimation + full_estim: _EstimatorBase reduced_design_loc_info: patsy.design_info - reduced_estim: _Estimation + reduced_estim: _EstimatorBase def __init__( self, sample_description: pd.DataFrame, full_design_loc_info: patsy.design_info, - full_estim, + full_estim: _EstimatorBase, reduced_design_loc_info: patsy.design_info, - reduced_estim + reduced_estim: _EstimatorBase ): super().__init__() self.sample_description = sample_description @@ -575,19 +489,19 @@ def __init__( @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.full_estim.features) + return np.asarray(self.full_estim.input_data.features) @property - def X(self): - return self.full_estim.X + def x(self): + return self.full_estim.x @property def reduced_model_gradient(self): - return self.reduced_estim.gradients + return self.reduced_estim.jacobian @property def full_model_gradient(self): - return self.full_estim.gradients + return self.full_estim.jacobian def _test(self): if np.any(self.full_estim.log_likelihood < self.reduced_estim.log_likelihood): @@ -596,27 +510,29 @@ def _test(self): return stats.likelihood_ratio_test( ll_full=self.full_estim.log_likelihood, ll_reduced=self.reduced_estim.log_likelihood, - df_full=self.full_estim.constraints_loc.shape[1] + self.full_estim.constraints_scale.shape[1], - df_reduced=self.reduced_estim.constraints_loc.shape[1] + self.reduced_estim.constraints_scale.shape[1], + df_full=self.full_estim.input_data.constraints_loc.shape[1] + + self.full_estim.input_data.constraints_scale.shape[1], + df_reduced=self.reduced_estim.input_data.constraints_loc.shape[1] + + self.reduced_estim.input_data.constraints_scale.shape[1], ) def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return np.mean(self.full_estim.X, axis=0) + return np.asarray(np.mean(self.full_estim.x, axis=0)).flatten() def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): """ - Returns a xr.DataArray containing the locations for the different categories of the factors + Returns the locations for the different categories of the factors :param factors: the factors to select. E.g. `condition` or `batch` if formula would be `~ 1 + batch + condition` :param base: the log base to use; default is the natural logarithm - :return: xr.DataArray + :return: np.ndarray """ if not (isinstance(factors, list) or isinstance(factors, tuple) or isinstance(factors, set)): @@ -626,7 +542,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.subset(factors).factor_infos]] - dmat = self.full_estim.design_loc + dmat = self.full_estim.input_data.design_loc # make rows unique dmat, sample_description = dmat_unique(dmat, sample_description) @@ -645,19 +561,11 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): # make the design matrix + sample description unique again dmat, sample_description = dmat_unique(dmat, sample_description) - locations = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) + locations = self.full_estim.model.inverse_link_loc(np.matmul(dmat, self.full_estim.model.a)) locations = np.log(locations) / np.log(base) dist = np.expand_dims(locations, axis=0) dist = np.transpose(dist, [1, 0, 2]) - dist - dist = xr.DataArray(dist, dims=("minuend", "subtrahend", "gene")) - # retval = xr.Dataset({"logFC": retval}) - - dist.coords["gene"] = self.gene_ids - - for col in sample_description: - dist.coords["minuend_" + col] = (("minuend",), sample_description[col]) - dist.coords["subtrahend_" + col] = (("subtrahend",), sample_description[col]) # # If this is a pairwise comparison, return only one fold change per gene # if dist.shape[:2] == (2, 2): @@ -695,7 +603,7 @@ def log_fold_change(self, base=np.e, return_type="vector"): return None else: dists = self._log_fold_change(factors=factors, base=base) - return dists[1, 0].values + return dists[1, 0] else: dists = self._log_fold_change(factors=factors, base=base) return dists @@ -709,12 +617,12 @@ def locations(self): di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] - dmat = self.full_estim.design_loc + dmat = self.full_estim.input_data.design_loc dmat, sample_description = dmat_unique(dmat, sample_description) - retval = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) - retval = pd.DataFrame(retval, columns=self.full_estim.features) + retval = self.full_estim.model.inverse_link_loc(np.matmul(dmat, self.full_estim.model.a)) + retval = pd.DataFrame(retval, columns=self.full_estim.input_data.features) for col in sample_description: retval[col] = sample_description[col] @@ -731,12 +639,12 @@ def scales(self): di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] - dmat = self.full_estim.design_scale + dmat = self.full_estim.input_data.design_scale dmat, sample_description = dmat_unique(dmat, sample_description) retval = self.full_estim.inverse_link_scale(dmat.doc(self.full_estim.par_link_scale)) - retval = pd.DataFrame(retval, columns=self.full_estim.features) + retval = pd.DataFrame(retval, columns=self.full_estim.input_data.features) for col in sample_description: retval[col] = sample_description[col] @@ -781,7 +689,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle): Single wald test per gene. """ - model_estim: _Estimation + model_estim: _EstimatorBase sample_description: pd.DataFrame coef_loc_totest: np.ndarray theta_mle: np.ndarray @@ -791,7 +699,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle): def __init__( self, - model_estim: _Estimation, + model_estim: _EstimatorBase, col_indices: np.ndarray, noise_model: str, sample_description: pd.DataFrame @@ -809,28 +717,32 @@ def __init__( self._store_ols = None try: - if model_estim._error_codes is not None: - self._error_codes = model_estim._error_codes + if self.model_estim.error_codes is not None: + self._error_codes = self.model_estim.error_codes + else: + self._error_codes = None except Exception as e: self._error_codes = None try: - if model_estim._niter is not None: - self._niter = model_estim._niter + if self.model_estim.niter is not None: + self._niter = self.model_estim.niter + else: + self._niter = None except Exception as e: self._niter = None @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.features) + return np.asarray(self.model_estim.input_data.features) @property - def X(self): - return self.model_estim.X + def x(self): + return self.model_estim.x @property def model_gradient(self): - return self.model_estim.gradients + return self.model_estim.jacobian def log_fold_change(self, base=np.e, **kwargs): """ @@ -854,35 +766,34 @@ def log_fold_change(self, base=np.e, **kwargs): def _ll(self): """ - Returns a xr.DataArray containing the log likelihood of each gene + Returns the log likelihood of each gene - :return: xr.DataArray + :return: np.ndarray """ return self.model_estim.log_likelihood def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return self.X.mean(axis=0) + return np.asarray(self.x.mean(axis=0)).flatten() def _test(self): """ - Returns a xr.DataArray containing the p-value for differential expression for each gene + Returns the p-value for differential expression for each gene - :return: xr.DataArray + :return: np.ndarray """ # Check whether single- or multiple parameters are tested. # For a single parameter, the wald statistic distribution is approximated # with a normal distribution, for multiple parameters, a chi-square distribution is used. self.theta_mle = self.model_estim.a_var[self.coef_loc_totest] if len(self.coef_loc_totest) == 1: - self.theta_mle = self.theta_mle[0] # Make xarray one dimensional for stats.wald_test. - self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]].values - self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, - where=self.theta_sd < np.nextafter(0, np.inf)) + self.theta_mle = self.theta_mle[0] + self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]] + self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf)) self.theta_sd = np.sqrt(self.theta_sd) return stats.wald_test( theta_mle=self.theta_mle, @@ -891,12 +802,11 @@ def _test(self): ) else: self.theta_sd = np.diagonal(self.model_estim.fisher_inv, axis1=-2, axis2=-1).copy() - self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, - where=self.theta_sd < np.nextafter(0, np.inf)) + self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf)) self.theta_sd = np.sqrt(self.theta_sd) return stats.wald_test_chisq( theta_mle=self.theta_mle, - theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, self.coef_loc_totest], + theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, :][:, :, self.coef_loc_totest], theta0=0 ) @@ -944,12 +854,19 @@ def summary( def plot_vs_ttest( self, log10=False, + show: bool = True, + save: Union[str, None] = None, + suffix: str = "_plot_vs_ttest.png", return_axs: bool = False ): """ Normalizes data by size factors if any were used in model. :param log10: + :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot. + :param save: Path+file name stem to save plots to. + File will be save+suffix. Does not save if save is None. + :param suffix: Suffix for file name to save plot to. Also use this to set the file type. :param return_axs: Whether to return axis objects. :return: @@ -958,12 +875,17 @@ def plot_vs_ttest( import seaborn as sns from .tests import t_test - grouping = np.asarray(self.model_estim.design_loc[:, self.coef_loc_totest]) + plt.ioff() + + grouping = np.asarray(self.model_estim.input_data.design_loc[:, self.coef_loc_totest]) # Normalize by size factors that were used in regression. - sf = np.broadcast_to(np.expand_dims(self.model_estim.size_factors, axis=1), - shape=self.model_estim.X.shape) + if self.model_estim.input_data.size_factors is not None: + sf = np.broadcast_to(np.expand_dims(self.model_estim.input_data.size_factors, axis=1), + shape=self.model_estim.x.shape) + else: + sf = np.ones(shape=(self.model_estim.x.shape[0], 1)) ttest = t_test( - data=self.model_estim.X.multiply(1 / sf, copy=True), + data=self.model_estim.x / sf, grouping=grouping, gene_names=self.gene_ids, ) @@ -980,6 +902,16 @@ def plot_vs_ttest( ax.set(xlabel="t-test", ylabel='wald test') + # Save, show and return figure. + if save is not None: + plt.savefig(save + suffix) + + if show: + plt.show() + + plt.close(fig) + plt.ion() + if return_axs: return ax else: @@ -1019,11 +951,13 @@ def plot_comparison_ols_coef( import matplotlib.pyplot as plt from matplotlib import gridspec from matplotlib import rcParams - from batchglm.api.models.glm_norm import Estimator, InputData + from batchglm.api.models.glm_norm import Estimator, InputDataGLM + + plt.ioff() # Run OLS model fit to have comparison coefficients. if self._store_ols is None: - input_data_ols = InputData.new( + input_data_ols = InputDataGLM( data=self.model_estim.input_data.data, design_loc=self.model_estim.input_data.design_loc, design_scale=self.model_estim.input_data.design_scale[:, [0]], @@ -1048,10 +982,10 @@ def plot_comparison_ols_coef( # Prepare parameter summary of both model fits. par_loc = self.model_estim.input_data.data.coords["design_loc_params"].values - a_var_ols = store_ols.a_var.values + a_var_ols = store_ols.a_var a_var_ols[1:, :] = (a_var_ols[1:, :] + a_var_ols[[0], :]) / a_var_ols[[0], :] - a_var_user = self.model_estim.a_var.values + a_var_user = self.model_estim.a_var # Translate coefficients from both fits to be multiplicative in identity space. if self.noise_model == "nb": a_var_user = np.exp(a_var_user) # self.model_estim.inverse_link_loc(a_var_user) @@ -1156,11 +1090,13 @@ def plot_comparison_ols_pred( import matplotlib.pyplot as plt from matplotlib import gridspec from matplotlib import rcParams - from batchglm.api.models.glm_norm import Estimator, InputData + from batchglm.api.models.glm_norm import Estimator, InputDataGLM + + plt.ioff() # Run OLS model fit to have comparison coefficients. if self._store_ols is None: - input_data_ols = InputData.new( + input_data_ols = InputDataGLM( data=self.model_estim.input_data.data, design_loc=self.model_estim.input_data.design_loc, design_scale=self.model_estim.input_data.design_scale[:, [0]], @@ -1203,19 +1139,16 @@ def plot_comparison_ols_pred( pred_n_cells = sample( population=list(np.arange(0, self.model_estim.X.shape[0])), - k=np.min([20, self.model_estim.design_loc.shape[0]]) + k=np.min([20, self.model_estim.input_data.design_loc.shape[0]]) ) - if isinstance(self.model_estim.X, SparseXArrayDataArray): - x = np.asarray(self.model_estim.X.X[pred_n_cells, :].todense()).flatten() - else: - x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten() + x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten() - y_user = self.model_estim.inverse_link_loc( - np.matmul(self.model_estim.design_loc[pred_n_cells, :].values, self.model_estim.a_var.values).flatten() + y_user = self.model_estim.model.inverse_link_loc( + np.matmul(self.model_estim.input_data.design_loc[pred_n_cells, :], self.model_estim.a_var).flatten() ) y_ols = store_ols.inverse_link_loc( - np.matmul(store_ols.design_loc[pred_n_cells, :].values, store_ols.a_var.values).flatten() + np.matmul(store_ols.design_loc[pred_n_cells, :], store_ols.a_var).flatten() ) if log1p_transform: x = np.log(x+1) @@ -1309,15 +1242,15 @@ def _assemble_gene_fits( :param log1p_transform: Whether to log transform observations before estimating the distribution with boxplot. Model estimates are adjusted accordingly. :param incl_fits: Whether to include fits in plot. - :return summaries_genes: List with data frame for seabron in it. + :return summaries_genes: List with data frame for seaborn in it. """ summaries_genes = [] for i, g in enumerate(gene_names): - assert g in self.model_estim.features, "gene %g not found" % g - g_idx = self.model_estim.features.tolist().index(g) + assert g in self.model_estim.input_data.features, "gene %g not found" % g + g_idx = self.model_estim.input_data.features.index(g) # Raw data for boxplot: - y = self.model_estim.X[:, g_idx] + y = self.model_estim.x[:, g_idx] # Model fits: loc = self.model_estim.location[:, g_idx] scale = self.model_estim.scale[:, g_idx] @@ -1613,16 +1546,20 @@ def __init__( is_sig_zerovar: bool = True ): super().__init__() - self._X = data + if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw): + data = data.X + elif isinstance(data, _InputDataBase): + data = data.x + self._x = data self.sample_description = sample_description self.grouping = grouping self._gene_names = np.asarray(gene_names) - x0, x1 = split_X(data, grouping) + x0, x1 = split_x(data, grouping) # Only compute p-values for genes with non-zero observations and non-zero group-wise variance. - mean_x0 = x0.mean(axis=0).astype(dtype=np.float) - mean_x1 = x1.mean(axis=0).astype(dtype=np.float) + mean_x0 = np.asarray(np.mean(x0, axis=0)).flatten().astype(dtype=np.float) + mean_x1 = np.asarray(np.mean(x1, axis=0)).flatten().astype(dtype=np.float) # Avoid unnecessary mean computation: self._mean = np.average( a=np.vstack([mean_x0, mean_x1]), @@ -1632,8 +1569,13 @@ def __init__( returned=False ) self._ave_nonzero = self._mean != 0 # omit all-zero features - var_x0 = np.asarray(x0.var(axis=0)).flatten().astype(dtype=np.float) - var_x1 = np.asarray(x1.var(axis=0)).flatten().astype(dtype=np.float) + if isinstance(x0, scipy.sparse.csr_matrix): + # Efficient analytic expression of variance without densification. + var_x0 = np.asarray(np.mean(x0.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x0) + var_x1 = np.asarray(np.mean(x1.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x1) + else: + var_x0 = np.asarray(np.var(x0, axis=0)).flatten().astype(dtype=np.float) + var_x1 = np.asarray(np.var(x1, axis=0)).flatten().astype(dtype=np.float) self._var_geq_zero = np.logical_or( var_x0 > 0, var_x1 > 0 @@ -1652,6 +1594,10 @@ def __init__( np.logical_and(mean_x0 == mean_x1, self._mean > 0), np.logical_not(self._var_geq_zero) ))[0]] = 1.0 + # Depening on user choice via is_sig_zerovar: + # Set p-value to 0 if LFC was non-zero and variances are zero, + # this causes division by zero in the test statistic. This + # is a highly significant result if one believes the variance estimate. if is_sig_zerovar: pval[np.where(np.logical_and( mean_x0 != mean_x1, @@ -1664,36 +1610,14 @@ def __init__( self._logfc = mean_x1 - mean_x0 else: self._logfc = np.log(mean_x1) - np.log(mean_x0) - # Return 0 if LFC was non-zero and variances are zero, - # this causes division by zero in the test statistic. This - # is a highly significant result if one believes the variance estimate. - # This is the default which can be changed and can be changed - # via DIFFXPY_TREAT_ZEROVAR_TT_AS_SIG. - pval[np.where(np.logical_and(np.logical_and( - np.logical_not(self._var_geq_zero), - self._ave_nonzero), - np.abs(self._logfc) < np.nextafter(0, 1) - ))] = 0 - if pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG: - pval[np.where(np.logical_and(np.logical_and( - np.logical_not(self._var_geq_zero), - self._ave_nonzero), - np.abs(self._logfc) >= np.nextafter(0, 1) - ))] = 1 - else: - pval[np.where(np.logical_and(np.logical_and( - np.logical_not(self._var_geq_zero), - self._ave_nonzero), - np.abs(self._logfc) >= np.nextafter(0, 1) - ))] = 0 @property def gene_ids(self) -> np.ndarray: return self._gene_names @property - def X(self): - return self._X + def x(self): + return self._x def log_fold_change(self, base=np.e, **kwargs): """ @@ -1747,15 +1671,19 @@ def __init__( is_sig_zerovar: bool = True ): super().__init__() - self._X = data + if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw): + data = data.X + elif isinstance(data, _InputDataBase): + data = data.x + self._x = data self.sample_description = sample_description self.grouping = grouping self._gene_names = np.asarray(gene_names) - x0, x1 = split_X(data, grouping) + x0, x1 = split_x(data, grouping) - mean_x0 = x0.mean(axis=0).astype(dtype=np.float) - mean_x1 = x1.mean(axis=0).astype(dtype=np.float) + mean_x0 = np.asarray(np.mean(x0, axis=0)).flatten().astype(dtype=np.float) + mean_x1 = np.asarray(np.mean(x1, axis=0)).flatten().astype(dtype=np.float) # Avoid unnecessary mean computation: self._mean = np.average( a=np.vstack([mean_x0, mean_x1]), @@ -1764,30 +1692,33 @@ def __init__( axis=0, returned=False ) - var_x0 = np.asarray(x0.var(axis=0)).flatten().astype(dtype=np.float) - var_x1 = np.asarray(x1.var(axis=0)).flatten().astype(dtype=np.float) + if isinstance(x0, scipy.sparse.csr_matrix): + # Efficient analytic expression of variance without densification. + var_x0 = np.asarray(np.mean(x0.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x0) + var_x1 = np.asarray(np.mean(x1.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x1) + else: + var_x0 = np.asarray(np.var(x0, axis=0)).flatten().astype(dtype=np.float) + var_x1 = np.asarray(np.var(x1, axis=0)).flatten().astype(dtype=np.float) self._var_geq_zero = np.logical_or( var_x0 > 0, var_x1 > 0 ) idx_run = np.where(np.logical_and(self._mean != 0, self._var_geq_zero))[0] - # TODO: can this be done on sparse? + # TODO: can this be done directly on sparse? pval = np.zeros([data.shape[1]]) + np.nan - if isinstance(x0, xr.DataArray): - pval[idx_run] = stats.mann_whitney_u_test( - x0=x0.data[:, idx_run], - x1=x1.data[:, idx_run] - ) - else: - pval[idx_run] = stats.mann_whitney_u_test( - x0=x0.X[:, idx_run].toarray(), - x1=x1.X[:, idx_run].toarray() - ) + pval[idx_run] = stats.mann_whitney_u_test( + x0=x0[:, idx_run], + x1=x1[:, idx_run] + ) pval[np.where(np.logical_and( np.logical_and(mean_x0 == mean_x1, self._mean > 0), np.logical_not(self._var_geq_zero) ))[0]] = 1.0 + # Depening on user choice via is_sig_zerovar: + # Set p-value to 0 if LFC was non-zero and variances are zero, + # this causes division by zero in the test statistic. This + # is a highly significant result if one believes the variance estimate. if is_sig_zerovar: pval[np.where(np.logical_and( mean_x0 != mean_x1, @@ -1806,8 +1737,8 @@ def gene_ids(self) -> np.ndarray: return self._gene_names @property - def X(self): - return self._X + def x(self): + return self._x def log_fold_change(self, base=np.e, **kwargs): """ @@ -1855,7 +1786,7 @@ def plot_vs_ttest(self, log10=False): grouping = self.grouping ttest = t_test( - data=self.X, + data=self.x, grouping=grouping, gene_names=self.gene_ids, ) @@ -1967,7 +1898,7 @@ def gene_ids(self) -> np.ndarray: return self._gene_ids @property - def X(self): + def x(self): return None @property @@ -2159,11 +2090,17 @@ class DifferentialExpressionTestZTest(_DifferentialExpressionTestMulti): Pairwise unit_test between more than 2 groups per gene. """ - model_estim: _Estimation + model_estim: _EstimatorBase theta_mle: np.ndarray theta_sd: np.ndarray - def __init__(self, model_estim: _Estimation, grouping, groups, correction_type: str): + def __init__( + self, + model_estim: _EstimatorBase, + grouping, + groups, + correction_type: str + ): super().__init__(correction_type=correction_type) self.model_estim = model_estim self.grouping = grouping @@ -2182,7 +2119,7 @@ def __init__(self, model_estim: _Estimation, grouping, groups, correction_type: def _test(self, **kwargs): groups = self.groups - num_features = self.model_estim.X.shape[1] + num_features = self.model_estim.x.shape[1] pvals = np.tile(np.NaN, [len(groups), len(groups), num_features]) pvals[np.eye(pvals.shape[0]).astype(bool)] = 1 @@ -2202,28 +2139,28 @@ def _test(self, **kwargs): @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.features) + return np.asarray(self.model_estim.input_data.features) @property - def X(self): - return self.model_estim.X + def x(self): + return self.model_estim.x @property def log_likelihood(self): - return np.sum(self.model_estim.log_probs(), axis=0) + return self.model_estim.log_likelihood @property def model_gradient(self): - return self.model_estim.gradients + return self.model_estim.jacobian def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return np.mean(self.model_estim.X, axis=0) + return np.mean(self.model_estim.x, axis=0) def log_fold_change(self, base=np.e, **kwargs): """ @@ -2231,7 +2168,7 @@ def log_fold_change(self, base=np.e, **kwargs): """ if self._logfc is None: groups = self.groups - num_features = self.model_estim.X.shape[1] + num_features = self.model_estim.x.shape[1] logfc = np.tile(np.NaN, [len(groups), len(groups), num_features]) logfc[np.eye(logfc.shape[0]).astype(bool)] = 0 @@ -2356,11 +2293,16 @@ class DifferentialExpressionTestZTestLazy(_DifferentialExpressionTestMulti): memory. """ - model_estim: _Estimation + model_estim: _EstimatorBase _theta_mle: np.ndarray _theta_sd: np.ndarray - def __init__(self, model_estim: _Estimation, grouping, groups, correction_type="global"): + def __init__( + self, + model_estim: _EstimatorBase, + grouping, groups, + correction_type="global" + ): super().__init__(correction_type=correction_type) self.model_estim = model_estim self.grouping = grouping @@ -2370,7 +2312,7 @@ def __init__(self, model_estim: _Estimation, grouping, groups, correction_type=" self.groups = groups.tolist() # values of parameter estimates: coefficients x genes array with one coefficient per group - self._theta_mle = model_estim.par_link_loc + self._theta_mle = model_estim.a_var # standard deviation of estimates: coefficients x genes array with one coefficient per group # theta_sd = sqrt(diagonal(fisher_inv)) self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T @@ -2410,7 +2352,7 @@ def _test(self, **kwargs): pass def _test_pairs(self, groups0, groups1): - num_features = self.model_estim.X.shape[1] + num_features = self.model_estim.x.shape[1] pvals = np.tile(np.NaN, [len(groups0), len(groups1), num_features]) @@ -2430,28 +2372,28 @@ def _test_pairs(self, groups0, groups1): @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.features) + return np.asarray(self.model_estim.input_data.features) @property - def X(self): - return self.model_estim.X + def x(self): + return self.model_estim.x @property def log_likelihood(self): - return np.sum(self.model_estim.log_probs(), axis=0) + return self.model_estim.log_likelihood @property def model_gradient(self): - return self.model_estim.gradients + return self.model_estim.jacobian def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return np.mean(self.model_estim.X, axis=0) + return np.mean(self.model_estim.x, axis=0) @property def pval(self, **kwargs): @@ -2459,7 +2401,8 @@ def pval(self, **kwargs): This function is not available in lazy results evaluation as it would require all pairwise tests to be performed. """ - pass + raise ValueError("This function is not available in lazy results evaluation as it would " + "require all pairwise tests to be performed.") @property def qval(self, **kwargs): @@ -2467,7 +2410,8 @@ def qval(self, **kwargs): This function is not available in lazy results evaluation as it would require all pairwise tests to be performed. """ - pass + raise ValueError("This function is not available in lazy results evaluation as it would " + "require all pairwise tests to be performed.") def log_fold_change(self, base=np.e, **kwargs): """ @@ -2499,9 +2443,9 @@ def summary( pass def _check_groups(self, groups0, groups1): - if isinstance(groups0, list)==False: + if not isinstance(groups0, list): groups0 = [groups0] - if isinstance(groups1, list)==False: + if not isinstance(groups1, list): groups1 = [groups1] for g in groups0: if g not in self.groups: @@ -2587,7 +2531,7 @@ def log_fold_change_pairs(self, groups0=None, groups1=None, base=np.e): logfc = np.zeros(shape=(len(groups0), len(groups1), num_features)) for i, g0 in enumerate(groups0): for j, g1 in enumerate(groups1): - logfc[i, j, :] = self._theta_mle[g0, :].values - self._theta_mle[g1, :].values + logfc[i, j, :] = self._theta_mle[g0, :] - self._theta_mle[g1, :] if base == np.e: return logfc @@ -2718,7 +2662,16 @@ class DifferentialExpressionTestVsRest(_DifferentialExpressionTestMulti): Tests between between each group and the rest for more than 2 groups per gene. """ - def __init__(self, gene_ids, pval, logfc, ave, groups, tests, correction_type: str): + def __init__( + self, + gene_ids, + pval, + logfc, + ave, + groups, + tests, + correction_type: str + ): super().__init__(correction_type=correction_type) self._gene_ids = np.asarray(gene_ids) self._pval = pval @@ -2744,7 +2697,7 @@ def gene_ids(self) -> np.ndarray: return self._gene_ids @property - def X(self) -> Union[np.ndarray, None]: + def x(self) -> Union[np.ndarray, None]: return None def log_fold_change(self, base=np.e, **kwargs): @@ -2853,7 +2806,7 @@ def gene_ids(self) -> np.ndarray: return self._gene_ids @property - def X(self) -> np.ndarray: + def x(self) -> np.ndarray: return None def log_fold_change(self, base=np.e, **kwargs): @@ -2903,7 +2856,7 @@ def summary(self, qval_thres=None, fc_upper_thres=None, class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle): _de_test: _DifferentialExpressionTestSingle - _model_estim: _Estimation + _model_estim: _EstimatorBase _size_factors: np.ndarray _continuous_coords: np.ndarray _spline_coefs: list @@ -2911,7 +2864,7 @@ class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle): def __init__( self, de_test: _DifferentialExpressionTestSingle, - model_estim: _Estimation, + model_estim: _EstimatorBase, size_factors: np.ndarray, continuous_coords: np.ndarray, spline_coefs: list, @@ -2929,8 +2882,8 @@ def gene_ids(self) -> np.ndarray: return self._de_test.gene_ids @property - def X(self): - return self._de_test.X + def x(self): + return self._de_test.x @property def pval(self) -> np.ndarray: @@ -2948,8 +2901,14 @@ def mean(self) -> np.ndarray: def log_likelihood(self) -> np.ndarray: return self._de_test.log_likelihood - def summary(self, nonnumeric=False, qval_thres=None, fc_upper_thres=None, - fc_lower_thres=None, mean_thres=None) -> pd.DataFrame: + def summary( + self, + nonnumeric=False, + qval_thres=None, + fc_upper_thres=None, + fc_lower_thres=None, + mean_thres=None + ) -> pd.DataFrame: """ Summarize differential expression results into an output table. @@ -2987,7 +2946,7 @@ def log_fold_change(self, base=np.e, genes=None, nonnumeric=False): :return: Log-fold change of fitted expression value by gene. """ if genes is None: - genes = np.asarray(range(self.X.shape[1])) + genes = np.asarray(range(self.x.shape[1])) else: genes = self._idx_genes(genes) @@ -3016,7 +2975,7 @@ def _filter_genes_int(self, genes: list): :param genes: List of genes to filter. :return: Filtered list of genes """ - genes_found = np.array([x < self.X.shape[1] for x in genes]) + genes_found = np.array([x < self.x.shape[1] for x in genes]) if any(genes_found == False): logger.info("did not find some genes, omitting") genes = genes[genes_found] @@ -3062,13 +3021,13 @@ def _continuous_model(self, idx, non_numeric=False): """ idx = np.asarray(idx) if non_numeric: - mu = np.matmul(self._model_estim.design_loc.values, + mu = np.matmul(self._model_estim.input_data.design_loc, self._model_estim.par_link_loc[:, idx]) if self._size_factors is not None: mu = mu + self._size_factors else: idx_basis = self._spline_par_loc_idx(intercept=True) - mu = np.matmul(self._model_estim.design_loc[:, idx_basis].values, + mu = np.matmul(self._model_estim.input_data.design_loc[:, idx_basis], self._model_estim.par_link_loc[idx_basis, idx]) mu = np.exp(mu) @@ -3189,7 +3148,7 @@ def plot_genes( ax = plt.subplot(gs[i]) axs.append(ax) - y = self.X[:, g] + y = self.x[:, g] yhat = self._continuous_model(idx=g, non_numeric=non_numeric) if log: y = np.log(y + 1) diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index cba2037..0a87d2f 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -1,20 +1,17 @@ -from typing import Union, List, Dict, Callable, Tuple - import anndata +try: + from anndata.base import Raw +except ImportError: + from anndata import Raw import logging import numpy as np import pandas as pd import patsy import scipy.sparse -import xarray as xr - -try: - from anndata.base import Raw -except ImportError: - from anndata import Raw +from typing import Union, List, Dict, Callable, Tuple from batchglm import data as data_utils -from batchglm.xarray_sparse import SparseXArrayDataSet +from batchglm.models.base import _EstimatorBase, _InputDataBase from diffxpy import pkg_constants from diffxpy.models.batch_bfgs.optim import Estim_BFGS from .det import DifferentialExpressionTestLRT, DifferentialExpressionTestWald, \ @@ -22,12 +19,9 @@ DifferentialExpressionTestZTestLazy, DifferentialExpressionTestZTest, DifferentialExpressionTestPairwise, \ DifferentialExpressionTestVsRest, _DifferentialExpressionTestMulti, DifferentialExpressionTestByPartition, \ DifferentialExpressionTestWaldCont, DifferentialExpressionTestLRTCont -from .utils import parse_gene_names, parse_data, parse_sample_description, parse_size_factors, parse_grouping, \ +from .utils import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \ constraint_system_from_star -# Use this to suppress matrix subclass PendingDepreceationWarnings from numpy: -np.warnings.filterwarnings("ignore") - def _fit( noise_model, @@ -46,7 +40,7 @@ def _fit( quick_scale: bool = None, close_session=True, dtype="float64" -): +) -> _EstimatorBase: """ :param noise_model: str, noise model to use in model-based unit_test. Possible options: @@ -133,6 +127,7 @@ def _fit( } if isinstance(training_strategy, str) and training_strategy.lower() == 'bfgs': + assert False, "depreceated" lib_size = np.zeros(data.shape[0]) if noise_model == "nb" or noise_model == "negative_binomial": estim = Estim_BFGS(X=data, design_loc=design_loc, design_scale=design_scale, @@ -143,15 +138,13 @@ def _fit( raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model) else: if noise_model == "nb" or noise_model == "negative_binomial": - from batchglm.api.models.glm_nb import Estimator, InputData + from batchglm.api.models.glm_nb import Estimator, InputDataGLM elif noise_model == "norm" or noise_model == "normal": - from batchglm.api.models.glm_norm import Estimator, InputData + from batchglm.api.models.glm_norm import Estimator, InputDataGLM else: raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model) - logging.getLogger("diffxpy").info("Fitting model...") - logging.getLogger("diffxpy").debug(" * Assembling input data...") - input_data = InputData.new( + input_data = InputDataGLM( data=data, design_loc=design_loc, design_scale=design_scale, @@ -161,7 +154,6 @@ def _fit( feature_names=gene_names, ) - logging.getLogger("diffxpy").debug(" * Set up Estimator...") constructor_args = {} if batch_size is not None: constructor_args["batch_size"] = batch_size @@ -179,12 +171,9 @@ def _fit( dtype=dtype, **constructor_args ) - - logging.getLogger("diffxpy").debug(" * Initializing Estimator...") estim.initialize() - logging.getLogger("diffxpy").debug(" * Run estimation...") - # training: + # Training: if callable(training_strategy): # call training_strategy if it is a function training_strategy(estim) @@ -192,17 +181,13 @@ def _fit( estim.train_sequence(training_strategy=training_strategy) if close_session: - logging.getLogger("diffxpy").debug(" * Finalize estimation...") - model = estim.finalize() - else: - model = estim - logging.getLogger("diffxpy").debug(" * Model fitting done.") + estim.finalize() - return model + return estim def lrt( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], full_formula_loc: str, reduced_formula_loc: str, full_formula_scale: str = "~1", @@ -215,7 +200,7 @@ def lrt( noise_model="nb", size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray] = None, batch_size: int = None, - training_strategy: Union[str, List[Dict[str, object]], Callable] = "DEFAULT", + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", quick_scale: bool = False, dtype="float64", **kwargs @@ -306,11 +291,10 @@ def lrt( as_numeric = [as_numeric] gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) sample_description = parse_sample_description(data, sample_description) size_factors = parse_size_factors( size_factors=size_factors, - data=X, + data=data, sample_description=sample_description ) @@ -341,7 +325,7 @@ def lrt( reduced_model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=reduced_design_loc, design_scale=reduced_design_scale, constraints_loc=None, @@ -358,7 +342,7 @@ def lrt( ) full_model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=full_design_loc, design_scale=full_design_scale, constraints_loc=None, @@ -387,7 +371,7 @@ def lrt( def wald( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], factor_loc_totest: Union[str, List[str]] = None, coef_to_test: Union[str, List[str]] = None, formula_loc: Union[None, str] = None, @@ -397,8 +381,8 @@ def wald( init_b: Union[np.ndarray, str] = "AUTO", gene_names: Union[np.ndarray, list] = None, sample_description: Union[None, pd.DataFrame] = None, - dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None, - dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, noise_model: str = "nb", @@ -566,12 +550,11 @@ def wald( # # Parse input data formats: gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) if dmat_loc is None and dmat_scale is None: sample_description = parse_sample_description(data, sample_description) size_factors = parse_size_factors( size_factors=size_factors, - data=X, + data=data, sample_description=sample_description ) @@ -582,7 +565,6 @@ def wald( formula=formula_loc, as_numeric=as_numeric, constraints=constraints_loc, - dims=["design_loc_params", "loc_params"], return_type="patsy" ) logging.getLogger("diffxpy").debug("building scale model") @@ -592,7 +574,6 @@ def wald( formula=formula_scale, as_numeric=as_numeric, constraints=constraints_scale, - dims=["design_scale_params", "scale_params"], return_type="patsy" ) @@ -637,7 +618,7 @@ def wald( model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=design_loc, design_scale=design_scale, constraints_loc=constraints_loc, @@ -664,19 +645,18 @@ def wald( def t_test( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping, gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None, is_logged: bool = False, - is_sig_zerovar: bool = True, - dtype="float64" + is_sig_zerovar: bool = True ): """ Perform Welch's t-test for differential expression between two groups on adata object for each gene. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -692,13 +672,10 @@ def t_test( the p-value is set to np.nan. """ gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) - if isinstance(X, SparseXArrayDataSet): - X = X.X grouping = parse_grouping(data, sample_description, grouping) de_test = DifferentialExpressionTestTT( - data=X.astype(dtype), + data=data, sample_description=sample_description, grouping=grouping, gene_names=gene_names, @@ -710,19 +687,18 @@ def t_test( def rank_test( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None, is_logged: bool = False, - is_sig_zerovar: bool = True, - dtype="float64" + is_sig_zerovar: bool = True ): """ Perform Mann-Whitney rank test (Wilcoxon rank-sum test) for differential expression between two groups on adata object for each gene. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -738,13 +714,10 @@ def rank_test( the p-value is set to np.nan. """ gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) - if isinstance(X, SparseXArrayDataSet): - X = X.X grouping = parse_grouping(data, sample_description, grouping) de_test = DifferentialExpressionTestRank( - data=X.astype(dtype), + data=data, sample_description=sample_description, grouping=grouping, gene_names=gene_names, @@ -756,7 +729,7 @@ def rank_test( def two_sample( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = "t-test", @@ -802,7 +775,7 @@ def two_sample( Doesn't require fitting of generalized linear models. Wilcoxon rank sum (Mann-Whitney U) test between both observation groups. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -851,7 +824,6 @@ def two_sample( 'The t-test is based on a gaussian noise model and wilcoxon is model free.') gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) grouping = parse_grouping(data, sample_description, grouping) sample_description = pd.DataFrame({"grouping": grouping}) @@ -867,7 +839,7 @@ def two_sample( formula_loc = '~ 1 + grouping' formula_scale = '~ 1 + grouping' de_test = wald( - data=X, + data=data, factor_loc_totest="grouping", as_numeric=as_numeric, coef_to_test=None, @@ -891,7 +863,7 @@ def two_sample( reduced_formula_loc = '~ 1' reduced_formula_scale = '~ 1' de_test = lrt( - data=X, + data=data, full_formula_loc=full_formula_loc, reduced_formula_loc=reduced_formula_loc, full_formula_scale=full_formula_scale, @@ -909,7 +881,7 @@ def two_sample( ) elif test.lower() == 't-test' or test.lower() == "t_test" or test.lower() == "ttest": de_test = t_test( - data=X, + data=data, gene_names=gene_names, grouping=grouping, is_sig_zerovar=is_sig_zerovar, @@ -917,7 +889,7 @@ def two_sample( ) elif test.lower() == 'rank': de_test = rank_test( - data=X, + data=data, gene_names=gene_names, grouping=grouping, is_sig_zerovar=is_sig_zerovar, @@ -930,7 +902,7 @@ def two_sample( def pairwise( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = 'z-test', @@ -982,7 +954,7 @@ def pairwise( Doesn't require fitting of generalized linear models. Wilcoxon rank sum (Mann-Whitney U) test between both observation groups. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -1048,7 +1020,6 @@ def pairwise( # Do not store all models but only p-value and q-value matrix: # genes x groups x groups gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) sample_description = parse_sample_description(data, sample_description) grouping = parse_grouping(data, sample_description, grouping) sample_description = pd.DataFrame({"grouping": grouping}) @@ -1061,7 +1032,7 @@ def pairwise( ) model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=dmat, design_scale=dmat, gene_names=gene_names, @@ -1089,9 +1060,9 @@ def pairwise( ) else: groups = np.unique(grouping) - pvals = np.tile(np.NaN, [len(groups), len(groups), X.shape[1]]) + pvals = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]]) pvals[np.eye(pvals.shape[0]).astype(bool)] = 0 - logfc = np.tile(np.NaN, [len(groups), len(groups), X.shape[1]]) + logfc = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]]) logfc[np.eye(logfc.shape[0]).astype(bool)] = 0 if keep_full_test_objs: @@ -1105,7 +1076,7 @@ def pairwise( sel = (grouping == g1) | (grouping == g2) de_test_temp = two_sample( - data=X[sel], + data=data[sel], grouping=grouping[sel], as_numeric=as_numeric, test=test, @@ -1132,7 +1103,7 @@ def pairwise( gene_ids=gene_names, pval=pvals, logfc=logfc, - ave=np.mean(X, axis=0), + ave=np.mean(data, axis=0), groups=groups, tests=tests, correction_type=pval_correction @@ -1142,7 +1113,7 @@ def pairwise( def versus_rest( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = 'wald', @@ -1194,7 +1165,7 @@ def versus_rest( Doesn't require fitting of generalized linear models. Wilcoxon rank sum (Mann-Whitney U) test between both observation groups. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -1254,14 +1225,13 @@ def versus_rest( # Do not store all models but only p-value and q-value matrix: # genes x groups gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) sample_description = parse_sample_description(data, sample_description) grouping = parse_grouping(data, sample_description, grouping) sample_description = pd.DataFrame({"grouping": grouping}) groups = np.unique(grouping) - pvals = np.zeros([1, len(groups), X.shape[1]]) - logfc = np.zeros([1, len(groups), X.shape[1]]) + pvals = np.zeros([1, len(groups), data.shape[1]]) + logfc = np.zeros([1, len(groups), data.shape[1]]) if keep_full_test_objs: tests = np.tile([None], [1, len(groups)]) @@ -1271,7 +1241,7 @@ def versus_rest( for i, g1 in enumerate(groups): test_grouping = np.where(grouping == g1, "group", "rest") de_test_temp = two_sample( - data=X, + data=data, grouping=test_grouping, as_numeric=as_numeric, test=test, @@ -1295,7 +1265,7 @@ def versus_rest( gene_ids=gene_names, pval=pvals, logfc=logfc, - ave=np.mean(X, axis=0), + ave=np.mean(data, axis=0), groups=groups, tests=tests, correction_type=pval_correction @@ -1305,7 +1275,7 @@ def versus_rest( def partition( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], parts: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None @@ -1320,7 +1290,7 @@ def partition( Wraps _Partition so that doc strings are nice. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param parts: str, array @@ -1348,13 +1318,13 @@ class _Partition: def __init__( self, - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], parts: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None ): """ - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param parts: str, array @@ -1363,7 +1333,14 @@ def __init__( :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these :param sample_description: optional pandas.DataFrame containing sample annotations """ - self.X = parse_data(data, gene_names) + if isinstance(data, _InputDataBase): + self.x = data.x + elif isinstance(data, anndata.AnnData) or isinstance(data, Raw): + self.x = data.X + elif isinstance(data, np.ndarray): + self.x = data + else: + raise ValueError("data type %s not recognized" % type(data)) self.gene_names = parse_gene_names(data, gene_names) self.sample_description = parse_sample_description(data, sample_description) self.partition = parse_grouping(data, sample_description, parts) @@ -1423,7 +1400,7 @@ def two_sample( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(two_sample( - data=self.X[idx, :], + data=self.x[idx, :], grouping=grouping, as_numeric=as_numeric, test=test, @@ -1468,7 +1445,7 @@ def t_test( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(t_test( - data=self.X[idx, :], + data=self.x[idx, :], grouping=grouping, is_logged=is_logged, gene_names=self.gene_names, @@ -1479,7 +1456,7 @@ def t_test( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def rank_test( @@ -1505,7 +1482,7 @@ def rank_test( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(rank_test( - data=self.X[idx, :], + data=self.x[idx, :], grouping=grouping, gene_names=self.gene_names, sample_description=self.sample_description.iloc[idx, :], @@ -1515,7 +1492,7 @@ def rank_test( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def lrt( @@ -1594,7 +1571,7 @@ def lrt( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(lrt( - data=self.X[idx, :], + data=self.x[idx, :], reduced_formula_loc=reduced_formula_loc, full_formula_loc=full_formula_loc, reduced_formula_scale=reduced_formula_scale, @@ -1613,7 +1590,7 @@ def lrt( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def wald( @@ -1694,7 +1671,7 @@ def wald( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(wald( - data=self.X[idx, :], + data=self.x[idx, :], factor_loc_totest=factor_loc_totest, coef_to_test=coef_to_test, formula_loc=formula_loc, @@ -1713,12 +1690,12 @@ def wald( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def continuous_1d( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix], continuous: str, df: int = 5, factor_loc_totest: Union[str, List[str]] = None, @@ -1759,7 +1736,7 @@ def continuous_1d( design matrices are built within this function and the shape of constraint matrices depends on the output of this function. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param continuous: str @@ -1906,7 +1883,6 @@ def continuous_1d( if isinstance(as_numeric, tuple): as_numeric = list(as_numeric) - X = parse_data(data, gene_names) gene_names = parse_gene_names(data, gene_names) sample_description = parse_sample_description(data, sample_description) @@ -1967,7 +1943,7 @@ def continuous_1d( logging.getLogger("diffxpy").debug("formula_scale_new: " + formula_scale_new) de_test = wald( - data=X, + data=data, factor_loc_totest=factor_loc_totest_new, coef_to_test=None, formula_loc=formula_loc_new, @@ -2020,7 +1996,7 @@ def continuous_1d( logging.getLogger("diffxpy").debug("reduced_formula_scale: " + reduced_formula_scale) de_test = lrt( - data=X, + data=data, full_formula_loc=full_formula_loc, reduced_formula_loc=reduced_formula_loc, full_formula_scale=full_formula_scale, diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py index 3c348f1..3eb525e 100644 --- a/diffxpy/testing/utils.py +++ b/diffxpy/testing/utils.py @@ -2,9 +2,8 @@ import numpy as np import pandas as pd import patsy -import scipy +import scipy.sparse from typing import List, Tuple, Union -import xarray as xr try: from anndata.base import Raw @@ -12,37 +11,30 @@ from anndata import Raw from batchglm import data as data_utils +from batchglm.models.base import _InputDataBase # Relay util functions for diffxpy api. # design_matrix, preview_coef_names and constraint_system_from_star are redefined here. from batchglm.data import constraint_matrix_from_string, constraint_matrix_from_dict -from batchglm.data import design_matrix_from_xarray, design_matrix_from_anndata from batchglm.data import view_coef_names -def parse_gene_names(data, gene_names): +def parse_gene_names( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + gene_names: Union[list, np.ndarray, None] +): if gene_names is None: if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, Raw)): gene_names = data.var_names - elif isinstance(data, xr.DataArray): - gene_names = data["features"] - elif isinstance(data, xr.Dataset): - gene_names = data["features"] + elif isinstance(data, _InputDataBase): + gene_names = data.features else: raise ValueError("Missing gene names") return np.asarray(gene_names) -def parse_data(data, gene_names) -> xr.DataArray: - X = data_utils.xarray_from_data(data, dims=("observations", "features")) - if gene_names is not None: - X.coords["features"] = gene_names - - return X - - def parse_sample_description( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], sample_description: Union[pd.DataFrame, None] ) -> pd.DataFrame: """ @@ -54,33 +46,32 @@ def parse_sample_description( """ if sample_description is None: if anndata is not None and isinstance(data, anndata.AnnData): - sample_description = data_utils.sample_description_from_anndata( - dataset=data, - ) - elif isinstance(data, xr.Dataset): - sample_description = data_utils.sample_description_from_xarray( - dataset=data, - dim="observations", - ) + sample_description = data.obs else: raise ValueError( - "Please specify `sample_description` or provide `data` as xarray.Dataset or anndata.AnnData " + + "Please specify `sample_description` or provide `data` as anndata.AnnData " + "with corresponding sample annotations" ) if anndata is not None and isinstance(data, Raw): # Raw does not have attribute shape. assert data.X.shape[0] == sample_description.shape[0], \ - "data matrix and sample description must contain same number of cells" + "data matrix and sample description must contain same number of cells: %i, %i" % \ + (data.X.shape[0], sample_description.shape[0]) + elif isinstance(data, _InputDataBase): + assert data.x.shape[0] == sample_description.shape[0], \ + "data matrix and sample description must contain same number of cells: %i, %i" % \ + (data.x.shape[0], sample_description.shape[0]) else: assert data.shape[0] == sample_description.shape[0], \ - "data matrix and sample description must contain same number of cells" + "data matrix and sample description must contain same number of cells: %i, %i" % \ + (data.shape[0], sample_description.shape[0]) return sample_description def parse_size_factors( size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray], - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], sample_description: pd.DataFrame ) -> Union[np.ndarray, None]: """ @@ -111,7 +102,8 @@ def parse_grouping(data, sample_description, grouping): return np.squeeze(np.asarray(grouping)) -def split_X(data, grouping): +def split_x(data, grouping): + grouping = np.asarray(grouping) groups = np.unique(grouping) x0 = data[np.where(grouping == groups[0])[0]] x1 = data[np.where(grouping == groups[1])[0]] @@ -126,14 +118,14 @@ def dmat_unique(dmat, sample_description): def design_matrix( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix] = None, sample_description: Union[None, pd.DataFrame] = None, formula: Union[None, str] = None, as_numeric: Union[List[str], Tuple[str], str] = (), dmat: Union[pd.DataFrame, None] = None, return_type: str = "xarray" -) -> Union[patsy.design_info.DesignMatrix, xr.Dataset, pd.DataFrame]: +) -> Union[patsy.design_info.DesignMatrix, pd.DataFrame]: """ Create a design matrix from some sample description. This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame. @@ -155,8 +147,6 @@ def design_matrix( - "patsy": return plain patsy.design_info.DesignMatrix object - "dataframe": return pd.DataFrame with observations as rows and params as columns - - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as - one variable per column :param dmat: model design matrix """ if data is None and sample_description is None and dmat is None: @@ -217,13 +207,12 @@ def preview_coef_names( def constraint_system_from_star( - dmat: Union[None, np.ndarray, xr.DataArray, xr.Dataset] = None, + dmat: Union[None, np.ndarray] = None, sample_description: Union[None, pd.DataFrame] = None, formula: Union[None, str] = None, as_numeric: Union[List[str], Tuple[str], str] = (), constraints: dict = {}, - dims: Union[Tuple[str, str], List[str]] = (), - return_type: str = "xarray", + return_type: str = "patsy", ) -> Tuple: """ Create a design matrix and a constraint matrix. @@ -251,17 +240,12 @@ def constraint_system_from_star( Can only group by non-constrained effects right now, use constraint_matrix_from_string for other cases. - :param dims: Dimension names of xarray. - - E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"] :param return_type: type of the returned value. - "patsy": return plain patsy.design_info.DesignMatrix object - "dataframe": return pd.DataFrame with observations as rows and params as columns - - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as - one variable per column This option is overridden if constraints are supplied as dict. - :return: a model design matrix and a constraint matrix formatted as xr.DataArray + :return: a model design matrix and a constraint matrix """ if isinstance(as_numeric, str): as_numeric = [as_numeric] @@ -279,6 +263,5 @@ def constraint_system_from_star( formula=formula, as_categorical=as_categorical, constraints=constraints, - dims=dims, return_type=return_type ) diff --git a/diffxpy/unit_test/test_constrained.py b/diffxpy/unit_test/test_constrained.py index 9ef21a3..78b4851 100644 --- a/diffxpy/unit_test/test_constrained.py +++ b/diffxpy/unit_test/test_constrained.py @@ -45,26 +45,22 @@ def test_forfatal_from_string(self): # Build constraints: constraints_loc = de.utils.constraint_matrix_from_string( dmat=dmat_est_loc, - constraints=["bio1+bio2=0", "bio3+bio4=0"], - dims=["design_loc_params", "loc_params"] + constraints=["bio1+bio2=0", "bio3+bio4=0"] ) constraints_scale = de.utils.constraint_matrix_from_string( dmat=dmat_est_scale, - constraints=["bio1+bio2=0", "bio3+bio4=0"], - dims=["design_scale_params", "scale_params"] + constraints=["bio1+bio2=0", "bio3+bio4=0"] ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["treatment1"] ) - summary = test.summary() - - return True + _ = test.summary() def test_forfatal_from_dict(self): """ @@ -102,16 +98,14 @@ def test_forfatal_from_dict(self): ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_loc, dmat_scale=dmat_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["cond[T.cond1]"] ) - summary = test.summary() - - return True + _ = test.summary() def test_null_distribution_wald_constrained(self, n_genes: int = 100): """ @@ -155,14 +149,14 @@ def test_null_distribution_wald_constrained(self, n_genes: int = 100): ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_loc, dmat_scale=dmat_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["cond[T.cond1]"] ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -230,13 +224,12 @@ def test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100): "bio5+bio6=0", "bio7+bio8=0", "tech1+tech2=0", - "tech3+tech4=0"], - dims=["design_loc_params", "loc_params"] + "tech3+tech4=0"] ) constraints_scale = None test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, @@ -298,26 +291,24 @@ def test_null_distribution_wald_multi_constrained_2layer(self, n_genes: int = 50 dmat=dmat_est_loc, constraints=["bio1+bio2=0", "bio3+bio4=0", - "bio5+bio6=0"], - dims=["design_loc_params", "loc_params"] + "bio5+bio6=0"] ) constraints_scale = de.utils.constraint_matrix_from_string( dmat=dmat_est_scale, constraints=["bio1+bio2=0", "bio3+bio4=0", - "bio5+bio6=0"], - dims=["design_scale_params", "scale_params"] + "bio5+bio6=0"] ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["treatment1", "treatment2"] ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue diff --git a/diffxpy/unit_test/test_continuous.py b/diffxpy/unit_test/test_continuous.py index 642ece4..57ad345 100644 --- a/diffxpy/unit_test/test_continuous.py +++ b/diffxpy/unit_test/test_continuous.py @@ -30,8 +30,8 @@ def test_forfatal_functions(self): sim.generate() random_sample_description = pd.DataFrame({ - "pseudotime": np.random.random(size=sim.num_observations), - "batch": np.random.randint(2, size=sim.num_observations) + "pseudotime": np.random.random(size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) }) test = de.test.continuous_1d( @@ -89,7 +89,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100): sim.generate() random_sample_description = pd.DataFrame({ - "pseudotime": np.random.random(size=sim.num_observations) + "pseudotime": np.random.random(size=sim.nobs) }) test = de.test.continuous_1d( @@ -135,7 +135,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100): sim.generate() random_sample_description = pd.DataFrame({ - "pseudotime": np.random.random(size=sim.num_observations) + "pseudotime": np.random.random(size=sim.nobs) }) test = de.test.continuous_1d( diff --git a/diffxpy/unit_test/test_correction.py b/diffxpy/unit_test/test_correction.py index 6941688..cbd602e 100644 --- a/diffxpy/unit_test/test_correction.py +++ b/diffxpy/unit_test/test_correction.py @@ -4,7 +4,7 @@ import scipy.stats as stats import logging -from batchglm.api.models.glm_nb import Simulator, Estimator, InputData +from batchglm.api.models.glm_nb import Simulator, Estimator, InputDataGLM import diffxpy.api as de @@ -12,5 +12,6 @@ class TestCorrection(unittest.TestCase): pass + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index 8381c3d..f4874bb 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -15,95 +15,81 @@ class TestDataTypesSingle(unittest.TestCase): def _test_wald(self, data, sample_description, gene_names=None): test = de.test.wald( data=data, - factor_loc_totest="condition", - formula_loc="~ 1 + condition", sample_description=sample_description, gene_names=gene_names, - quick_scale=True, - training_strategy="DEFAULT", - dtype="float64" + factor_loc_totest="condition", + formula_loc="~ 1 + condition", + noise_model="nb", + batch_size=5 ) - summary = test.summary() + _ = test.summary() def _test_lrt(self, data, sample_description, gene_names=None): test = de.test.lrt( data=data, - full_formula_loc="~ 1 + condition", - reduced_formula_loc="~ 1", sample_description=sample_description, gene_names=gene_names, - quick_scale=True, - training_strategy="DEFAULT", - dtype="float64" + full_formula_loc="~ 1 + condition", + reduced_formula_loc="~ 1", + noise_model="nb" ) - summary = test.summary() + _ = test.summary() def _test_t_test(self, data, sample_description, gene_names=None): test = de.test.t_test( data=data, - grouping="condition", sample_description=sample_description, - gene_names=gene_names + gene_names=gene_names, + grouping="condition" ) - summary = test.summary() + _ = test.summary() def _test_rank(self, data, sample_description, gene_names=None): test = de.test.rank_test( data=data, - grouping="condition", sample_description=sample_description, - gene_names=gene_names + gene_names=gene_names, + grouping="condition" ) - summary = test.summary() + _ = test.summary() - def simulate(self, n_cells: int = 20, n_genes: int = 2): + def simulate(self, n_cells: int = 200, n_genes: int = 2): sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.input_data.num_observations) }) - return sim.X ,random_sample_description + return sim.x, random_sample_description def _test_numpy(self, sparse): data, sample_description = self.simulate() - gene_names = data.features - data = data.values + gene_names = ["gene" + str(i) for i in range(data.shape[1])] if sparse: data = scipy.sparse.csr_matrix(data) self._test_wald(data=data, sample_description=sample_description, gene_names=gene_names) - #self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names) + self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names) self._test_t_test(data=data, sample_description=sample_description, gene_names=gene_names) self._test_rank(data=data, sample_description=sample_description, gene_names=gene_names) - def _test_xarray(self): - data, sample_description = self.simulate() - - self._test_wald(data=data, sample_description=sample_description) - #self._test_lrt(data=data, sample_description=sample_description) - self._test_t_test(data=data, sample_description=sample_description) - self._test_rank(data=data, sample_description=sample_description) - def _test_anndata(self, sparse): data, sample_description = self.simulate() - gene_names = [str(x) for x in data.features.values] - data = data.values + gene_names = ["gene" + str(i) for i in range(data.shape[1])] if sparse: data = scipy.sparse.csr_matrix(data) data = anndata.AnnData(data) data.var_names = gene_names self._test_wald(data=data, sample_description=sample_description) - #self._test_lrt(data=data, sample_description=sample_description) + self._test_lrt(data=data, sample_description=sample_description) self._test_t_test(data=data, sample_description=sample_description) self._test_rank(data=data, sample_description=sample_description) def _test_anndata_raw(self, sparse): data, sample_description = self.simulate() - gene_names = [str(x) for x in data.features.values] - data = data.values + gene_names = ["gene" + str(i) for i in range(data.shape[1])] if sparse: data = scipy.sparse.csr_matrix(data) @@ -111,7 +97,7 @@ def _test_anndata_raw(self, sparse): data.var_names = gene_names data.raw = data self._test_wald(data=data.raw, sample_description=sample_description) - #self._test_lrt(data=data.raw, sample_description=sample_description) + self._test_lrt(data=data.raw, sample_description=sample_description) self._test_t_test(data=data, sample_description=sample_description) self._test_rank(data=data, sample_description=sample_description) @@ -125,15 +111,6 @@ def test_numpy(self): return True - def test_xarray(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) - - self._test_xarray() - - return True - def test_anndata(self): logging.getLogger("tensorflow").setLevel(logging.ERROR) logging.getLogger("batchglm").setLevel(logging.WARNING) @@ -146,5 +123,6 @@ def test_anndata(self): return True + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py index cb19922..c430784 100644 --- a/diffxpy/unit_test/test_extreme_values.py +++ b/diffxpy/unit_test/test_extreme_values.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import scipy.stats as stats from batchglm.api.models.glm_nb import Simulator import diffxpy.api as de @@ -19,25 +18,26 @@ def test_t_test_zero_variance(self): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() - sim.data.X[:, 0] = 0 - sim.data.X[:, 1] = 5 + sim.input_data.x[:, 0] = 0 + sim.input_data.x[:, 1] = 5 random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.t_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, + grouping="condition", is_sig_zerovar=True ) assert np.isnan(test.pval[0]) and test.pval[1] == 1, \ - "rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \ + "t test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \ (test.pval[0], test.pval[1]) return True @@ -49,20 +49,21 @@ def test_rank_test_zero_variance(self): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() - sim.data.X[:, 0] = 0 - sim.data.X[:, 1] = 5 + sim.input_data.x[:, 0] = 0 + sim.input_data.x[:, 1] = 5 random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.rank_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, + grouping="condition", is_sig_zerovar=True ) diff --git a/diffxpy/unit_test/test_fit.py b/diffxpy/unit_test/test_fit.py new file mode 100644 index 0000000..80b177f --- /dev/null +++ b/diffxpy/unit_test/test_fit.py @@ -0,0 +1,286 @@ +import unittest +import logging +import numpy as np +import pandas as pd + +import diffxpy.api as de + + +class _TestFit: + + def _test_model_fit( + self, + n_cells: int, + n_genes: int, + noise_model: str + ): + """ + Test if de.wald() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distribution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + :param noise_model: Noise model to use for data fitting. + """ + if noise_model == "nb": + from batchglm.api.models.glm_nb import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif noise_model == "norm": + from batchglm.api.models.glm_norm import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + else: + raise ValueError("noise model %s not recognized" % noise_model) + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate_params(rand_fn_scale=rand_fn_scale) + sim.generate_data() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) + }) + + estim = de.fit.model( + data=sim.input_data, + sample_description=random_sample_description, + formula_loc="~ 1 + condition + batch", + noise_model=noise_model + ) + return True + + def _test_model_fit_partition( + self, + n_cells: int, + n_genes: int, + noise_model: str + ): + """ + Test if de.wald() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distribution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + :param noise_model: Noise model to use for data fitting. + """ + if noise_model == "nb": + from batchglm.api.models.glm_nb import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif noise_model == "norm": + from batchglm.api.models.glm_norm import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + else: + raise ValueError("noise model %s not recognized" % noise_model) + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate_params(rand_fn_scale=rand_fn_scale) + sim.generate_data() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) + }) + + partition = de.fit.partition( + data=sim.input_data, + sample_description=random_sample_description, + parts="condition" + ) + estim = partition.model( + formula_loc="~ 1 + batch", + noise_model=noise_model + ) + return True + + def _test_residuals_fit( + self, + n_cells: int, + n_genes: int, + noise_model: str + ): + """ + Test if de.wald() (multivariate mode) generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distribution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + :param noise_model: Noise model to use for data fitting. + """ + if noise_model == "nb": + from batchglm.api.models.glm_nb import Simulator + elif noise_model == "norm": + from batchglm.api.models.glm_norm import Simulator + else: + raise ValueError("noise model %s not recognized" % noise_model) + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) + }) + + res = de.fit.residuals( + data=sim.input_data, + sample_description=random_sample_description, + formula_loc="~ 1 + condition + batch", + noise_model=noise_model + ) + return True + + +class TestFitNb(_TestFit, unittest.TestCase): + """ + Negative binomial noise model unit tests that tests whether model fit relay works. + """ + + def test_model_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if model for "nb" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb" + ) + + def test_model_fit_partition( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if partitioned model for "nb" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit_partition( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb" + ) + + def test_residuals_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if residual fit for "nb" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_residuals_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb" + ) + + +class TestFitNorm(_TestFit, unittest.TestCase): + """ + Normal noise model unit tests that tests whether model fit relay works. + """ + + def test_model_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if model fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="norm" + ) + + def test_model_fit_partition( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if partitioned model fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit_partition( + n_cells=n_cells, + n_genes=n_genes, + noise_model="norm" + ) + + def test_residuals_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if residual fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_residuals_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="norm" + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/diffxpy/unit_test/test_numeric.py b/diffxpy/unit_test/test_numeric_covar.py similarity index 92% rename from diffxpy/unit_test/test_numeric.py rename to diffxpy/unit_test/test_numeric_covar.py index 05e53b6..2c3bedd 100644 --- a/diffxpy/unit_test/test_numeric.py +++ b/diffxpy/unit_test/test_numeric_covar.py @@ -7,7 +7,7 @@ import diffxpy.api as de -class TestNumeric(unittest.TestCase): +class TestNumericCovar(unittest.TestCase): def test(self): """ @@ -25,11 +25,11 @@ def test(self): sim.generate_data() sample_description = sim.sample_description - sample_description["numeric1"] = np.random.random(size=sim.num_observations) - sample_description["numeric2"] = np.random.random(size=sim.num_observations) + sample_description["numeric1"] = np.random.random(size=sim.nobs) + sample_description["numeric2"] = np.random.random(size=sim.nobs) test = de.test.wald( - data=sim.X, + data=sim.input_data, sample_description=sample_description, formula_loc="~ 1 + condition + numeric1 + numeric2", formula_scale="~ 1", diff --git a/diffxpy/unit_test/test_pairwise.py b/diffxpy/unit_test/test_pairwise.py index 5b52746..6cc6c59 100644 --- a/diffxpy/unit_test/test_pairwise.py +++ b/diffxpy/unit_test/test_pairwise.py @@ -29,11 +29,11 @@ def test_null_distribution_ztest(self, n_cells: int = 2000, n_genes: int = 100, sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="z-test", noise_model="nb", @@ -69,11 +69,11 @@ def test_null_distribution_z_lazy(self, n_cells: int = 2000, n_genes: int = 100) sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(4, size=sim.num_observations) + "condition": np.random.randint(4, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test='z-test', lazy=True, @@ -112,11 +112,11 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_ sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="lrt", noise_model="nb", @@ -151,11 +151,11 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 10000 sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="t-test", sample_description=random_sample_description, @@ -189,11 +189,11 @@ def test_null_distribution_wilcoxon(self, n_cells: int = 2000, n_genes: int = 10 sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="wilcoxon", sample_description=random_sample_description, @@ -239,7 +239,7 @@ def test_ztest_de(self, n_cells: int = 2000, n_genes: int = 500): sample_description = sim.sample_description test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="z-test", noise_model="nb", diff --git a/diffxpy/unit_test/test_partition.py b/diffxpy/unit_test/test_partition.py index 98d1009..7f83374 100644 --- a/diffxpy/unit_test/test_partition.py +++ b/diffxpy/unit_test/test_partition.py @@ -29,13 +29,13 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations), - "covar2": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs), + "covar2": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -45,7 +45,7 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200): training_strategy="DEFAULT", dtype="float64" ) - summary = det.summary() + _ = det.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue @@ -74,13 +74,13 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int = sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(4, size=sim.num_observations), - "covar2": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(4, size=sim.nobs), + "covar2": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -90,7 +90,7 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int = training_strategy="DEFAULT", dtype="float64" ) - summary = det.summary() + _ = det.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue @@ -119,13 +119,13 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations), - "covar2": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs), + "covar2": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -137,7 +137,7 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200): training_strategy="DEFAULT", dtype="float64" ) - summary = det.summary() + _ = det.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue @@ -166,12 +166,12 @@ def test_null_distribution_ttest(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -209,12 +209,12 @@ def test_null_distribution_rank(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) diff --git a/diffxpy/unit_test/test_single_de.py b/diffxpy/unit_test/test_single_de.py index 693e1fc..8edf03f 100644 --- a/diffxpy/unit_test/test_single_de.py +++ b/diffxpy/unit_test/test_single_de.py @@ -1,13 +1,11 @@ import unittest import logging import numpy as np -import pandas as pd -import scipy.stats as stats import diffxpy.api as de -class _TestSingleDE: +class _TestSingleDe: def _prepare_data( self, @@ -23,8 +21,12 @@ def _prepare_data( """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator + rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape) + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator + rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) @@ -32,22 +34,21 @@ def _prepare_data( sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) sim.generate_params( - rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, - rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) + rand_fn_loc=rand_fn_loc, + rand_fn_scale=rand_fn_scale ) - sim.params["a_var"][1, :num_non_de] = 0 - sim.params["b_var"][1, :num_non_de] = 0 - sim.params["isDE"] = ("features",), np.arange(n_genes) >= num_non_de + sim.a_var[1, :num_non_de] = 0 + sim.b_var[1, :num_non_de] = 0 + self.isDE = np.arange(n_genes) >= num_non_de sim.generate_data() - return sim def _eval(self, sim, test): - idx_de = np.where(sim.params["isDE"] == True)[0] - idx_nonde = np.where(sim.params["isDE"] == False)[0] + idx_de = np.where(self.isDE)[0] + idx_nonde = np.where(np.logical_not(self.isDE))[0] - frac_de_of_non_de = np.sum(test.qval[idx_nonde] < 0.05) / len(idx_nonde) - frac_de_of_de = np.sum(test.qval[idx_de] < 0.05) / len(idx_de) + frac_de_of_non_de = np.mean(test.qval[idx_nonde] < 0.05) + frac_de_of_de = np.mean(test.qval[idx_de] < 0.05) logging.getLogger("diffxpy").info( 'fraction of non-DE genes with q-value < 0.05: %.1f%%' % @@ -82,10 +83,9 @@ def _test_rank_de( ) test = de.test.rank_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=sim.sample_description, - dtype="float64" + grouping="condition" ) self._eval(sim=sim, test=test) @@ -112,10 +112,9 @@ def _test_t_test_de( ) test = de.test.t_test( - data=sim.X, + data=sim.input_data, grouping="condition", - sample_description=sim.sample_description, - dtype="float64" + sample_description=sim.sample_description ) self._eval(sim=sim, test=test) @@ -144,10 +143,10 @@ def _test_wald_de( ) test = de.test.wald( - data=sim.X, + data=sim.input_data, + sample_description=sim.sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", - sample_description=sim.sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" @@ -179,12 +178,12 @@ def _test_lrt_de( ) test = de.test.lrt( - data=sim.X, + data=sim.input_data, + sample_description=sim.sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", reduced_formula_loc="~ 1", reduced_formula_scale="~ 1", - sample_description=sim.sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" @@ -195,7 +194,7 @@ def _test_lrt_de( return True -class TestSingleDE_STANDARD(_TestSingleDE, unittest.TestCase): +class TestSingleDeStandard(_TestSingleDe, unittest.TestCase): """ Noise model-independent tests unit tests that tests false positive and false negative rates. """ @@ -209,6 +208,11 @@ def test_ttest_de( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_t_test_de( n_cells=n_cells, n_genes=n_genes @@ -223,13 +227,18 @@ def test_rank_de( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_rank_de( n_cells=n_cells, n_genes=n_genes ) -class TestSingleDE_NB(_TestSingleDE, unittest.TestCase): +class TestSingleDeNb(_TestSingleDe, unittest.TestCase): """ Negative binomial noise model unit tests that tests false positive and false negative rates. """ @@ -243,6 +252,11 @@ def test_wald_de_nb( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_wald_de( n_cells=n_cells, n_genes=n_genes, @@ -258,6 +272,11 @@ def test_lrt_de_nb( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_lrt_de( n_cells=n_cells, n_genes=n_genes, @@ -265,7 +284,7 @@ def test_lrt_de_nb( ) -class TestSingleDE_NORM(_TestSingleDE, unittest.TestCase): +class TestSingleDeNorm(_TestSingleDe, unittest.TestCase): """ Normal noise model unit tests that tests false positive and false negative rates. """ @@ -279,6 +298,11 @@ def test_wald_de_norm( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_wald_de( n_cells=n_cells, n_genes=n_genes, @@ -294,6 +318,11 @@ def test_lrt_de_norm( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_lrt_de( n_cells=n_cells, n_genes=n_genes, diff --git a/diffxpy/unit_test/test_single_external_libs.py b/diffxpy/unit_test/test_single_external_libs.py index 0590c2a..b1d4883 100644 --- a/diffxpy/unit_test/test_single_external_libs.py +++ b/diffxpy/unit_test/test_single_external_libs.py @@ -1,7 +1,6 @@ import unittest import logging import numpy as np -import pandas as pd import scipy.stats as stats from batchglm.api.models.glm_nb import Simulator @@ -18,10 +17,7 @@ def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100): """ sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate_params( - rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, - rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) - ) + sim.generate_params() sim.generate_data() return sim @@ -45,8 +41,8 @@ def _eval(self, test, ref_pvals): 'mean absolute log p-value deviation: %f' % float(mean_dev) ) - assert max_dev < 1e-3, "maximum deviation too large" - assert max_log_dev < 1e-1, "maximum deviation in log space too large" + assert max_dev < 1e-3, "maximum deviation too large: %f" % max_dev + assert max_log_dev < 1e-1, "maximum deviation in log space too large: %f" % max_log_dev def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): """ @@ -59,23 +55,25 @@ def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.INFO) + np.random.seed(1) sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) - test = de.test.t_test( - data=sim.X, + data=sim.input_data, grouping="condition", - sample_description=sim.sample_description, - dtype="float64" + sample_description=sim.sample_description ) # Run scipy t-tests as a reference. conds = np.unique(sim.sample_description["condition"].values) ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] - scipy_pvals = stats.ttest_ind(a=sim.X[ind_a, :], b=sim.X[ind_b, :], axis=0, equal_var=False).pvalue - + scipy_pvals = stats.ttest_ind( + a=sim.x[ind_a, :], + b=sim.x[ind_b, :], + axis=0, + equal_var=False + ).pvalue self._eval(test=test, ref_pvals=scipy_pvals) - return True def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): @@ -89,13 +87,12 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.INFO) + np.random.seed(1) sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) - test = de.test.rank_test( - data=sim.X, + data=sim.input_data, grouping="condition", - sample_description=sim.sample_description, - dtype="float64" + sample_description=sim.sample_description ) # Run scipy t-tests as a reference. @@ -103,13 +100,15 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] scipy_pvals = np.array([ - stats.mannwhitneyu(x=sim.X[ind_a, i], y=sim.X[ind_b, i], - use_continuity=True, alternative="two-sided").pvalue - for i in range(sim.X.shape[1]) - ]) - + stats.mannwhitneyu( + x=sim.x[ind_a, i], + y=sim.x[ind_b, i], + use_continuity=True, + alternative="two-sided" + ).pvalue + for i in range(sim.x.shape[1]) + ]) self._eval(test=test, ref_pvals=scipy_pvals) - return True diff --git a/diffxpy/unit_test/test_single_null.py b/diffxpy/unit_test/test_single_null.py index 34ca47d..3c9834c 100644 --- a/diffxpy/unit_test/test_single_null.py +++ b/diffxpy/unit_test/test_single_null.py @@ -27,31 +27,34 @@ def _test_null_distribution_wald( """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + sim.generate_params(rand_fn_scale=rand_fn_scale) + sim.generate_data() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations), - "batch": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) }) test = de.test.wald( - data=sim.X, + data=sim.input_data, + sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", - sample_description=random_sample_description, batch_size=500, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -89,19 +92,19 @@ def _test_null_distribution_wald_multi( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(4, size=sim.num_observations) + "condition": np.random.randint(4, size=sim.nobs) }) test = de.test.wald( - data=sim.X, + data=sim.input_data, + sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", - sample_description=random_sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -139,21 +142,21 @@ def _test_null_distribution_lrt( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.lrt( - data=sim.X, + data=sim.input_data, + sample_description=random_sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", reduced_formula_loc="~ 1", reduced_formula_scale="~ 1", - sample_description=random_sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -184,17 +187,16 @@ def _test_null_distribution_ttest( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.t_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, - is_logged=False, - dtype="float64" + grouping="condition", + is_logged=False ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -225,16 +227,15 @@ def _test_null_distribution_rank( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.rank_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, - dtype="float64" + grouping="condition" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -250,6 +251,7 @@ class TestSingleNullStandard(_TestSingleNull, unittest.TestCase): Noise model-independent unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. """ + def test_null_distribution_ttest( self, n_cells: int = 2000, @@ -265,6 +267,7 @@ def test_null_distribution_ttest( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_ttest( n_cells=n_cells, n_genes=n_genes @@ -285,13 +288,14 @@ def test_null_distribution_rank( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_rank( n_cells=n_cells, n_genes=n_genes ) -class TestSingleNullNB(_TestSingleNull, unittest.TestCase): +class TestSingleNullNb(_TestSingleNull, unittest.TestCase): """ Negative binomial noise model unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. @@ -312,6 +316,7 @@ def test_null_distribution_wald_nb( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald( n_cells=n_cells, n_genes=n_genes, @@ -334,6 +339,7 @@ def test_null_distribution_wald_multi_nb( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald_multi( n_cells=n_cells, n_genes=n_genes, @@ -355,6 +361,7 @@ def test_null_distribution_lrt_nb( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_lrt( n_cells=n_cells, n_genes=n_genes, @@ -362,7 +369,7 @@ def test_null_distribution_lrt_nb( ) -class TestSingleNullNORM(_TestSingleNull, unittest.TestCase): +class TestSingleNullNorm(_TestSingleNull, unittest.TestCase): """ Normal noise model unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. @@ -382,6 +389,7 @@ def test_null_distribution_wald_norm( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald( n_cells=n_cells, n_genes=n_genes, @@ -404,6 +412,7 @@ def test_null_distribution_wald_multi_norm( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald_multi( n_cells=n_cells, n_genes=n_genes, @@ -425,11 +434,13 @@ def test_null_distribution_lrt_norm( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_lrt( n_cells=n_cells, n_genes=n_genes, noise_model="norm" ) + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_vsrest.py b/diffxpy/unit_test/test_vsrest.py index 28a56cf..9a78505 100644 --- a/diffxpy/unit_test/test_vsrest.py +++ b/diffxpy/unit_test/test_vsrest.py @@ -29,11 +29,11 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="wald", noise_model="nb", @@ -72,11 +72,11 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100): sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="lrt", noise_model="nb", @@ -115,11 +115,11 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="rank", sample_description=random_sample_description, @@ -155,11 +155,11 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="t-test", sample_description=random_sample_description, diff --git a/requirements.txt b/requirements.txt index 7d2d66e..1b9b636 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy>=1.14.0 scipy pandas patsy>=0.5.0 -batchglm>=0.4.0 +batchglm>=0.6.1 xarray statsmodels anndata diff --git a/setup.py b/setup.py index 2f43452..e8acae9 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,11 @@ long_description_content_type="text/markdown", packages=find_packages(), install_requires=[ - 'numpy>=1.14.0', - 'scipy', + 'numpy==1.16.4', + 'scipy>=1.2.1', 'pandas', 'patsy>=0.5.0', - 'batchglm>=0.6.1', - 'xarray', + 'batchglm>=0.6.3', 'statsmodels', ], extras_require={