From 06cd1da82afb4c30dd7a18809e477e3e50baab03 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 14 Aug 2019 17:11:46 +0200 Subject: [PATCH 01/15] updated batchglm in requirements, necessary for documentation --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7d2d66e..1b9b636 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy>=1.14.0 scipy pandas patsy>=0.5.0 -batchglm>=0.4.0 +batchglm>=0.6.1 xarray statsmodels anndata From cc73de498305c6640197e10f5e4ebd1ad599874a Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 11:54:04 +0200 Subject: [PATCH 02/15] intermediate commit towards xarray depreceation --- diffxpy/api/utils.py | 2 +- diffxpy/models/batch_bfgs/optim.py | 8 +- diffxpy/stats/stats.py | 3 - diffxpy/testing/det.py | 381 +++++++----------- diffxpy/testing/tests.py | 146 +++---- diffxpy/testing/utils.py | 70 ++-- diffxpy/unit_test/test_constrained.py | 37 +- diffxpy/unit_test/test_continuous.py | 8 +- diffxpy/unit_test/test_correction.py | 3 +- diffxpy/unit_test/test_data_types.py | 45 +-- diffxpy/unit_test/test_extreme_values.py | 12 +- diffxpy/unit_test/test_numeric.py | 6 +- diffxpy/unit_test/test_pairwise.py | 22 +- diffxpy/unit_test/test_partition.py | 32 +- diffxpy/unit_test/test_single_de.py | 60 ++- .../unit_test/test_single_external_libs.py | 4 +- diffxpy/unit_test/test_single_null.py | 53 +-- diffxpy/unit_test/test_vsrest.py | 16 +- setup.py | 5 +- 19 files changed, 402 insertions(+), 511 deletions(-) diff --git a/diffxpy/api/utils.py b/diffxpy/api/utils.py index fb653a7..b2e4ded 100644 --- a/diffxpy/api/utils.py +++ b/diffxpy/api/utils.py @@ -1,4 +1,4 @@ from diffxpy.testing.utils import constraint_matrix_from_string, constraint_matrix_from_dict, \ constraint_system_from_star -from diffxpy.testing.utils import design_matrix, design_matrix_from_xarray, design_matrix_from_anndata +from diffxpy.testing.utils import design_matrix from diffxpy.testing.utils import view_coef_names, preview_coef_names diff --git a/diffxpy/models/batch_bfgs/optim.py b/diffxpy/models/batch_bfgs/optim.py index ef27ec5..98c234e 100644 --- a/diffxpy/models/batch_bfgs/optim.py +++ b/diffxpy/models/batch_bfgs/optim.py @@ -16,17 +16,17 @@ class Estim_BFGS_Model(): def __init__(self, Estim_BFGS, nproc): - self._num_observations = Estim_BFGS.X.shape[0] - self._num_features = Estim_BFGS.X.shape[1] + self._num_observations = Estim_BFGS.x.shape[0] + self._num_features = Estim_BFGS.x.shape[1] self._features = Estim_BFGS.feature_names - self._observations = Estim_BFGS.X.shape[0] + self._observations = Estim_BFGS.x.shape[0] self._design_loc = Estim_BFGS.design_loc self._design_scale = Estim_BFGS.design_scale self._loss = xr.DataArray(Estim_BFGS.full_loss(nproc)) self._log_probs = -self._loss self._probs = np.exp(self._log_probs) self._mles = xr.DataArray(np.transpose(Estim_BFGS.mles())) - self._gradient = xr.DataArray(np.zeros([Estim_BFGS.X.shape[1]])) + self._gradient = xr.DataArray(np.zeros([Estim_BFGS.x.shape[1]])) self._fisher_inv = xr.DataArray(Estim_BFGS.fisher_inv) self._idx_loc = np.arange(0, Estim_BFGS.design_loc.shape[1]) self._idx_scale = np.arange(Estim_BFGS.design_loc.shape[1], diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index 9f60d26..e57aa99 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -3,7 +3,6 @@ import numpy as np import numpy.linalg import scipy.stats -import xarray as xr def likelihood_ratio_test( @@ -264,8 +263,6 @@ def wald_test_chisq( theta_diff = theta_mle - theta0 # Convert to nd.array to avoid gufunc error. - if isinstance(theta_diff, xr.DataArray): - theta_diff = theta_diff.values wald_statistic = np.array([ np.matmul( np.matmul( diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index 412a15d..ba0f79c 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -5,18 +5,16 @@ from random import sample import numpy as np -import xarray as xr import patsy -from .utils import split_X, dmat_unique +from .utils import split_x, dmat_unique try: import anndata except ImportError: anndata = None -from batchglm.xarray_sparse.base import SparseXArrayDataArray -from batchglm.models.glm_nb import Model as GeneralizedLinearModel +from batchglm.models.base import _EstimatorBase from ..stats import stats from . import correction @@ -25,84 +23,6 @@ logger = logging.getLogger("diffxpy") -class _Estimation(GeneralizedLinearModel, metaclass=abc.ABCMeta): - """ - Dummy class specifying all needed methods / parameters necessary for a model - fitted for DifferentialExpressionTest. - Useful for type hinting. - """ - - @property - @abc.abstractmethod - def X(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def design_loc(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def design_scale(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def constraints_loc(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def constraints_scale(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def num_observations(self) -> int: - pass - - @property - @abc.abstractmethod - def num_features(self) -> int: - pass - - @property - @abc.abstractmethod - def features(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def observations(self) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def log_likelihood(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def loss(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def gradients(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def hessians(self, **kwargs) -> np.ndarray: - pass - - @property - @abc.abstractmethod - def fisher_inv(self, **kwargs) -> np.ndarray: - pass - - class _DifferentialExpressionTest(metaclass=abc.ABCMeta): """ Dummy class specifying all needed methods / parameters necessary for DifferentialExpressionTest. @@ -132,7 +52,7 @@ def gene_ids(self) -> np.ndarray: @property @abc.abstractmethod - def X(self): + def x(self): pass @abc.abstractmethod @@ -169,24 +89,22 @@ def _correction(self, method) -> np.ndarray: def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene. - :return: xr.DataArray + :return: np.ndarray """ pass @property def log_likelihood(self): if self._log_likelihood is None: - self._log_likelihood = self._ll().compute() + self._log_likelihood = self._ll() return self._log_likelihood @property def mean(self): if self._mean is None: self._mean = self._ave() - if isinstance(self._mean, xr.DataArray): # Could also be np.ndarray coming out of XArraySparseDataArray - self._mean = self._mean.compute() return self._mean @property @@ -340,10 +258,7 @@ def plot_volcano( logfc = np.reshape(self.log2_fold_change(), -1) # Clipping throws errors if not performed in actual data format (ndarray or DataArray): - if isinstance(logfc, xr.DataArray): - logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold) - else: - logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) + logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) fig, ax = plt.subplots() @@ -450,10 +365,7 @@ def plot_ma( logfc = np.reshape(self.log2_fold_change(), -1) # Clipping throws errors if not performed in actual data format (ndarray or DataArray): - if isinstance(logfc, xr.DataArray): - logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold) - else: - logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) + logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) fig, ax = plt.subplots() @@ -554,17 +466,17 @@ class DifferentialExpressionTestLRT(_DifferentialExpressionTestSingle): sample_description: pd.DataFrame full_design_loc_info: patsy.design_info - full_estim: _Estimation + full_estim: _EstimatorBase reduced_design_loc_info: patsy.design_info - reduced_estim: _Estimation + reduced_estim: _EstimatorBase def __init__( self, sample_description: pd.DataFrame, full_design_loc_info: patsy.design_info, - full_estim, + full_estim: _EstimatorBase, reduced_design_loc_info: patsy.design_info, - reduced_estim + reduced_estim: _EstimatorBase ): super().__init__() self.sample_description = sample_description @@ -575,19 +487,19 @@ def __init__( @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.full_estim.features) + return np.asarray(self.full_estim.input_data.features) @property - def X(self): - return self.full_estim.X + def x(self): + return self.full_estim.x @property def reduced_model_gradient(self): - return self.reduced_estim.gradients + return self.reduced_estim.jacobian @property def full_model_gradient(self): - return self.full_estim.gradients + return self.full_estim.jacobian def _test(self): if np.any(self.full_estim.log_likelihood < self.reduced_estim.log_likelihood): @@ -596,27 +508,29 @@ def _test(self): return stats.likelihood_ratio_test( ll_full=self.full_estim.log_likelihood, ll_reduced=self.reduced_estim.log_likelihood, - df_full=self.full_estim.constraints_loc.shape[1] + self.full_estim.constraints_scale.shape[1], - df_reduced=self.reduced_estim.constraints_loc.shape[1] + self.reduced_estim.constraints_scale.shape[1], + df_full=self.full_estim.input_data.constraints_loc.shape[1] + + self.full_estim.input_data.constraints_scale.shape[1], + df_reduced=self.reduced_estim.input_data.constraints_loc.shape[1] + + self.reduced_estim.input_data.constraints_scale.shape[1], ) def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return np.mean(self.full_estim.X, axis=0) + return np.mean(self.full_estim.x, axis=0) def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): """ - Returns a xr.DataArray containing the locations for the different categories of the factors + Returns the locations for the different categories of the factors :param factors: the factors to select. E.g. `condition` or `batch` if formula would be `~ 1 + batch + condition` :param base: the log base to use; default is the natural logarithm - :return: xr.DataArray + :return: np.ndarray """ if not (isinstance(factors, list) or isinstance(factors, tuple) or isinstance(factors, set)): @@ -626,7 +540,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.subset(factors).factor_infos]] - dmat = self.full_estim.design_loc + dmat = self.full_estim.input_data.design_loc # make rows unique dmat, sample_description = dmat_unique(dmat, sample_description) @@ -645,19 +559,11 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): # make the design matrix + sample description unique again dmat, sample_description = dmat_unique(dmat, sample_description) - locations = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) + locations = self.full_estim.model.inverse_link_loc(np.matmul(dmat, self.full_estim.model.a)) locations = np.log(locations) / np.log(base) dist = np.expand_dims(locations, axis=0) dist = np.transpose(dist, [1, 0, 2]) - dist - dist = xr.DataArray(dist, dims=("minuend", "subtrahend", "gene")) - # retval = xr.Dataset({"logFC": retval}) - - dist.coords["gene"] = self.gene_ids - - for col in sample_description: - dist.coords["minuend_" + col] = (("minuend",), sample_description[col]) - dist.coords["subtrahend_" + col] = (("subtrahend",), sample_description[col]) # # If this is a pairwise comparison, return only one fold change per gene # if dist.shape[:2] == (2, 2): @@ -695,7 +601,7 @@ def log_fold_change(self, base=np.e, return_type="vector"): return None else: dists = self._log_fold_change(factors=factors, base=base) - return dists[1, 0].values + return dists[1, 0] else: dists = self._log_fold_change(factors=factors, base=base) return dists @@ -709,12 +615,12 @@ def locations(self): di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] - dmat = self.full_estim.design_loc + dmat = self.full_estim.input_data.design_loc dmat, sample_description = dmat_unique(dmat, sample_description) - retval = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) - retval = pd.DataFrame(retval, columns=self.full_estim.features) + retval = self.full_estim.model.inverse_link_loc(np.matmul(dmat, self.full_estim.model.a)) + retval = pd.DataFrame(retval, columns=self.full_estim.input_data.features) for col in sample_description: retval[col] = sample_description[col] @@ -731,12 +637,12 @@ def scales(self): di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] - dmat = self.full_estim.design_scale + dmat = self.full_estim.input_data.design_scale dmat, sample_description = dmat_unique(dmat, sample_description) retval = self.full_estim.inverse_link_scale(dmat.doc(self.full_estim.par_link_scale)) - retval = pd.DataFrame(retval, columns=self.full_estim.features) + retval = pd.DataFrame(retval, columns=self.full_estim.input_data.features) for col in sample_description: retval[col] = sample_description[col] @@ -781,7 +687,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle): Single wald test per gene. """ - model_estim: _Estimation + model_estim: _EstimatorBase sample_description: pd.DataFrame coef_loc_totest: np.ndarray theta_mle: np.ndarray @@ -791,7 +697,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle): def __init__( self, - model_estim: _Estimation, + model_estim: _EstimatorBase, col_indices: np.ndarray, noise_model: str, sample_description: pd.DataFrame @@ -811,26 +717,30 @@ def __init__( try: if model_estim._error_codes is not None: self._error_codes = model_estim._error_codes + else: + self._error_codes = None except Exception as e: self._error_codes = None try: if model_estim._niter is not None: self._niter = model_estim._niter + else: + self._niter = None except Exception as e: self._niter = None @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.features) + return np.asarray(self.model_estim.input_data.features) @property - def X(self): - return self.model_estim.X + def x(self): + return self.model_estim.x @property def model_gradient(self): - return self.model_estim.gradients + return self.model_estim.jacobian def log_fold_change(self, base=np.e, **kwargs): """ @@ -854,25 +764,25 @@ def log_fold_change(self, base=np.e, **kwargs): def _ll(self): """ - Returns a xr.DataArray containing the log likelihood of each gene + Returns the log likelihood of each gene - :return: xr.DataArray + :return: np.ndarray """ return self.model_estim.log_likelihood def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return self.X.mean(axis=0) + return self.x.mean(axis=0) def _test(self): """ - Returns a xr.DataArray containing the p-value for differential expression for each gene + Returns the p-value for differential expression for each gene - :return: xr.DataArray + :return: np.ndarray """ # Check whether single- or multiple parameters are tested. # For a single parameter, the wald statistic distribution is approximated @@ -880,7 +790,7 @@ def _test(self): self.theta_mle = self.model_estim.a_var[self.coef_loc_totest] if len(self.coef_loc_totest) == 1: self.theta_mle = self.theta_mle[0] # Make xarray one dimensional for stats.wald_test. - self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]].values + self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]] self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf)) self.theta_sd = np.sqrt(self.theta_sd) @@ -958,10 +868,10 @@ def plot_vs_ttest( import seaborn as sns from .tests import t_test - grouping = np.asarray(self.model_estim.design_loc[:, self.coef_loc_totest]) + grouping = np.asarray(self.model_estim.input_data.design_loc[:, self.coef_loc_totest]) # Normalize by size factors that were used in regression. - sf = np.broadcast_to(np.expand_dims(self.model_estim.size_factors, axis=1), - shape=self.model_estim.X.shape) + sf = np.broadcast_to(np.expand_dims(self.model_estim.input_data.size_factors, axis=1), + shape=self.model_estim.x.shape) ttest = t_test( data=self.model_estim.X.multiply(1 / sf, copy=True), grouping=grouping, @@ -1019,11 +929,11 @@ def plot_comparison_ols_coef( import matplotlib.pyplot as plt from matplotlib import gridspec from matplotlib import rcParams - from batchglm.api.models.glm_norm import Estimator, InputData + from batchglm.api.models.glm_norm import Estimator, InputDataGLM # Run OLS model fit to have comparison coefficients. if self._store_ols is None: - input_data_ols = InputData.new( + input_data_ols = InputDataGLM( data=self.model_estim.input_data.data, design_loc=self.model_estim.input_data.design_loc, design_scale=self.model_estim.input_data.design_scale[:, [0]], @@ -1048,10 +958,10 @@ def plot_comparison_ols_coef( # Prepare parameter summary of both model fits. par_loc = self.model_estim.input_data.data.coords["design_loc_params"].values - a_var_ols = store_ols.a_var.values + a_var_ols = store_ols.a_var a_var_ols[1:, :] = (a_var_ols[1:, :] + a_var_ols[[0], :]) / a_var_ols[[0], :] - a_var_user = self.model_estim.a_var.values + a_var_user = self.model_estim.a_var # Translate coefficients from both fits to be multiplicative in identity space. if self.noise_model == "nb": a_var_user = np.exp(a_var_user) # self.model_estim.inverse_link_loc(a_var_user) @@ -1156,11 +1066,11 @@ def plot_comparison_ols_pred( import matplotlib.pyplot as plt from matplotlib import gridspec from matplotlib import rcParams - from batchglm.api.models.glm_norm import Estimator, InputData + from batchglm.api.models.glm_norm import Estimator, InputDataGLM # Run OLS model fit to have comparison coefficients. if self._store_ols is None: - input_data_ols = InputData.new( + input_data_ols = InputDataGLM( data=self.model_estim.input_data.data, design_loc=self.model_estim.input_data.design_loc, design_scale=self.model_estim.input_data.design_scale[:, [0]], @@ -1203,19 +1113,16 @@ def plot_comparison_ols_pred( pred_n_cells = sample( population=list(np.arange(0, self.model_estim.X.shape[0])), - k=np.min([20, self.model_estim.design_loc.shape[0]]) + k=np.min([20, self.model_estim.input_data.design_loc.shape[0]]) ) - if isinstance(self.model_estim.X, SparseXArrayDataArray): - x = np.asarray(self.model_estim.X.X[pred_n_cells, :].todense()).flatten() - else: - x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten() + x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten() - y_user = self.model_estim.inverse_link_loc( - np.matmul(self.model_estim.design_loc[pred_n_cells, :].values, self.model_estim.a_var.values).flatten() + y_user = self.model_estim.model.inverse_link_loc( + np.matmul(self.model_estim.input_data.design_loc[pred_n_cells, :], self.model_estim.a_var).flatten() ) y_ols = store_ols.inverse_link_loc( - np.matmul(store_ols.design_loc[pred_n_cells, :].values, store_ols.a_var.values).flatten() + np.matmul(store_ols.design_loc[pred_n_cells, :], store_ols.a_var).flatten() ) if log1p_transform: x = np.log(x+1) @@ -1309,15 +1216,15 @@ def _assemble_gene_fits( :param log1p_transform: Whether to log transform observations before estimating the distribution with boxplot. Model estimates are adjusted accordingly. :param incl_fits: Whether to include fits in plot. - :return summaries_genes: List with data frame for seabron in it. + :return summaries_genes: List with data frame for seaborn in it. """ summaries_genes = [] for i, g in enumerate(gene_names): - assert g in self.model_estim.features, "gene %g not found" % g - g_idx = self.model_estim.features.tolist().index(g) + assert g in self.model_estim.input_data.features, "gene %g not found" % g + g_idx = self.model_estim.input_data.features.index(g) # Raw data for boxplot: - y = self.model_estim.X[:, g_idx] + y = self.model_estim.x[:, g_idx] # Model fits: loc = self.model_estim.location[:, g_idx] scale = self.model_estim.scale[:, g_idx] @@ -1613,12 +1520,12 @@ def __init__( is_sig_zerovar: bool = True ): super().__init__() - self._X = data + self._x = data self.sample_description = sample_description self.grouping = grouping self._gene_names = np.asarray(gene_names) - x0, x1 = split_X(data, grouping) + x0, x1 = split_x(data, grouping) # Only compute p-values for genes with non-zero observations and non-zero group-wise variance. mean_x0 = x0.mean(axis=0).astype(dtype=np.float) @@ -1692,8 +1599,8 @@ def gene_ids(self) -> np.ndarray: return self._gene_names @property - def X(self): - return self._X + def x(self): + return self._x def log_fold_change(self, base=np.e, **kwargs): """ @@ -1747,12 +1654,12 @@ def __init__( is_sig_zerovar: bool = True ): super().__init__() - self._X = data + self._x = data self.sample_description = sample_description self.grouping = grouping self._gene_names = np.asarray(gene_names) - x0, x1 = split_X(data, grouping) + x0, x1 = split_x(data, grouping) mean_x0 = x0.mean(axis=0).astype(dtype=np.float) mean_x1 = x1.mean(axis=0).astype(dtype=np.float) @@ -1774,16 +1681,10 @@ def __init__( # TODO: can this be done on sparse? pval = np.zeros([data.shape[1]]) + np.nan - if isinstance(x0, xr.DataArray): - pval[idx_run] = stats.mann_whitney_u_test( - x0=x0.data[:, idx_run], - x1=x1.data[:, idx_run] - ) - else: - pval[idx_run] = stats.mann_whitney_u_test( - x0=x0.X[:, idx_run].toarray(), - x1=x1.X[:, idx_run].toarray() - ) + pval[idx_run] = stats.mann_whitney_u_test( + x0=x0.x[:, idx_run].toarray(), + x1=x1.x[:, idx_run].toarray() + ) pval[np.where(np.logical_and( np.logical_and(mean_x0 == mean_x1, self._mean > 0), np.logical_not(self._var_geq_zero) @@ -1806,8 +1707,8 @@ def gene_ids(self) -> np.ndarray: return self._gene_names @property - def X(self): - return self._X + def x(self): + return self._x def log_fold_change(self, base=np.e, **kwargs): """ @@ -1855,7 +1756,7 @@ def plot_vs_ttest(self, log10=False): grouping = self.grouping ttest = t_test( - data=self.X, + data=self.x, grouping=grouping, gene_names=self.gene_ids, ) @@ -1967,7 +1868,7 @@ def gene_ids(self) -> np.ndarray: return self._gene_ids @property - def X(self): + def x(self): return None @property @@ -2159,11 +2060,17 @@ class DifferentialExpressionTestZTest(_DifferentialExpressionTestMulti): Pairwise unit_test between more than 2 groups per gene. """ - model_estim: _Estimation + model_estim: _EstimatorBase theta_mle: np.ndarray theta_sd: np.ndarray - def __init__(self, model_estim: _Estimation, grouping, groups, correction_type: str): + def __init__( + self, + model_estim: _EstimatorBase, + grouping, + groups, + correction_type: str + ): super().__init__(correction_type=correction_type) self.model_estim = model_estim self.grouping = grouping @@ -2182,7 +2089,7 @@ def __init__(self, model_estim: _Estimation, grouping, groups, correction_type: def _test(self, **kwargs): groups = self.groups - num_features = self.model_estim.X.shape[1] + num_features = self.model_estim.x.shape[1] pvals = np.tile(np.NaN, [len(groups), len(groups), num_features]) pvals[np.eye(pvals.shape[0]).astype(bool)] = 1 @@ -2202,28 +2109,28 @@ def _test(self, **kwargs): @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.features) + return np.asarray(self.model_estim.input_data.features) @property - def X(self): - return self.model_estim.X + def x(self): + return self.model_estim.x @property def log_likelihood(self): - return np.sum(self.model_estim.log_probs(), axis=0) + return self.model_estim.log_likelihood @property def model_gradient(self): - return self.model_estim.gradients + return self.model_estim.jacobian def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return np.mean(self.model_estim.X, axis=0) + return np.mean(self.model_estim.x, axis=0) def log_fold_change(self, base=np.e, **kwargs): """ @@ -2231,7 +2138,7 @@ def log_fold_change(self, base=np.e, **kwargs): """ if self._logfc is None: groups = self.groups - num_features = self.model_estim.X.shape[1] + num_features = self.model_estim.x.shape[1] logfc = np.tile(np.NaN, [len(groups), len(groups), num_features]) logfc[np.eye(logfc.shape[0]).astype(bool)] = 0 @@ -2356,11 +2263,16 @@ class DifferentialExpressionTestZTestLazy(_DifferentialExpressionTestMulti): memory. """ - model_estim: _Estimation + model_estim: _EstimatorBase _theta_mle: np.ndarray _theta_sd: np.ndarray - def __init__(self, model_estim: _Estimation, grouping, groups, correction_type="global"): + def __init__( + self, + model_estim: _EstimatorBase, + grouping, groups, + correction_type="global" + ): super().__init__(correction_type=correction_type) self.model_estim = model_estim self.grouping = grouping @@ -2370,7 +2282,7 @@ def __init__(self, model_estim: _Estimation, grouping, groups, correction_type=" self.groups = groups.tolist() # values of parameter estimates: coefficients x genes array with one coefficient per group - self._theta_mle = model_estim.par_link_loc + self._theta_mle = model_estim.a_var # standard deviation of estimates: coefficients x genes array with one coefficient per group # theta_sd = sqrt(diagonal(fisher_inv)) self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T @@ -2410,7 +2322,7 @@ def _test(self, **kwargs): pass def _test_pairs(self, groups0, groups1): - num_features = self.model_estim.X.shape[1] + num_features = self.model_estim.x.shape[1] pvals = np.tile(np.NaN, [len(groups0), len(groups1), num_features]) @@ -2430,28 +2342,28 @@ def _test_pairs(self, groups0, groups1): @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.features) + return np.asarray(self.model_estim.input_data.features) @property - def X(self): - return self.model_estim.X + def x(self): + return self.model_estim.x @property def log_likelihood(self): - return np.sum(self.model_estim.log_probs(), axis=0) + return self.model_estim.log_likelihood @property def model_gradient(self): - return self.model_estim.gradients + return self.model_estim.jacobian def _ave(self): """ - Returns a xr.DataArray containing the mean expression by gene + Returns the mean expression by gene - :return: xr.DataArray + :return: np.ndarray """ - return np.mean(self.model_estim.X, axis=0) + return np.mean(self.model_estim.x, axis=0) @property def pval(self, **kwargs): @@ -2459,7 +2371,8 @@ def pval(self, **kwargs): This function is not available in lazy results evaluation as it would require all pairwise tests to be performed. """ - pass + raise ValueError("This function is not available in lazy results evaluation as it would " + "require all pairwise tests to be performed.") @property def qval(self, **kwargs): @@ -2467,7 +2380,8 @@ def qval(self, **kwargs): This function is not available in lazy results evaluation as it would require all pairwise tests to be performed. """ - pass + raise ValueError("This function is not available in lazy results evaluation as it would " + "require all pairwise tests to be performed.") def log_fold_change(self, base=np.e, **kwargs): """ @@ -2499,9 +2413,9 @@ def summary( pass def _check_groups(self, groups0, groups1): - if isinstance(groups0, list)==False: + if not isinstance(groups0, list): groups0 = [groups0] - if isinstance(groups1, list)==False: + if not isinstance(groups1, list): groups1 = [groups1] for g in groups0: if g not in self.groups: @@ -2587,7 +2501,7 @@ def log_fold_change_pairs(self, groups0=None, groups1=None, base=np.e): logfc = np.zeros(shape=(len(groups0), len(groups1), num_features)) for i, g0 in enumerate(groups0): for j, g1 in enumerate(groups1): - logfc[i, j, :] = self._theta_mle[g0, :].values - self._theta_mle[g1, :].values + logfc[i, j, :] = self._theta_mle[g0, :] - self._theta_mle[g1, :] if base == np.e: return logfc @@ -2718,7 +2632,16 @@ class DifferentialExpressionTestVsRest(_DifferentialExpressionTestMulti): Tests between between each group and the rest for more than 2 groups per gene. """ - def __init__(self, gene_ids, pval, logfc, ave, groups, tests, correction_type: str): + def __init__( + self, + gene_ids, + pval, + logfc, + ave, + groups, + tests, + correction_type: str + ): super().__init__(correction_type=correction_type) self._gene_ids = np.asarray(gene_ids) self._pval = pval @@ -2744,7 +2667,7 @@ def gene_ids(self) -> np.ndarray: return self._gene_ids @property - def X(self) -> Union[np.ndarray, None]: + def x(self) -> Union[np.ndarray, None]: return None def log_fold_change(self, base=np.e, **kwargs): @@ -2853,7 +2776,7 @@ def gene_ids(self) -> np.ndarray: return self._gene_ids @property - def X(self) -> np.ndarray: + def x(self) -> np.ndarray: return None def log_fold_change(self, base=np.e, **kwargs): @@ -2903,7 +2826,7 @@ def summary(self, qval_thres=None, fc_upper_thres=None, class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle): _de_test: _DifferentialExpressionTestSingle - _model_estim: _Estimation + _model_estim: _EstimatorBase _size_factors: np.ndarray _continuous_coords: np.ndarray _spline_coefs: list @@ -2911,7 +2834,7 @@ class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle): def __init__( self, de_test: _DifferentialExpressionTestSingle, - model_estim: _Estimation, + model_estim: _EstimatorBase, size_factors: np.ndarray, continuous_coords: np.ndarray, spline_coefs: list, @@ -2929,8 +2852,8 @@ def gene_ids(self) -> np.ndarray: return self._de_test.gene_ids @property - def X(self): - return self._de_test.X + def x(self): + return self._de_test.x @property def pval(self) -> np.ndarray: @@ -2948,8 +2871,14 @@ def mean(self) -> np.ndarray: def log_likelihood(self) -> np.ndarray: return self._de_test.log_likelihood - def summary(self, nonnumeric=False, qval_thres=None, fc_upper_thres=None, - fc_lower_thres=None, mean_thres=None) -> pd.DataFrame: + def summary( + self, + nonnumeric=False, + qval_thres=None, + fc_upper_thres=None, + fc_lower_thres=None, + mean_thres=None + ) -> pd.DataFrame: """ Summarize differential expression results into an output table. @@ -2987,7 +2916,7 @@ def log_fold_change(self, base=np.e, genes=None, nonnumeric=False): :return: Log-fold change of fitted expression value by gene. """ if genes is None: - genes = np.asarray(range(self.X.shape[1])) + genes = np.asarray(range(self.x.shape[1])) else: genes = self._idx_genes(genes) @@ -3016,7 +2945,7 @@ def _filter_genes_int(self, genes: list): :param genes: List of genes to filter. :return: Filtered list of genes """ - genes_found = np.array([x < self.X.shape[1] for x in genes]) + genes_found = np.array([x < self.x.shape[1] for x in genes]) if any(genes_found == False): logger.info("did not find some genes, omitting") genes = genes[genes_found] @@ -3062,13 +2991,13 @@ def _continuous_model(self, idx, non_numeric=False): """ idx = np.asarray(idx) if non_numeric: - mu = np.matmul(self._model_estim.design_loc.values, + mu = np.matmul(self._model_estim.input_data.design_loc, self._model_estim.par_link_loc[:, idx]) if self._size_factors is not None: mu = mu + self._size_factors else: idx_basis = self._spline_par_loc_idx(intercept=True) - mu = np.matmul(self._model_estim.design_loc[:, idx_basis].values, + mu = np.matmul(self._model_estim.input_data.design_loc[:, idx_basis], self._model_estim.par_link_loc[idx_basis, idx]) mu = np.exp(mu) @@ -3189,7 +3118,7 @@ def plot_genes( ax = plt.subplot(gs[i]) axs.append(ax) - y = self.X[:, g] + y = self.x[:, g] yhat = self._continuous_model(idx=g, non_numeric=non_numeric) if log: y = np.log(y + 1) diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index cba2037..c573d45 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -6,7 +6,6 @@ import pandas as pd import patsy import scipy.sparse -import xarray as xr try: from anndata.base import Raw @@ -14,7 +13,7 @@ from anndata import Raw from batchglm import data as data_utils -from batchglm.xarray_sparse import SparseXArrayDataSet +from batchglm.models.base import _EstimatorBase, _InputDataBase from diffxpy import pkg_constants from diffxpy.models.batch_bfgs.optim import Estim_BFGS from .det import DifferentialExpressionTestLRT, DifferentialExpressionTestWald, \ @@ -22,12 +21,9 @@ DifferentialExpressionTestZTestLazy, DifferentialExpressionTestZTest, DifferentialExpressionTestPairwise, \ DifferentialExpressionTestVsRest, _DifferentialExpressionTestMulti, DifferentialExpressionTestByPartition, \ DifferentialExpressionTestWaldCont, DifferentialExpressionTestLRTCont -from .utils import parse_gene_names, parse_data, parse_sample_description, parse_size_factors, parse_grouping, \ +from .utils import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \ constraint_system_from_star -# Use this to suppress matrix subclass PendingDepreceationWarnings from numpy: -np.warnings.filterwarnings("ignore") - def _fit( noise_model, @@ -46,7 +42,7 @@ def _fit( quick_scale: bool = None, close_session=True, dtype="float64" -): +) -> _EstimatorBase: """ :param noise_model: str, noise model to use in model-based unit_test. Possible options: @@ -143,15 +139,15 @@ def _fit( raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model) else: if noise_model == "nb" or noise_model == "negative_binomial": - from batchglm.api.models.glm_nb import Estimator, InputData + from batchglm.api.models.glm_nb import Estimator, InputDataGLM elif noise_model == "norm" or noise_model == "normal": - from batchglm.api.models.glm_norm import Estimator, InputData + from batchglm.api.models.glm_norm import Estimator, InputDataGLM else: raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model) logging.getLogger("diffxpy").info("Fitting model...") logging.getLogger("diffxpy").debug(" * Assembling input data...") - input_data = InputData.new( + input_data = InputDataGLM( data=data, design_loc=design_loc, design_scale=design_scale, @@ -193,16 +189,14 @@ def _fit( if close_session: logging.getLogger("diffxpy").debug(" * Finalize estimation...") - model = estim.finalize() - else: - model = estim + estim.finalize() logging.getLogger("diffxpy").debug(" * Model fitting done.") - return model + return estim def lrt( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], full_formula_loc: str, reduced_formula_loc: str, full_formula_scale: str = "~1", @@ -306,11 +300,10 @@ def lrt( as_numeric = [as_numeric] gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) sample_description = parse_sample_description(data, sample_description) size_factors = parse_size_factors( size_factors=size_factors, - data=X, + data=data, sample_description=sample_description ) @@ -341,7 +334,7 @@ def lrt( reduced_model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=reduced_design_loc, design_scale=reduced_design_scale, constraints_loc=None, @@ -358,7 +351,7 @@ def lrt( ) full_model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=full_design_loc, design_scale=full_design_scale, constraints_loc=None, @@ -387,7 +380,7 @@ def lrt( def wald( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], factor_loc_totest: Union[str, List[str]] = None, coef_to_test: Union[str, List[str]] = None, formula_loc: Union[None, str] = None, @@ -397,8 +390,8 @@ def wald( init_b: Union[np.ndarray, str] = "AUTO", gene_names: Union[np.ndarray, list] = None, sample_description: Union[None, pd.DataFrame] = None, - dmat_loc: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None, - dmat_scale: Union[patsy.design_info.DesignMatrix, xr.Dataset] = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, noise_model: str = "nb", @@ -566,12 +559,11 @@ def wald( # # Parse input data formats: gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) if dmat_loc is None and dmat_scale is None: sample_description = parse_sample_description(data, sample_description) size_factors = parse_size_factors( size_factors=size_factors, - data=X, + data=data, sample_description=sample_description ) @@ -582,7 +574,6 @@ def wald( formula=formula_loc, as_numeric=as_numeric, constraints=constraints_loc, - dims=["design_loc_params", "loc_params"], return_type="patsy" ) logging.getLogger("diffxpy").debug("building scale model") @@ -592,7 +583,6 @@ def wald( formula=formula_scale, as_numeric=as_numeric, constraints=constraints_scale, - dims=["design_scale_params", "scale_params"], return_type="patsy" ) @@ -637,7 +627,7 @@ def wald( model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=design_loc, design_scale=design_scale, constraints_loc=constraints_loc, @@ -664,19 +654,18 @@ def wald( def t_test( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping, gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None, is_logged: bool = False, - is_sig_zerovar: bool = True, - dtype="float64" + is_sig_zerovar: bool = True ): """ Perform Welch's t-test for differential expression between two groups on adata object for each gene. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -692,13 +681,10 @@ def t_test( the p-value is set to np.nan. """ gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) - if isinstance(X, SparseXArrayDataSet): - X = X.X grouping = parse_grouping(data, sample_description, grouping) de_test = DifferentialExpressionTestTT( - data=X.astype(dtype), + data=data, sample_description=sample_description, grouping=grouping, gene_names=gene_names, @@ -710,19 +696,18 @@ def t_test( def rank_test( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None, is_logged: bool = False, - is_sig_zerovar: bool = True, - dtype="float64" + is_sig_zerovar: bool = True ): """ Perform Mann-Whitney rank test (Wilcoxon rank-sum test) for differential expression between two groups on adata object for each gene. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -738,13 +723,10 @@ def rank_test( the p-value is set to np.nan. """ gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) - if isinstance(X, SparseXArrayDataSet): - X = X.X grouping = parse_grouping(data, sample_description, grouping) de_test = DifferentialExpressionTestRank( - data=X.astype(dtype), + data=data, sample_description=sample_description, grouping=grouping, gene_names=gene_names, @@ -756,7 +738,7 @@ def rank_test( def two_sample( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = "t-test", @@ -802,7 +784,7 @@ def two_sample( Doesn't require fitting of generalized linear models. Wilcoxon rank sum (Mann-Whitney U) test between both observation groups. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -851,7 +833,6 @@ def two_sample( 'The t-test is based on a gaussian noise model and wilcoxon is model free.') gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) grouping = parse_grouping(data, sample_description, grouping) sample_description = pd.DataFrame({"grouping": grouping}) @@ -867,7 +848,7 @@ def two_sample( formula_loc = '~ 1 + grouping' formula_scale = '~ 1 + grouping' de_test = wald( - data=X, + data=data, factor_loc_totest="grouping", as_numeric=as_numeric, coef_to_test=None, @@ -891,7 +872,7 @@ def two_sample( reduced_formula_loc = '~ 1' reduced_formula_scale = '~ 1' de_test = lrt( - data=X, + data=data, full_formula_loc=full_formula_loc, reduced_formula_loc=reduced_formula_loc, full_formula_scale=full_formula_scale, @@ -909,7 +890,7 @@ def two_sample( ) elif test.lower() == 't-test' or test.lower() == "t_test" or test.lower() == "ttest": de_test = t_test( - data=X, + data=data, gene_names=gene_names, grouping=grouping, is_sig_zerovar=is_sig_zerovar, @@ -917,7 +898,7 @@ def two_sample( ) elif test.lower() == 'rank': de_test = rank_test( - data=X, + data=data, gene_names=gene_names, grouping=grouping, is_sig_zerovar=is_sig_zerovar, @@ -930,7 +911,7 @@ def two_sample( def pairwise( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = 'z-test', @@ -982,7 +963,7 @@ def pairwise( Doesn't require fitting of generalized linear models. Wilcoxon rank sum (Mann-Whitney U) test between both observation groups. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like, or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -1048,7 +1029,6 @@ def pairwise( # Do not store all models but only p-value and q-value matrix: # genes x groups x groups gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) sample_description = parse_sample_description(data, sample_description) grouping = parse_grouping(data, sample_description, grouping) sample_description = pd.DataFrame({"grouping": grouping}) @@ -1061,7 +1041,7 @@ def pairwise( ) model = _fit( noise_model=noise_model, - data=X, + data=data, design_loc=dmat, design_scale=dmat, gene_names=gene_names, @@ -1089,9 +1069,9 @@ def pairwise( ) else: groups = np.unique(grouping) - pvals = np.tile(np.NaN, [len(groups), len(groups), X.shape[1]]) + pvals = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]]) pvals[np.eye(pvals.shape[0]).astype(bool)] = 0 - logfc = np.tile(np.NaN, [len(groups), len(groups), X.shape[1]]) + logfc = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]]) logfc[np.eye(logfc.shape[0]).astype(bool)] = 0 if keep_full_test_objs: @@ -1105,7 +1085,7 @@ def pairwise( sel = (grouping == g1) | (grouping == g2) de_test_temp = two_sample( - data=X[sel], + data=data[sel], grouping=grouping[sel], as_numeric=as_numeric, test=test, @@ -1132,7 +1112,7 @@ def pairwise( gene_ids=gene_names, pval=pvals, logfc=logfc, - ave=np.mean(X, axis=0), + ave=np.mean(data, axis=0), groups=groups, tests=tests, correction_type=pval_correction @@ -1142,7 +1122,7 @@ def pairwise( def versus_rest( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = 'wald', @@ -1194,7 +1174,7 @@ def versus_rest( Doesn't require fitting of generalized linear models. Wilcoxon rank sum (Mann-Whitney U) test between both observation groups. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param grouping: str, array @@ -1254,14 +1234,13 @@ def versus_rest( # Do not store all models but only p-value and q-value matrix: # genes x groups gene_names = parse_gene_names(data, gene_names) - X = parse_data(data, gene_names) sample_description = parse_sample_description(data, sample_description) grouping = parse_grouping(data, sample_description, grouping) sample_description = pd.DataFrame({"grouping": grouping}) groups = np.unique(grouping) - pvals = np.zeros([1, len(groups), X.shape[1]]) - logfc = np.zeros([1, len(groups), X.shape[1]]) + pvals = np.zeros([1, len(groups), data.shape[1]]) + logfc = np.zeros([1, len(groups), data.shape[1]]) if keep_full_test_objs: tests = np.tile([None], [1, len(groups)]) @@ -1271,7 +1250,7 @@ def versus_rest( for i, g1 in enumerate(groups): test_grouping = np.where(grouping == g1, "group", "rest") de_test_temp = two_sample( - data=X, + data=data, grouping=test_grouping, as_numeric=as_numeric, test=test, @@ -1295,7 +1274,7 @@ def versus_rest( gene_ids=gene_names, pval=pvals, logfc=logfc, - ave=np.mean(X, axis=0), + ave=np.mean(data, axis=0), groups=groups, tests=tests, correction_type=pval_correction @@ -1305,7 +1284,7 @@ def versus_rest( def partition( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], parts: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None @@ -1320,7 +1299,7 @@ def partition( Wraps _Partition so that doc strings are nice. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param parts: str, array @@ -1348,13 +1327,13 @@ class _Partition: def __init__( self, - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, np.ndarray], parts: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None ): """ - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param parts: str, array @@ -1363,7 +1342,7 @@ def __init__( :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these :param sample_description: optional pandas.DataFrame containing sample annotations """ - self.X = parse_data(data, gene_names) + self.x = data self.gene_names = parse_gene_names(data, gene_names) self.sample_description = parse_sample_description(data, sample_description) self.partition = parse_grouping(data, sample_description, parts) @@ -1423,7 +1402,7 @@ def two_sample( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(two_sample( - data=self.X[idx, :], + data=self.x[idx, :], grouping=grouping, as_numeric=as_numeric, test=test, @@ -1468,7 +1447,7 @@ def t_test( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(t_test( - data=self.X[idx, :], + data=self.x[idx, :], grouping=grouping, is_logged=is_logged, gene_names=self.gene_names, @@ -1479,7 +1458,7 @@ def t_test( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def rank_test( @@ -1505,7 +1484,7 @@ def rank_test( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(rank_test( - data=self.X[idx, :], + data=self.x[idx, :], grouping=grouping, gene_names=self.gene_names, sample_description=self.sample_description.iloc[idx, :], @@ -1515,7 +1494,7 @@ def rank_test( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def lrt( @@ -1594,7 +1573,7 @@ def lrt( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(lrt( - data=self.X[idx, :], + data=self.x[idx, :], reduced_formula_loc=reduced_formula_loc, full_formula_loc=full_formula_loc, reduced_formula_scale=reduced_formula_scale, @@ -1613,7 +1592,7 @@ def lrt( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def wald( @@ -1694,7 +1673,7 @@ def wald( DETestsSingle = [] for i, idx in enumerate(self.partition_idx): DETestsSingle.append(wald( - data=self.X[idx, :], + data=self.x[idx, :], factor_loc_totest=factor_loc_totest, coef_to_test=coef_to_test, formula_loc=formula_loc, @@ -1713,12 +1692,12 @@ def wald( return DifferentialExpressionTestByPartition( partitions=self.partitions, tests=DETestsSingle, - ave=np.mean(self.X, axis=0), + ave=np.mean(self.x, axis=0), correction_type="by_test") def continuous_1d( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix], continuous: str, df: int = 5, factor_loc_totest: Union[str, List[str]] = None, @@ -1759,7 +1738,7 @@ def continuous_1d( design matrices are built within this function and the shape of constraint matrices depends on the output of this function. - :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. + :param data: Array-like or anndata.Anndata object containing observations. Input data matrix (observations x features) or (cells x genes). :param continuous: str @@ -1906,7 +1885,6 @@ def continuous_1d( if isinstance(as_numeric, tuple): as_numeric = list(as_numeric) - X = parse_data(data, gene_names) gene_names = parse_gene_names(data, gene_names) sample_description = parse_sample_description(data, sample_description) @@ -1967,7 +1945,7 @@ def continuous_1d( logging.getLogger("diffxpy").debug("formula_scale_new: " + formula_scale_new) de_test = wald( - data=X, + data=data, factor_loc_totest=factor_loc_totest_new, coef_to_test=None, formula_loc=formula_loc_new, @@ -2020,7 +1998,7 @@ def continuous_1d( logging.getLogger("diffxpy").debug("reduced_formula_scale: " + reduced_formula_scale) de_test = lrt( - data=X, + data=data, full_formula_loc=full_formula_loc, reduced_formula_loc=reduced_formula_loc, full_formula_scale=full_formula_scale, diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py index 3c348f1..7b7d8d3 100644 --- a/diffxpy/testing/utils.py +++ b/diffxpy/testing/utils.py @@ -2,9 +2,8 @@ import numpy as np import pandas as pd import patsy -import scipy +import scipy.sparse from typing import List, Tuple, Union -import xarray as xr try: from anndata.base import Raw @@ -12,37 +11,30 @@ from anndata import Raw from batchglm import data as data_utils +from batchglm.models.base import _InputDataBase # Relay util functions for diffxpy api. # design_matrix, preview_coef_names and constraint_system_from_star are redefined here. from batchglm.data import constraint_matrix_from_string, constraint_matrix_from_dict -from batchglm.data import design_matrix_from_xarray, design_matrix_from_anndata from batchglm.data import view_coef_names -def parse_gene_names(data, gene_names): +def parse_gene_names( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + gene_names: Union[list, np.ndarray, None] +): if gene_names is None: if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, Raw)): gene_names = data.var_names - elif isinstance(data, xr.DataArray): - gene_names = data["features"] - elif isinstance(data, xr.Dataset): - gene_names = data["features"] + elif isinstance(data, _InputDataBase): + gene_names = data.features else: raise ValueError("Missing gene names") return np.asarray(gene_names) -def parse_data(data, gene_names) -> xr.DataArray: - X = data_utils.xarray_from_data(data, dims=("observations", "features")) - if gene_names is not None: - X.coords["features"] = gene_names - - return X - - def parse_sample_description( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], sample_description: Union[pd.DataFrame, None] ) -> pd.DataFrame: """ @@ -54,33 +46,32 @@ def parse_sample_description( """ if sample_description is None: if anndata is not None and isinstance(data, anndata.AnnData): - sample_description = data_utils.sample_description_from_anndata( - dataset=data, - ) - elif isinstance(data, xr.Dataset): - sample_description = data_utils.sample_description_from_xarray( - dataset=data, - dim="observations", - ) + sample_description = data.obs else: raise ValueError( - "Please specify `sample_description` or provide `data` as xarray.Dataset or anndata.AnnData " + + "Please specify `sample_description` or provide `data` as anndata.AnnData " + "with corresponding sample annotations" ) if anndata is not None and isinstance(data, Raw): # Raw does not have attribute shape. assert data.X.shape[0] == sample_description.shape[0], \ - "data matrix and sample description must contain same number of cells" + "data matrix and sample description must contain same number of cells: %i, %i" % \ + (data.X.shape[0], sample_description.shape[0]) + elif isinstance(data, _InputDataBase): + assert data.x.shape[0] == sample_description.shape[0], \ + "data matrix and sample description must contain same number of cells: %i, %i" % \ + (data.x.shape[0], sample_description.shape[0]) else: assert data.shape[0] == sample_description.shape[0], \ - "data matrix and sample description must contain same number of cells" + "data matrix and sample description must contain same number of cells: %i, %i" % \ + (data.shape[0], sample_description.shape[0]) return sample_description def parse_size_factors( size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray], - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], sample_description: pd.DataFrame ) -> Union[np.ndarray, None]: """ @@ -111,7 +102,7 @@ def parse_grouping(data, sample_description, grouping): return np.squeeze(np.asarray(grouping)) -def split_X(data, grouping): +def split_x(data, grouping): groups = np.unique(grouping) x0 = data[np.where(grouping == groups[0])[0]] x1 = data[np.where(grouping == groups[1])[0]] @@ -126,14 +117,14 @@ def dmat_unique(dmat, sample_description): def design_matrix( - data: Union[anndata.AnnData, Raw, xr.DataArray, xr.Dataset, np.ndarray, + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix] = None, sample_description: Union[None, pd.DataFrame] = None, formula: Union[None, str] = None, as_numeric: Union[List[str], Tuple[str], str] = (), dmat: Union[pd.DataFrame, None] = None, return_type: str = "xarray" -) -> Union[patsy.design_info.DesignMatrix, xr.Dataset, pd.DataFrame]: +) -> Union[patsy.design_info.DesignMatrix, pd.DataFrame]: """ Create a design matrix from some sample description. This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame. @@ -155,8 +146,6 @@ def design_matrix( - "patsy": return plain patsy.design_info.DesignMatrix object - "dataframe": return pd.DataFrame with observations as rows and params as columns - - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as - one variable per column :param dmat: model design matrix """ if data is None and sample_description is None and dmat is None: @@ -217,13 +206,12 @@ def preview_coef_names( def constraint_system_from_star( - dmat: Union[None, np.ndarray, xr.DataArray, xr.Dataset] = None, + dmat: Union[None, np.ndarray] = None, sample_description: Union[None, pd.DataFrame] = None, formula: Union[None, str] = None, as_numeric: Union[List[str], Tuple[str], str] = (), constraints: dict = {}, - dims: Union[Tuple[str, str], List[str]] = (), - return_type: str = "xarray", + return_type: str = "patsy", ) -> Tuple: """ Create a design matrix and a constraint matrix. @@ -251,17 +239,12 @@ def constraint_system_from_star( Can only group by non-constrained effects right now, use constraint_matrix_from_string for other cases. - :param dims: Dimension names of xarray. - - E.g.: ["design_loc_params", "loc_params"] or ["design_scale_params", "scale_params"] :param return_type: type of the returned value. - "patsy": return plain patsy.design_info.DesignMatrix object - "dataframe": return pd.DataFrame with observations as rows and params as columns - - "xarray": return xr.Dataset with design matrix as ds["design"] and the sample description embedded as - one variable per column This option is overridden if constraints are supplied as dict. - :return: a model design matrix and a constraint matrix formatted as xr.DataArray + :return: a model design matrix and a constraint matrix """ if isinstance(as_numeric, str): as_numeric = [as_numeric] @@ -279,6 +262,5 @@ def constraint_system_from_star( formula=formula, as_categorical=as_categorical, constraints=constraints, - dims=dims, return_type=return_type ) diff --git a/diffxpy/unit_test/test_constrained.py b/diffxpy/unit_test/test_constrained.py index 9ef21a3..78b4851 100644 --- a/diffxpy/unit_test/test_constrained.py +++ b/diffxpy/unit_test/test_constrained.py @@ -45,26 +45,22 @@ def test_forfatal_from_string(self): # Build constraints: constraints_loc = de.utils.constraint_matrix_from_string( dmat=dmat_est_loc, - constraints=["bio1+bio2=0", "bio3+bio4=0"], - dims=["design_loc_params", "loc_params"] + constraints=["bio1+bio2=0", "bio3+bio4=0"] ) constraints_scale = de.utils.constraint_matrix_from_string( dmat=dmat_est_scale, - constraints=["bio1+bio2=0", "bio3+bio4=0"], - dims=["design_scale_params", "scale_params"] + constraints=["bio1+bio2=0", "bio3+bio4=0"] ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["treatment1"] ) - summary = test.summary() - - return True + _ = test.summary() def test_forfatal_from_dict(self): """ @@ -102,16 +98,14 @@ def test_forfatal_from_dict(self): ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_loc, dmat_scale=dmat_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["cond[T.cond1]"] ) - summary = test.summary() - - return True + _ = test.summary() def test_null_distribution_wald_constrained(self, n_genes: int = 100): """ @@ -155,14 +149,14 @@ def test_null_distribution_wald_constrained(self, n_genes: int = 100): ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_loc, dmat_scale=dmat_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["cond[T.cond1]"] ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -230,13 +224,12 @@ def test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100): "bio5+bio6=0", "bio7+bio8=0", "tech1+tech2=0", - "tech3+tech4=0"], - dims=["design_loc_params", "loc_params"] + "tech3+tech4=0"] ) constraints_scale = None test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, @@ -298,26 +291,24 @@ def test_null_distribution_wald_multi_constrained_2layer(self, n_genes: int = 50 dmat=dmat_est_loc, constraints=["bio1+bio2=0", "bio3+bio4=0", - "bio5+bio6=0"], - dims=["design_loc_params", "loc_params"] + "bio5+bio6=0"] ) constraints_scale = de.utils.constraint_matrix_from_string( dmat=dmat_est_scale, constraints=["bio1+bio2=0", "bio3+bio4=0", - "bio5+bio6=0"], - dims=["design_scale_params", "scale_params"] + "bio5+bio6=0"] ) test = de.test.wald( - data=sim.X, + data=sim.x, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, constraints_scale=constraints_scale, coef_to_test=["treatment1", "treatment2"] ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue diff --git a/diffxpy/unit_test/test_continuous.py b/diffxpy/unit_test/test_continuous.py index 642ece4..57ad345 100644 --- a/diffxpy/unit_test/test_continuous.py +++ b/diffxpy/unit_test/test_continuous.py @@ -30,8 +30,8 @@ def test_forfatal_functions(self): sim.generate() random_sample_description = pd.DataFrame({ - "pseudotime": np.random.random(size=sim.num_observations), - "batch": np.random.randint(2, size=sim.num_observations) + "pseudotime": np.random.random(size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) }) test = de.test.continuous_1d( @@ -89,7 +89,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100): sim.generate() random_sample_description = pd.DataFrame({ - "pseudotime": np.random.random(size=sim.num_observations) + "pseudotime": np.random.random(size=sim.nobs) }) test = de.test.continuous_1d( @@ -135,7 +135,7 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100): sim.generate() random_sample_description = pd.DataFrame({ - "pseudotime": np.random.random(size=sim.num_observations) + "pseudotime": np.random.random(size=sim.nobs) }) test = de.test.continuous_1d( diff --git a/diffxpy/unit_test/test_correction.py b/diffxpy/unit_test/test_correction.py index 6941688..cbd602e 100644 --- a/diffxpy/unit_test/test_correction.py +++ b/diffxpy/unit_test/test_correction.py @@ -4,7 +4,7 @@ import scipy.stats as stats import logging -from batchglm.api.models.glm_nb import Simulator, Estimator, InputData +from batchglm.api.models.glm_nb import Simulator, Estimator, InputDataGLM import diffxpy.api as de @@ -12,5 +12,6 @@ class TestCorrection(unittest.TestCase): pass + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index 8381c3d..4859e6a 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -23,7 +23,7 @@ def _test_wald(self, data, sample_description, gene_names=None): training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() def _test_lrt(self, data, sample_description, gene_names=None): test = de.test.lrt( @@ -36,7 +36,7 @@ def _test_lrt(self, data, sample_description, gene_names=None): training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() def _test_t_test(self, data, sample_description, gene_names=None): test = de.test.t_test( @@ -45,7 +45,7 @@ def _test_t_test(self, data, sample_description, gene_names=None): sample_description=sample_description, gene_names=gene_names ) - summary = test.summary() + _ = test.summary() def _test_rank(self, data, sample_description, gene_names=None): test = de.test.rank_test( @@ -54,7 +54,7 @@ def _test_rank(self, data, sample_description, gene_names=None): sample_description=sample_description, gene_names=gene_names ) - summary = test.summary() + _ = test.summary() def simulate(self, n_cells: int = 20, n_genes: int = 2): sim = Simulator(num_observations=n_cells, num_features=n_genes) @@ -62,48 +62,37 @@ def simulate(self, n_cells: int = 20, n_genes: int = 2): sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.input_data.num_observations) }) - return sim.X ,random_sample_description + return sim.x, random_sample_description def _test_numpy(self, sparse): data, sample_description = self.simulate() - gene_names = data.features - data = data.values + gene_names = ["gene" + str(i) for i in range(data.shape[1])] if sparse: data = scipy.sparse.csr_matrix(data) self._test_wald(data=data, sample_description=sample_description, gene_names=gene_names) - #self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names) + self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names) self._test_t_test(data=data, sample_description=sample_description, gene_names=gene_names) self._test_rank(data=data, sample_description=sample_description, gene_names=gene_names) - def _test_xarray(self): - data, sample_description = self.simulate() - - self._test_wald(data=data, sample_description=sample_description) - #self._test_lrt(data=data, sample_description=sample_description) - self._test_t_test(data=data, sample_description=sample_description) - self._test_rank(data=data, sample_description=sample_description) - def _test_anndata(self, sparse): data, sample_description = self.simulate() - gene_names = [str(x) for x in data.features.values] - data = data.values + gene_names = ["gene" + str(i) for i in range(data.shape[1])] if sparse: data = scipy.sparse.csr_matrix(data) data = anndata.AnnData(data) data.var_names = gene_names self._test_wald(data=data, sample_description=sample_description) - #self._test_lrt(data=data, sample_description=sample_description) + self._test_lrt(data=data, sample_description=sample_description) self._test_t_test(data=data, sample_description=sample_description) self._test_rank(data=data, sample_description=sample_description) def _test_anndata_raw(self, sparse): data, sample_description = self.simulate() - gene_names = [str(x) for x in data.features.values] - data = data.values + gene_names = ["gene" + str(i) for i in range(data.shape[1])] if sparse: data = scipy.sparse.csr_matrix(data) @@ -111,7 +100,7 @@ def _test_anndata_raw(self, sparse): data.var_names = gene_names data.raw = data self._test_wald(data=data.raw, sample_description=sample_description) - #self._test_lrt(data=data.raw, sample_description=sample_description) + self._test_lrt(data=data.raw, sample_description=sample_description) self._test_t_test(data=data, sample_description=sample_description) self._test_rank(data=data, sample_description=sample_description) @@ -125,15 +114,6 @@ def test_numpy(self): return True - def test_xarray(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) - - self._test_xarray() - - return True - def test_anndata(self): logging.getLogger("tensorflow").setLevel(logging.ERROR) logging.getLogger("batchglm").setLevel(logging.WARNING) @@ -146,5 +126,6 @@ def test_anndata(self): return True + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py index cb19922..bf5cd9b 100644 --- a/diffxpy/unit_test/test_extreme_values.py +++ b/diffxpy/unit_test/test_extreme_values.py @@ -22,11 +22,11 @@ def test_t_test_zero_variance(self): sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() - sim.data.X[:, 0] = 0 - sim.data.X[:, 1] = 5 + sim.input_data.x[:, 0] = 0 + sim.input_data.x[:, 1] = 5 random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.t_test( @@ -52,11 +52,11 @@ def test_rank_test_zero_variance(self): sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() - sim.data.X[:, 0] = 0 - sim.data.X[:, 1] = 5 + sim.input_data.x[:, 0] = 0 + sim.input_data.x[:, 1] = 5 random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.rank_test( diff --git a/diffxpy/unit_test/test_numeric.py b/diffxpy/unit_test/test_numeric.py index 05e53b6..7a05c60 100644 --- a/diffxpy/unit_test/test_numeric.py +++ b/diffxpy/unit_test/test_numeric.py @@ -25,11 +25,11 @@ def test(self): sim.generate_data() sample_description = sim.sample_description - sample_description["numeric1"] = np.random.random(size=sim.num_observations) - sample_description["numeric2"] = np.random.random(size=sim.num_observations) + sample_description["numeric1"] = np.random.random(size=sim.nobs) + sample_description["numeric2"] = np.random.random(size=sim.nobs) test = de.test.wald( - data=sim.X, + data=sim.x, sample_description=sample_description, formula_loc="~ 1 + condition + numeric1 + numeric2", formula_scale="~ 1", diff --git a/diffxpy/unit_test/test_pairwise.py b/diffxpy/unit_test/test_pairwise.py index 5b52746..6cc6c59 100644 --- a/diffxpy/unit_test/test_pairwise.py +++ b/diffxpy/unit_test/test_pairwise.py @@ -29,11 +29,11 @@ def test_null_distribution_ztest(self, n_cells: int = 2000, n_genes: int = 100, sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="z-test", noise_model="nb", @@ -69,11 +69,11 @@ def test_null_distribution_z_lazy(self, n_cells: int = 2000, n_genes: int = 100) sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(4, size=sim.num_observations) + "condition": np.random.randint(4, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test='z-test', lazy=True, @@ -112,11 +112,11 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_ sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="lrt", noise_model="nb", @@ -151,11 +151,11 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 10000 sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="t-test", sample_description=random_sample_description, @@ -189,11 +189,11 @@ def test_null_distribution_wilcoxon(self, n_cells: int = 2000, n_genes: int = 10 sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="wilcoxon", sample_description=random_sample_description, @@ -239,7 +239,7 @@ def test_ztest_de(self, n_cells: int = 2000, n_genes: int = 500): sample_description = sim.sample_description test = de.test.pairwise( - data=sim.X, + data=sim.x, grouping="condition", test="z-test", noise_model="nb", diff --git a/diffxpy/unit_test/test_partition.py b/diffxpy/unit_test/test_partition.py index 98d1009..7f83374 100644 --- a/diffxpy/unit_test/test_partition.py +++ b/diffxpy/unit_test/test_partition.py @@ -29,13 +29,13 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations), - "covar2": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs), + "covar2": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -45,7 +45,7 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200): training_strategy="DEFAULT", dtype="float64" ) - summary = det.summary() + _ = det.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue @@ -74,13 +74,13 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int = sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(4, size=sim.num_observations), - "covar2": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(4, size=sim.nobs), + "covar2": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -90,7 +90,7 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int = training_strategy="DEFAULT", dtype="float64" ) - summary = det.summary() + _ = det.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue @@ -119,13 +119,13 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations), - "covar2": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs), + "covar2": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -137,7 +137,7 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200): training_strategy="DEFAULT", dtype="float64" ) - summary = det.summary() + _ = det.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(det.pval.flatten(), 'uniform').pvalue @@ -166,12 +166,12 @@ def test_null_distribution_ttest(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) @@ -209,12 +209,12 @@ def test_null_distribution_rank(self, n_cells: int = 4000, n_genes: int = 200): sim.generate() sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.num_observations) + "covar1": np.random.randint(2, size=sim.nobs) }) sample_description["cond"] = sim.sample_description["condition"].values partition = de.test.partition( - data=sim.X, + data=sim.x, parts="cond", sample_description=sample_description ) diff --git a/diffxpy/unit_test/test_single_de.py b/diffxpy/unit_test/test_single_de.py index 693e1fc..6beb78a 100644 --- a/diffxpy/unit_test/test_single_de.py +++ b/diffxpy/unit_test/test_single_de.py @@ -1,8 +1,6 @@ import unittest import logging import numpy as np -import pandas as pd -import scipy.stats as stats import diffxpy.api as de @@ -35,16 +33,16 @@ def _prepare_data( rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) ) - sim.params["a_var"][1, :num_non_de] = 0 - sim.params["b_var"][1, :num_non_de] = 0 - sim.params["isDE"] = ("features",), np.arange(n_genes) >= num_non_de + sim.a_var[1, :num_non_de] = 0 + sim.b_var[1, :num_non_de] = 0 + self.isDE = np.arange(n_genes) >= num_non_de sim.generate_data() return sim def _eval(self, sim, test): - idx_de = np.where(sim.params["isDE"] == True)[0] - idx_nonde = np.where(sim.params["isDE"] == False)[0] + idx_de = np.where(self.isDE)[0] + idx_nonde = np.where(np.logical_not(self.isDE))[0] frac_de_of_non_de = np.sum(test.qval[idx_nonde] < 0.05) / len(idx_nonde) frac_de_of_de = np.sum(test.qval[idx_de] < 0.05) / len(idx_de) @@ -82,10 +80,9 @@ def _test_rank_de( ) test = de.test.rank_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=sim.sample_description, - dtype="float64" + grouping="condition" ) self._eval(sim=sim, test=test) @@ -112,10 +109,9 @@ def _test_t_test_de( ) test = de.test.t_test( - data=sim.X, + data=sim.x, grouping="condition", - sample_description=sim.sample_description, - dtype="float64" + sample_description=sim.sample_description ) self._eval(sim=sim, test=test) @@ -144,10 +140,10 @@ def _test_wald_de( ) test = de.test.wald( - data=sim.X, + data=sim.input_data, + sample_description=sim.sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", - sample_description=sim.sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" @@ -179,12 +175,12 @@ def _test_lrt_de( ) test = de.test.lrt( - data=sim.X, + data=sim.input_data, + sample_description=sim.sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", reduced_formula_loc="~ 1", reduced_formula_scale="~ 1", - sample_description=sim.sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" @@ -209,6 +205,11 @@ def test_ttest_de( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_t_test_de( n_cells=n_cells, n_genes=n_genes @@ -223,6 +224,11 @@ def test_rank_de( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_rank_de( n_cells=n_cells, n_genes=n_genes @@ -243,6 +249,11 @@ def test_wald_de_nb( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_wald_de( n_cells=n_cells, n_genes=n_genes, @@ -258,6 +269,11 @@ def test_lrt_de_nb( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_lrt_de( n_cells=n_cells, n_genes=n_genes, @@ -279,6 +295,11 @@ def test_wald_de_norm( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_wald_de( n_cells=n_cells, n_genes=n_genes, @@ -294,6 +315,11 @@ def test_lrt_de_norm( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) return self._test_lrt_de( n_cells=n_cells, n_genes=n_genes, diff --git a/diffxpy/unit_test/test_single_external_libs.py b/diffxpy/unit_test/test_single_external_libs.py index 0590c2a..4f5bcaa 100644 --- a/diffxpy/unit_test/test_single_external_libs.py +++ b/diffxpy/unit_test/test_single_external_libs.py @@ -62,7 +62,7 @@ def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) test = de.test.t_test( - data=sim.X, + data=sim.x, grouping="condition", sample_description=sim.sample_description, dtype="float64" @@ -92,7 +92,7 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) test = de.test.rank_test( - data=sim.X, + data=sim.x, grouping="condition", sample_description=sim.sample_description, dtype="float64" diff --git a/diffxpy/unit_test/test_single_null.py b/diffxpy/unit_test/test_single_null.py index 34ca47d..72b5157 100644 --- a/diffxpy/unit_test/test_single_null.py +++ b/diffxpy/unit_test/test_single_null.py @@ -37,21 +37,21 @@ def _test_null_distribution_wald( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations), - "batch": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) }) test = de.test.wald( - data=sim.X, + data=sim.input_data, + sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", - sample_description=random_sample_description, batch_size=500, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -89,14 +89,14 @@ def _test_null_distribution_wald_multi( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(4, size=sim.num_observations) + "condition": np.random.randint(4, size=sim.nobs) }) test = de.test.wald( - data=sim.X, + data=sim.input_data, + sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", - sample_description=random_sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" @@ -139,21 +139,21 @@ def _test_null_distribution_lrt( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.lrt( - data=sim.X, + data=sim.input_data, + sample_description=random_sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", reduced_formula_loc="~ 1", reduced_formula_scale="~ 1", - sample_description=random_sample_description, noise_model=noise_model, training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -184,17 +184,16 @@ def _test_null_distribution_ttest( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.t_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, - is_logged=False, - dtype="float64" + grouping="condition", + is_logged=False ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -225,16 +224,15 @@ def _test_null_distribution_rank( sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.rank_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, - dtype="float64" + grouping="condition" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -265,6 +263,7 @@ def test_null_distribution_ttest( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_ttest( n_cells=n_cells, n_genes=n_genes @@ -285,6 +284,7 @@ def test_null_distribution_rank( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_rank( n_cells=n_cells, n_genes=n_genes @@ -312,6 +312,7 @@ def test_null_distribution_wald_nb( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald( n_cells=n_cells, n_genes=n_genes, @@ -334,6 +335,7 @@ def test_null_distribution_wald_multi_nb( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald_multi( n_cells=n_cells, n_genes=n_genes, @@ -355,6 +357,7 @@ def test_null_distribution_lrt_nb( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_lrt( n_cells=n_cells, n_genes=n_genes, @@ -382,6 +385,7 @@ def test_null_distribution_wald_norm( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald( n_cells=n_cells, n_genes=n_genes, @@ -404,6 +408,7 @@ def test_null_distribution_wald_multi_norm( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_wald_multi( n_cells=n_cells, n_genes=n_genes, @@ -425,11 +430,13 @@ def test_null_distribution_lrt_norm( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) return self._test_null_distribution_lrt( n_cells=n_cells, n_genes=n_genes, noise_model="norm" ) + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_vsrest.py b/diffxpy/unit_test/test_vsrest.py index 28a56cf..9a78505 100644 --- a/diffxpy/unit_test/test_vsrest.py +++ b/diffxpy/unit_test/test_vsrest.py @@ -29,11 +29,11 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="wald", noise_model="nb", @@ -72,11 +72,11 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100): sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.num_observations) + "condition": np.random.randint(2, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="lrt", noise_model="nb", @@ -115,11 +115,11 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="rank", sample_description=random_sample_description, @@ -155,11 +155,11 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, sim.generate() random_sample_description = pd.DataFrame({ - "condition": np.random.randint(n_groups, size=sim.num_observations) + "condition": np.random.randint(n_groups, size=sim.nobs) }) test = de.test.versus_rest( - data=sim.X, + data=sim.x, grouping="condition", test="t-test", sample_description=random_sample_description, diff --git a/setup.py b/setup.py index 2f43452..6ba511f 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,11 @@ long_description_content_type="text/markdown", packages=find_packages(), install_requires=[ - 'numpy>=1.14.0', - 'scipy', + 'numpy==1.16.4', + 'scipy>=1.2.1', 'pandas', 'patsy>=0.5.0', 'batchglm>=0.6.1', - 'xarray', 'statsmodels', ], extras_require={ From 5a5108e9ce7f8662689081bf39109f331dd44fab Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 12:42:16 +0200 Subject: [PATCH 03/15] fixed indexing of covar matrix in testing --- diffxpy/stats/stats.py | 4 +++- diffxpy/testing/det.py | 8 +++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index e57aa99..afba800 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -33,7 +33,7 @@ def likelihood_ratio_test( delta_df = df_full - df_reduced # Compute the deviance test statistic. delta_dev = 2 * (ll_full - ll_reduced) - # Compute the p-values based on the deviance and its expection based on the chi-square distribution. + # Compute the p-values based on the deviance and its expectation based on the chi-square distribution. pvals = 1 - scipy.stats.chi2(delta_df).cdf(delta_dev) return pvals @@ -251,6 +251,8 @@ def wald_test_chisq( if theta_mle.shape[0] != theta_covar.shape[1]: raise ValueError( 'stats.wald_test(): theta_mle and theta_covar have to contain the same number of parameters') + if len(theta_covar.shape) != 3: + raise ValueError('stats.wald_test(): theta_covar should have 3 dimensions but has %i' % len(theta_covar.shape)) if theta_mle.shape[1] != theta_covar.shape[0]: raise ValueError('stats.wald_test(): theta_mle and theta_covar have to contain the same number of genes') if theta_covar.shape[1] != theta_covar.shape[2]: diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index ba0f79c..a061bf1 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -791,8 +791,7 @@ def _test(self): if len(self.coef_loc_totest) == 1: self.theta_mle = self.theta_mle[0] # Make xarray one dimensional for stats.wald_test. self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]] - self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, - where=self.theta_sd < np.nextafter(0, np.inf)) + self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf)) self.theta_sd = np.sqrt(self.theta_sd) return stats.wald_test( theta_mle=self.theta_mle, @@ -801,12 +800,11 @@ def _test(self): ) else: self.theta_sd = np.diagonal(self.model_estim.fisher_inv, axis1=-2, axis2=-1).copy() - self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, - where=self.theta_sd < np.nextafter(0, np.inf)) + self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf)) self.theta_sd = np.sqrt(self.theta_sd) return stats.wald_test_chisq( theta_mle=self.theta_mle, - theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, self.coef_loc_totest], + theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, :][:, :, self.coef_loc_totest], theta0=0 ) From 8dbba49fa4f5876068bcf5c0779a4be249c1a528 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 13:03:36 +0200 Subject: [PATCH 04/15] fixed t-test and rank test --- diffxpy/testing/det.py | 14 +++++++++++--- diffxpy/testing/tests.py | 2 +- diffxpy/testing/utils.py | 1 + diffxpy/unit_test/test_single_de.py | 10 +++++----- diffxpy/unit_test/test_single_null.py | 7 ++++--- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index a061bf1..bd99fc6 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -14,7 +14,7 @@ except ImportError: anndata = None -from batchglm.models.base import _EstimatorBase +from batchglm.models.base import _EstimatorBase, _InputDataBase from ..stats import stats from . import correction @@ -1518,6 +1518,10 @@ def __init__( is_sig_zerovar: bool = True ): super().__init__() + if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw): + data = data.X + elif isinstance(data, _InputDataBase): + data = data.x self._x = data self.sample_description = sample_description self.grouping = grouping @@ -1652,6 +1656,10 @@ def __init__( is_sig_zerovar: bool = True ): super().__init__() + if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw): + data = data.X + elif isinstance(data, _InputDataBase): + data = data.x self._x = data self.sample_description = sample_description self.grouping = grouping @@ -1680,8 +1688,8 @@ def __init__( # TODO: can this be done on sparse? pval = np.zeros([data.shape[1]]) + np.nan pval[idx_run] = stats.mann_whitney_u_test( - x0=x0.x[:, idx_run].toarray(), - x1=x1.x[:, idx_run].toarray() + x0=np.asarray(x0[:, idx_run]), + x1=np.asarray(x1[:, idx_run]) ) pval[np.where(np.logical_and( np.logical_and(mean_x0 == mean_x1, self._mean > 0), diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index c573d45..7c09f03 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -209,7 +209,7 @@ def lrt( noise_model="nb", size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray] = None, batch_size: int = None, - training_strategy: Union[str, List[Dict[str, object]], Callable] = "DEFAULT", + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", quick_scale: bool = False, dtype="float64", **kwargs diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py index 7b7d8d3..3eb525e 100644 --- a/diffxpy/testing/utils.py +++ b/diffxpy/testing/utils.py @@ -103,6 +103,7 @@ def parse_grouping(data, sample_description, grouping): def split_x(data, grouping): + grouping = np.asarray(grouping) groups = np.unique(grouping) x0 = data[np.where(grouping == groups[0])[0]] x1 = data[np.where(grouping == groups[1])[0]] diff --git a/diffxpy/unit_test/test_single_de.py b/diffxpy/unit_test/test_single_de.py index 6beb78a..c6b6681 100644 --- a/diffxpy/unit_test/test_single_de.py +++ b/diffxpy/unit_test/test_single_de.py @@ -5,7 +5,7 @@ import diffxpy.api as de -class _TestSingleDE: +class _TestSingleDe: def _prepare_data( self, @@ -109,7 +109,7 @@ def _test_t_test_de( ) test = de.test.t_test( - data=sim.x, + data=sim.input_data, grouping="condition", sample_description=sim.sample_description ) @@ -191,7 +191,7 @@ def _test_lrt_de( return True -class TestSingleDE_STANDARD(_TestSingleDE, unittest.TestCase): +class TestSingleDeStandard(_TestSingleDe, unittest.TestCase): """ Noise model-independent tests unit tests that tests false positive and false negative rates. """ @@ -235,7 +235,7 @@ def test_rank_de( ) -class TestSingleDE_NB(_TestSingleDE, unittest.TestCase): +class TestSingleDeNb(_TestSingleDe, unittest.TestCase): """ Negative binomial noise model unit tests that tests false positive and false negative rates. """ @@ -281,7 +281,7 @@ def test_lrt_de_nb( ) -class TestSingleDE_NORM(_TestSingleDE, unittest.TestCase): +class TestSingleDeNorm(_TestSingleDe, unittest.TestCase): """ Normal noise model unit tests that tests false positive and false negative rates. """ diff --git a/diffxpy/unit_test/test_single_null.py b/diffxpy/unit_test/test_single_null.py index 72b5157..25bd8bb 100644 --- a/diffxpy/unit_test/test_single_null.py +++ b/diffxpy/unit_test/test_single_null.py @@ -101,7 +101,7 @@ def _test_null_distribution_wald_multi( training_strategy="DEFAULT", dtype="float64" ) - summary = test.summary() + _ = test.summary() # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(test.pval, 'uniform').pvalue @@ -248,6 +248,7 @@ class TestSingleNullStandard(_TestSingleNull, unittest.TestCase): Noise model-independent unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. """ + def test_null_distribution_ttest( self, n_cells: int = 2000, @@ -291,7 +292,7 @@ def test_null_distribution_rank( ) -class TestSingleNullNB(_TestSingleNull, unittest.TestCase): +class TestSingleNullNb(_TestSingleNull, unittest.TestCase): """ Negative binomial noise model unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. @@ -365,7 +366,7 @@ def test_null_distribution_lrt_nb( ) -class TestSingleNullNORM(_TestSingleNull, unittest.TestCase): +class TestSingleNullNorm(_TestSingleNull, unittest.TestCase): """ Normal noise model unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. From ba501739093841cc99f7ebbd5b7b33815659452d Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 13:45:38 +0200 Subject: [PATCH 05/15] fixes to unit tests --- diffxpy/testing/det.py | 2 +- diffxpy/unit_test/test_single_de.py | 13 ++++++++----- diffxpy/unit_test/test_single_null.py | 5 ++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index bd99fc6..b537fe4 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -789,7 +789,7 @@ def _test(self): # with a normal distribution, for multiple parameters, a chi-square distribution is used. self.theta_mle = self.model_estim.a_var[self.coef_loc_totest] if len(self.coef_loc_totest) == 1: - self.theta_mle = self.theta_mle[0] # Make xarray one dimensional for stats.wald_test. + self.theta_mle = self.theta_mle[0] self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]] self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf)) self.theta_sd = np.sqrt(self.theta_sd) diff --git a/diffxpy/unit_test/test_single_de.py b/diffxpy/unit_test/test_single_de.py index c6b6681..8edf03f 100644 --- a/diffxpy/unit_test/test_single_de.py +++ b/diffxpy/unit_test/test_single_de.py @@ -21,8 +21,12 @@ def _prepare_data( """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator + rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape) + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator + rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) @@ -30,22 +34,21 @@ def _prepare_data( sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) sim.generate_params( - rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, - rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) + rand_fn_loc=rand_fn_loc, + rand_fn_scale=rand_fn_scale ) sim.a_var[1, :num_non_de] = 0 sim.b_var[1, :num_non_de] = 0 self.isDE = np.arange(n_genes) >= num_non_de sim.generate_data() - return sim def _eval(self, sim, test): idx_de = np.where(self.isDE)[0] idx_nonde = np.where(np.logical_not(self.isDE))[0] - frac_de_of_non_de = np.sum(test.qval[idx_nonde] < 0.05) / len(idx_nonde) - frac_de_of_de = np.sum(test.qval[idx_de] < 0.05) / len(idx_de) + frac_de_of_non_de = np.mean(test.qval[idx_nonde] < 0.05) + frac_de_of_de = np.mean(test.qval[idx_de] < 0.05) logging.getLogger("diffxpy").info( 'fraction of non-DE genes with q-value < 0.05: %.1f%%' % diff --git a/diffxpy/unit_test/test_single_null.py b/diffxpy/unit_test/test_single_null.py index 25bd8bb..3c9834c 100644 --- a/diffxpy/unit_test/test_single_null.py +++ b/diffxpy/unit_test/test_single_null.py @@ -27,14 +27,17 @@ def _test_null_distribution_wald( """ if noise_model == "nb": from batchglm.api.models.glm_nb import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": from batchglm.api.models.glm_norm import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % noise_model) sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + sim.generate_params(rand_fn_scale=rand_fn_scale) + sim.generate_data() random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.nobs), From ae59388988782ca50560e0cf40e2b575821c964b Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 13:56:17 +0200 Subject: [PATCH 06/15] fixed external lib unit test --- .../unit_test/test_single_external_libs.py | 47 +++++++++---------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/diffxpy/unit_test/test_single_external_libs.py b/diffxpy/unit_test/test_single_external_libs.py index 4f5bcaa..b1d4883 100644 --- a/diffxpy/unit_test/test_single_external_libs.py +++ b/diffxpy/unit_test/test_single_external_libs.py @@ -1,7 +1,6 @@ import unittest import logging import numpy as np -import pandas as pd import scipy.stats as stats from batchglm.api.models.glm_nb import Simulator @@ -18,10 +17,7 @@ def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100): """ sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate_params( - rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, - rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) - ) + sim.generate_params() sim.generate_data() return sim @@ -45,8 +41,8 @@ def _eval(self, test, ref_pvals): 'mean absolute log p-value deviation: %f' % float(mean_dev) ) - assert max_dev < 1e-3, "maximum deviation too large" - assert max_log_dev < 1e-1, "maximum deviation in log space too large" + assert max_dev < 1e-3, "maximum deviation too large: %f" % max_dev + assert max_log_dev < 1e-1, "maximum deviation in log space too large: %f" % max_log_dev def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): """ @@ -59,23 +55,25 @@ def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.INFO) + np.random.seed(1) sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) - test = de.test.t_test( - data=sim.x, + data=sim.input_data, grouping="condition", - sample_description=sim.sample_description, - dtype="float64" + sample_description=sim.sample_description ) # Run scipy t-tests as a reference. conds = np.unique(sim.sample_description["condition"].values) ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] - scipy_pvals = stats.ttest_ind(a=sim.X[ind_a, :], b=sim.X[ind_b, :], axis=0, equal_var=False).pvalue - + scipy_pvals = stats.ttest_ind( + a=sim.x[ind_a, :], + b=sim.x[ind_b, :], + axis=0, + equal_var=False + ).pvalue self._eval(test=test, ref_pvals=scipy_pvals) - return True def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): @@ -89,13 +87,12 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.INFO) + np.random.seed(1) sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) - test = de.test.rank_test( - data=sim.x, + data=sim.input_data, grouping="condition", - sample_description=sim.sample_description, - dtype="float64" + sample_description=sim.sample_description ) # Run scipy t-tests as a reference. @@ -103,13 +100,15 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] scipy_pvals = np.array([ - stats.mannwhitneyu(x=sim.X[ind_a, i], y=sim.X[ind_b, i], - use_continuity=True, alternative="two-sided").pvalue - for i in range(sim.X.shape[1]) - ]) - + stats.mannwhitneyu( + x=sim.x[ind_a, i], + y=sim.x[ind_b, i], + use_continuity=True, + alternative="two-sided" + ).pvalue + for i in range(sim.x.shape[1]) + ]) self._eval(test=test, ref_pvals=scipy_pvals) - return True From a4d3a5f7cace891556a65fad8a9415bc6f0ab865 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 14:25:41 +0200 Subject: [PATCH 07/15] more work on unit tests --- diffxpy/stats/stats.py | 5 ++--- diffxpy/unit_test/test_data_types.py | 22 ++++++++-------------- diffxpy/unit_test/test_extreme_values.py | 15 +++++++++------ 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index afba800..1773a12 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -56,14 +56,13 @@ def mann_whitney_u_test( """ axis = 1 if np.any(np.ndim(x0) != np.ndim(x1)): - raise ValueError('stats.wilcoxon(): number of dimensions is not allowed to differ between x0 and x1!') + raise ValueError('number of dimensions is not allowed to differ between x0 and x1') # Reshape into 2D array if only one test is performed. if np.ndim(x0) == 1: x0 = x0.reshape([x0.shape[0], 1]) x1 = x1.reshape([x1.shape[0], 1]) if np.any(x0.shape[axis] != x1.shape[axis]): - raise ValueError( - 'stats.wilcoxon(): the first axis (number of tests) is not allowed to differ between x0 and x1!') + raise ValueError('the first axis (number of tests) is not allowed to differ between x0 and x1') pvals = np.asarray([ scipy.stats.mannwhitneyu( diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index 4859e6a..edf42dc 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -15,44 +15,38 @@ class TestDataTypesSingle(unittest.TestCase): def _test_wald(self, data, sample_description, gene_names=None): test = de.test.wald( data=data, - factor_loc_totest="condition", - formula_loc="~ 1 + condition", sample_description=sample_description, gene_names=gene_names, - quick_scale=True, - training_strategy="DEFAULT", - dtype="float64" + factor_loc_totest="condition", + formula_loc="~ 1 + condition" ) _ = test.summary() def _test_lrt(self, data, sample_description, gene_names=None): test = de.test.lrt( data=data, - full_formula_loc="~ 1 + condition", - reduced_formula_loc="~ 1", sample_description=sample_description, gene_names=gene_names, - quick_scale=True, - training_strategy="DEFAULT", - dtype="float64" + full_formula_loc="~ 1 + condition", + reduced_formula_loc="~ 1" ) _ = test.summary() def _test_t_test(self, data, sample_description, gene_names=None): test = de.test.t_test( data=data, - grouping="condition", sample_description=sample_description, - gene_names=gene_names + gene_names=gene_names, + grouping="condition" ) _ = test.summary() def _test_rank(self, data, sample_description, gene_names=None): test = de.test.rank_test( data=data, - grouping="condition", sample_description=sample_description, - gene_names=gene_names + gene_names=gene_names, + grouping="condition" ) _ = test.summary() diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py index bf5cd9b..4564dea 100644 --- a/diffxpy/unit_test/test_extreme_values.py +++ b/diffxpy/unit_test/test_extreme_values.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import scipy.stats as stats from batchglm.api.models.glm_nb import Simulator import diffxpy.api as de @@ -19,6 +18,8 @@ def test_t_test_zero_variance(self): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) + de.pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG = True sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() @@ -30,14 +31,14 @@ def test_t_test_zero_variance(self): }) test = de.test.t_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, + grouping="condition", is_sig_zerovar=True ) assert np.isnan(test.pval[0]) and test.pval[1] == 1, \ - "rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \ + "t test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \ (test.pval[0], test.pval[1]) return True @@ -49,6 +50,8 @@ def test_rank_test_zero_variance(self): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) + np.random.seed(1) + de.pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG = True sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() @@ -60,9 +63,9 @@ def test_rank_test_zero_variance(self): }) test = de.test.rank_test( - data=sim.X, - grouping="condition", + data=sim.input_data, sample_description=random_sample_description, + grouping="condition", is_sig_zerovar=True ) From f75c373f99e4e6594c9f3cb76c6a20b47b2d2a64 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 14:31:12 +0200 Subject: [PATCH 08/15] fixed extreme value handling in t test --- diffxpy/pkg_constants.py | 2 -- diffxpy/testing/det.py | 30 +++++++----------------- diffxpy/unit_test/test_extreme_values.py | 2 -- 3 files changed, 8 insertions(+), 26 deletions(-) diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py index 389f621..399a45d 100644 --- a/diffxpy/pkg_constants.py +++ b/diffxpy/pkg_constants.py @@ -1,5 +1,3 @@ -DE_TREAT_ZEROVAR_TT_AS_SIG = True - BATCHGLM_OPTIM_GD = False BATCHGLM_OPTIM_ADAM = False BATCHGLM_OPTIM_ADAGRAD = False diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index b537fe4..377d3a7 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -1561,6 +1561,10 @@ def __init__( np.logical_and(mean_x0 == mean_x1, self._mean > 0), np.logical_not(self._var_geq_zero) ))[0]] = 1.0 + # Depening on user choice via is_sig_zerovar: + # Set p-value to 0 if LFC was non-zero and variances are zero, + # this causes division by zero in the test statistic. This + # is a highly significant result if one believes the variance estimate. if is_sig_zerovar: pval[np.where(np.logical_and( mean_x0 != mean_x1, @@ -1573,28 +1577,6 @@ def __init__( self._logfc = mean_x1 - mean_x0 else: self._logfc = np.log(mean_x1) - np.log(mean_x0) - # Return 0 if LFC was non-zero and variances are zero, - # this causes division by zero in the test statistic. This - # is a highly significant result if one believes the variance estimate. - # This is the default which can be changed and can be changed - # via DIFFXPY_TREAT_ZEROVAR_TT_AS_SIG. - pval[np.where(np.logical_and(np.logical_and( - np.logical_not(self._var_geq_zero), - self._ave_nonzero), - np.abs(self._logfc) < np.nextafter(0, 1) - ))] = 0 - if pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG: - pval[np.where(np.logical_and(np.logical_and( - np.logical_not(self._var_geq_zero), - self._ave_nonzero), - np.abs(self._logfc) >= np.nextafter(0, 1) - ))] = 1 - else: - pval[np.where(np.logical_and(np.logical_and( - np.logical_not(self._var_geq_zero), - self._ave_nonzero), - np.abs(self._logfc) >= np.nextafter(0, 1) - ))] = 0 @property def gene_ids(self) -> np.ndarray: @@ -1695,6 +1677,10 @@ def __init__( np.logical_and(mean_x0 == mean_x1, self._mean > 0), np.logical_not(self._var_geq_zero) ))[0]] = 1.0 + # Depening on user choice via is_sig_zerovar: + # Set p-value to 0 if LFC was non-zero and variances are zero, + # this causes division by zero in the test statistic. This + # is a highly significant result if one believes the variance estimate. if is_sig_zerovar: pval[np.where(np.logical_and( mean_x0 != mean_x1, diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py index 4564dea..c430784 100644 --- a/diffxpy/unit_test/test_extreme_values.py +++ b/diffxpy/unit_test/test_extreme_values.py @@ -19,7 +19,6 @@ def test_t_test_zero_variance(self): logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - de.pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG = True sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() @@ -51,7 +50,6 @@ def test_rank_test_zero_variance(self): logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - de.pkg_constants.DE_TREAT_ZEROVAR_TT_AS_SIG = True sim = Simulator(num_observations=1000, num_features=10) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() From 60356245d7a2758ca152286a395d6e478cf43bb8 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 15:01:47 +0200 Subject: [PATCH 09/15] fixed further unit tests --- diffxpy/unit_test/test_data_types.py | 9 ++++++--- .../unit_test/{test_numeric.py => test_numeric_covar.py} | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) rename diffxpy/unit_test/{test_numeric.py => test_numeric_covar.py} (94%) diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index edf42dc..f4874bb 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -18,7 +18,9 @@ def _test_wald(self, data, sample_description, gene_names=None): sample_description=sample_description, gene_names=gene_names, factor_loc_totest="condition", - formula_loc="~ 1 + condition" + formula_loc="~ 1 + condition", + noise_model="nb", + batch_size=5 ) _ = test.summary() @@ -28,7 +30,8 @@ def _test_lrt(self, data, sample_description, gene_names=None): sample_description=sample_description, gene_names=gene_names, full_formula_loc="~ 1 + condition", - reduced_formula_loc="~ 1" + reduced_formula_loc="~ 1", + noise_model="nb" ) _ = test.summary() @@ -50,7 +53,7 @@ def _test_rank(self, data, sample_description, gene_names=None): ) _ = test.summary() - def simulate(self, n_cells: int = 20, n_genes: int = 2): + def simulate(self, n_cells: int = 200, n_genes: int = 2): sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() diff --git a/diffxpy/unit_test/test_numeric.py b/diffxpy/unit_test/test_numeric_covar.py similarity index 94% rename from diffxpy/unit_test/test_numeric.py rename to diffxpy/unit_test/test_numeric_covar.py index 7a05c60..2c3bedd 100644 --- a/diffxpy/unit_test/test_numeric.py +++ b/diffxpy/unit_test/test_numeric_covar.py @@ -7,7 +7,7 @@ import diffxpy.api as de -class TestNumeric(unittest.TestCase): +class TestNumericCovar(unittest.TestCase): def test(self): """ @@ -29,7 +29,7 @@ def test(self): sample_description["numeric2"] = np.random.random(size=sim.nobs) test = de.test.wald( - data=sim.x, + data=sim.input_data, sample_description=sample_description, formula_loc="~ 1 + condition + numeric1 + numeric2", formula_scale="~ 1", From b2f4ed794c505c818b2d589bf2d7642a017e1c5c Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 15:28:19 +0200 Subject: [PATCH 10/15] fixed new xarray free handling in ttest and rank test --- diffxpy/stats/stats.py | 19 +++++++++--------- diffxpy/testing/det.py | 45 ++++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index 1773a12..5b8f8fe 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -1,8 +1,8 @@ -from typing import Union - import numpy as np import numpy.linalg +import scipy.sparse import scipy.stats +from typing import Union def likelihood_ratio_test( @@ -39,19 +39,18 @@ def likelihood_ratio_test( def mann_whitney_u_test( - x0: np.ndarray, - x1: np.ndarray, + x0: Union[np.ndarray, scipy.sparse.csr_matrix], + x1: Union[np.ndarray, scipy.sparse.csr_matrix] ): """ - Perform Wilcoxon rank sum test (Mann-Whitney U test) along second axis - (for each gene). + Perform Wilcoxon rank sum test (Mann-Whitney U test) along second axis, ie. for each gene. The Wilcoxon rank sum test is a non-parameteric test to compare two groups of observations. - :param x0: np.array (observations x genes) + :param x0: (observations x genes) Observations in first group by gene - :param x1: np.array (observations x genes) + :param x1: (observations x genes) Observations in second group by gene. """ axis = 1 @@ -66,8 +65,8 @@ def mann_whitney_u_test( pvals = np.asarray([ scipy.stats.mannwhitneyu( - x=x0[:, i].flatten(), - y=x1[:, i].flatten(), + x=np.asarray(x0[:, i].todense()).flatten() if isinstance(x0, scipy.sparse.csr_matrix) else x0[:, i], + y=np.asarray(x1[:, i].todense()).flatten() if isinstance(x0, scipy.sparse.csr_matrix) else x1[:, i], use_continuity=True, alternative="two-sided" ).pvalue for i in range(x0.shape[1]) diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index 377d3a7..d446713 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -3,6 +3,7 @@ from typing import Union, Dict, Tuple, List, Set import pandas as pd from random import sample +import scipy.sparse import numpy as np import patsy @@ -521,7 +522,7 @@ def _ave(self): :return: np.ndarray """ - return np.mean(self.full_estim.x, axis=0) + return np.asarray(np.mean(self.full_estim.x, axis=0)).flatten() def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): """ @@ -715,16 +716,16 @@ def __init__( self._store_ols = None try: - if model_estim._error_codes is not None: - self._error_codes = model_estim._error_codes + if self.model_estim.error_codes is not None: + self._error_codes = self.model_estim.error_codes else: self._error_codes = None except Exception as e: self._error_codes = None try: - if model_estim._niter is not None: - self._niter = model_estim._niter + if self.model_estim.niter is not None: + self._niter = self.model_estim.niter else: self._niter = None except Exception as e: @@ -776,7 +777,7 @@ def _ave(self): :return: np.ndarray """ - return self.x.mean(axis=0) + return np.asarray(self.x.mean(axis=0)).flatten() def _test(self): """ @@ -1530,8 +1531,8 @@ def __init__( x0, x1 = split_x(data, grouping) # Only compute p-values for genes with non-zero observations and non-zero group-wise variance. - mean_x0 = x0.mean(axis=0).astype(dtype=np.float) - mean_x1 = x1.mean(axis=0).astype(dtype=np.float) + mean_x0 = np.asarray(np.mean(x0, axis=0)).flatten().astype(dtype=np.float) + mean_x1 = np.asarray(np.mean(x1, axis=0)).flatten().astype(dtype=np.float) # Avoid unnecessary mean computation: self._mean = np.average( a=np.vstack([mean_x0, mean_x1]), @@ -1541,8 +1542,13 @@ def __init__( returned=False ) self._ave_nonzero = self._mean != 0 # omit all-zero features - var_x0 = np.asarray(x0.var(axis=0)).flatten().astype(dtype=np.float) - var_x1 = np.asarray(x1.var(axis=0)).flatten().astype(dtype=np.float) + if isinstance(x0, scipy.sparse.csr_matrix): + # Efficient analytic expression of variance without densification. + var_x0 = np.asarray(np.mean(x0.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x0) + var_x1 = np.asarray(np.mean(x1.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x1) + else: + var_x0 = np.asarray(np.var(x0, axis=0)).flatten().astype(dtype=np.float) + var_x1 = np.asarray(np.var(x1, axis=0)).flatten().astype(dtype=np.float) self._var_geq_zero = np.logical_or( var_x0 > 0, var_x1 > 0 @@ -1649,8 +1655,8 @@ def __init__( x0, x1 = split_x(data, grouping) - mean_x0 = x0.mean(axis=0).astype(dtype=np.float) - mean_x1 = x1.mean(axis=0).astype(dtype=np.float) + mean_x0 = np.asarray(np.mean(x0, axis=0)).flatten().astype(dtype=np.float) + mean_x1 = np.asarray(np.mean(x1, axis=0)).flatten().astype(dtype=np.float) # Avoid unnecessary mean computation: self._mean = np.average( a=np.vstack([mean_x0, mean_x1]), @@ -1659,19 +1665,24 @@ def __init__( axis=0, returned=False ) - var_x0 = np.asarray(x0.var(axis=0)).flatten().astype(dtype=np.float) - var_x1 = np.asarray(x1.var(axis=0)).flatten().astype(dtype=np.float) + if isinstance(x0, scipy.sparse.csr_matrix): + # Efficient analytic expression of variance without densification. + var_x0 = np.asarray(np.mean(x0.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x0) + var_x1 = np.asarray(np.mean(x1.power(2), axis=0)).flatten().astype(dtype=np.float) - np.square(mean_x1) + else: + var_x0 = np.asarray(np.var(x0, axis=0)).flatten().astype(dtype=np.float) + var_x1 = np.asarray(np.var(x1, axis=0)).flatten().astype(dtype=np.float) self._var_geq_zero = np.logical_or( var_x0 > 0, var_x1 > 0 ) idx_run = np.where(np.logical_and(self._mean != 0, self._var_geq_zero))[0] - # TODO: can this be done on sparse? + # TODO: can this be done directly on sparse? pval = np.zeros([data.shape[1]]) + np.nan pval[idx_run] = stats.mann_whitney_u_test( - x0=np.asarray(x0[:, idx_run]), - x1=np.asarray(x1[:, idx_run]) + x0=x0[:, idx_run], + x1=x1[:, idx_run] ) pval[np.where(np.logical_and( np.logical_and(mean_x0 == mean_x1, self._mean > 0), From be6105a051f80d792fe7686a81bdfa59605d58b3 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 15:47:45 +0200 Subject: [PATCH 11/15] added fitting module --- diffxpy/api/__init__.py | 1 + diffxpy/api/fit.py | 1 + diffxpy/fit/__init__.py | 1 + diffxpy/fit/external.py | 3 + diffxpy/fit/fit.py | 400 ++++++++++++++++++++++++++++++++++ diffxpy/testing/tests.py | 12 +- diffxpy/unit_test/test_fit.py | 198 +++++++++++++++++ 7 files changed, 609 insertions(+), 7 deletions(-) create mode 100644 diffxpy/api/fit.py create mode 100644 diffxpy/fit/__init__.py create mode 100644 diffxpy/fit/external.py create mode 100644 diffxpy/fit/fit.py create mode 100644 diffxpy/unit_test/test_fit.py diff --git a/diffxpy/api/__init__.py b/diffxpy/api/__init__.py index a1c1dad..18dd1b4 100644 --- a/diffxpy/api/__init__.py +++ b/diffxpy/api/__init__.py @@ -3,6 +3,7 @@ from . import test from . import enrich +from . import fit from . import stats from . import utils from .. import pkg_constants diff --git a/diffxpy/api/fit.py b/diffxpy/api/fit.py new file mode 100644 index 0000000..5a00580 --- /dev/null +++ b/diffxpy/api/fit.py @@ -0,0 +1 @@ +from diffxpy.fit import model, residuals diff --git a/diffxpy/fit/__init__.py b/diffxpy/fit/__init__.py new file mode 100644 index 0000000..25876c4 --- /dev/null +++ b/diffxpy/fit/__init__.py @@ -0,0 +1 @@ +from .fit import model, residuals \ No newline at end of file diff --git a/diffxpy/fit/external.py b/diffxpy/fit/external.py new file mode 100644 index 0000000..ec3af87 --- /dev/null +++ b/diffxpy/fit/external.py @@ -0,0 +1,3 @@ +from diffxpy.testing.tests import _fit +from diffxpy.testing.utils import parse_gene_names, parse_sample_description, parse_size_factors, \ + constraint_system_from_star \ No newline at end of file diff --git a/diffxpy/fit/fit.py b/diffxpy/fit/fit.py new file mode 100644 index 0000000..9f02809 --- /dev/null +++ b/diffxpy/fit/fit.py @@ -0,0 +1,400 @@ +import anndata +try: + from anndata.base import Raw +except ImportError: + from anndata import Raw +import logging +import numpy as np +import pandas as pd +import patsy +import scipy.sparse +from typing import Union, List, Dict, Callable, Tuple + +from batchglm.models.base import _InputDataBase +from .external import _fit +from .external import parse_gene_names, parse_sample_description, parse_size_factors, constraint_system_from_star + + +def model( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + gene_names: Union[np.ndarray, list] = None, + sample_description: Union[None, pd.DataFrame] = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None, + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs +): + """ + Fit model via maximum likelihood for each gene. + + Relays to _fit with the same interface as the differential expression tests, ie. does design matrix and + constraint parsing. + + :param data: Input data matrix (observations x features) or (cells x genes). + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + if len(kwargs) != 0: + logging.getLogger("diffxpy").debug("additional kwargs: %s", str(kwargs)) + + if (dmat_loc is None and formula_loc is None) or \ + (dmat_loc is not None and formula_loc is not None): + raise ValueError("Supply either dmat_loc or formula_loc.") + if (dmat_scale is None and formula_scale is None) or \ + (dmat_scale is not None and formula_scale != "~1"): + raise ValueError("Supply either dmat_scale or formula_scale.") + + # # Parse input data formats: + gene_names = parse_gene_names(data, gene_names) + if dmat_loc is None and dmat_scale is None: + sample_description = parse_sample_description(data, sample_description) + size_factors = parse_size_factors( + size_factors=size_factors, + data=data, + sample_description=sample_description + ) + + design_loc, constraints_loc = constraint_system_from_star( + dmat=dmat_loc, + sample_description=sample_description, + formula=formula_loc, + as_numeric=as_numeric, + constraints=constraints_loc, + return_type="patsy" + ) + design_scale, constraints_scale = constraint_system_from_star( + dmat=dmat_scale, + sample_description=sample_description, + formula=formula_scale, + as_numeric=as_numeric, + constraints=constraints_scale, + return_type="patsy" + ) + + model = _fit( + noise_model=noise_model, + data=data, + design_loc=design_loc, + design_scale=design_scale, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + init_a=init_a, + init_b=init_b, + gene_names=gene_names, + size_factors=size_factors, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype + ) + return model + + +def residuals( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + gene_names: Union[np.ndarray, list] = None, + sample_description: Union[None, pd.DataFrame] = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None, + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs +): + """ + Fits model for each gene and returns residuals. + + Relays to _fit with the same interface as the differential expression tests, ie. does design matrix and + constraint parsing. + + :param data: Input data matrix (observations x features) or (cells x genes). + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + estim = model( + data=data, + formula_loc=formula_loc, + formula_scale=formula_scale, + as_numeric=as_numeric, + init_a=init_a, + init_b=init_b, + gene_names=gene_names, + sample_description=sample_description, + dmat_loc=dmat_loc, + dmat_scale=dmat_scale, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + noise_model=noise_model, + size_factors=size_factors, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype, + ** kwargs + ) + residuals = estim.x - estim.model.location + return residuals diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index 7c09f03..f4d8107 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -1,16 +1,14 @@ -from typing import Union, List, Dict, Callable, Tuple - import anndata +try: + from anndata.base import Raw +except ImportError: + from anndata import Raw import logging import numpy as np import pandas as pd import patsy import scipy.sparse - -try: - from anndata.base import Raw -except ImportError: - from anndata import Raw +from typing import Union, List, Dict, Callable, Tuple from batchglm import data as data_utils from batchglm.models.base import _EstimatorBase, _InputDataBase diff --git a/diffxpy/unit_test/test_fit.py b/diffxpy/unit_test/test_fit.py new file mode 100644 index 0000000..a2d65b4 --- /dev/null +++ b/diffxpy/unit_test/test_fit.py @@ -0,0 +1,198 @@ +import unittest +import logging +import numpy as np +import pandas as pd + +import diffxpy.api as de + + +class _TestFit: + + def _test_model_fit( + self, + n_cells: int, + n_genes: int, + noise_model: str + ): + """ + Test if de.wald() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distribution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + :param noise_model: Noise model to use for data fitting. + """ + if noise_model == "nb": + from batchglm.api.models.glm_nb import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif noise_model == "norm": + from batchglm.api.models.glm_norm import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + else: + raise ValueError("noise model %s not recognized" % noise_model) + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate_params(rand_fn_scale=rand_fn_scale) + sim.generate_data() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) + }) + + estim = de.fit.model( + data=sim.input_data, + sample_description=random_sample_description, + formula_loc="~ 1 + condition + batch", + noise_model=noise_model + ) + return True + + def _test_residuals_fit( + self, + n_cells: int, + n_genes: int, + noise_model: str + ): + """ + Test if de.wald() (multivariate mode) generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distribution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + :param noise_model: Noise model to use for data fitting. + """ + if noise_model == "nb": + from batchglm.api.models.glm_nb import Simulator + elif noise_model == "norm": + from batchglm.api.models.glm_norm import Simulator + else: + raise ValueError("noise model %s not recognized" % noise_model) + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) + }) + + res = de.fit.residuals( + data=sim.input_data, + sample_description=random_sample_description, + formula_loc="~ 1 + condition + batch", + noise_model=noise_model + ) + return True + + +class TestFitNb(_TestFit, unittest.TestCase): + """ + Negative binomial noise model unit tests that tests whether model fit relay works. + """ + + def test_model_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if wald() generates a uniform p-value distribution for "nb" noise model. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb" + ) + + def test_residuals_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if wald() generates a uniform p-value distribution for "nb" noise model + for multiple coefficients to test. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_residuals_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb" + ) + + +class TestFitNorm(_TestFit, unittest.TestCase): + """ + Normal noise model unit tests that tests whether model fit relay works. + """ + + def test_model_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if wald() generates a uniform p-value distribution for "norm" noise model. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="norm" + ) + + def test_residuals_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if wald() generates a uniform p-value distribution for "norm" noise model + for multiple coefficients to test. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_residuals_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="norm" + ) + + +if __name__ == '__main__': + unittest.main() From dba866aeca1802d1567741d6958bc64c7dbee5c4 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 16:08:26 +0200 Subject: [PATCH 12/15] added partitioned model fitting --- diffxpy/api/fit.py | 2 +- diffxpy/fit/__init__.py | 2 +- diffxpy/fit/external.py | 2 +- diffxpy/fit/fit.py | 425 +++++++++++++++++++++++++++++++++- diffxpy/testing/tests.py | 11 +- diffxpy/unit_test/test_fit.py | 100 +++++++- 6 files changed, 524 insertions(+), 18 deletions(-) diff --git a/diffxpy/api/fit.py b/diffxpy/api/fit.py index 5a00580..f08ce84 100644 --- a/diffxpy/api/fit.py +++ b/diffxpy/api/fit.py @@ -1 +1 @@ -from diffxpy.fit import model, residuals +from diffxpy.fit import model, residuals, partition diff --git a/diffxpy/fit/__init__.py b/diffxpy/fit/__init__.py index 25876c4..d1ddf78 100644 --- a/diffxpy/fit/__init__.py +++ b/diffxpy/fit/__init__.py @@ -1 +1 @@ -from .fit import model, residuals \ No newline at end of file +from .fit import model, residuals, partition \ No newline at end of file diff --git a/diffxpy/fit/external.py b/diffxpy/fit/external.py index ec3af87..330ea9c 100644 --- a/diffxpy/fit/external.py +++ b/diffxpy/fit/external.py @@ -1,3 +1,3 @@ from diffxpy.testing.tests import _fit -from diffxpy.testing.utils import parse_gene_names, parse_sample_description, parse_size_factors, \ +from diffxpy.testing.utils import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \ constraint_system_from_star \ No newline at end of file diff --git a/diffxpy/fit/fit.py b/diffxpy/fit/fit.py index 9f02809..995a4d3 100644 --- a/diffxpy/fit/fit.py +++ b/diffxpy/fit/fit.py @@ -12,7 +12,8 @@ from batchglm.models.base import _InputDataBase from .external import _fit -from .external import parse_gene_names, parse_sample_description, parse_size_factors, constraint_system_from_star +from .external import parse_gene_names, parse_sample_description, parse_size_factors, parse_grouping, \ + constraint_system_from_star def model( @@ -39,9 +40,6 @@ def model( """ Fit model via maximum likelihood for each gene. - Relays to _fit with the same interface as the differential expression tests, ie. does design matrix and - constraint parsing. - :param data: Input data matrix (observations x features) or (cells x genes). :param formula_loc: formula model formula for location and scale parameter models. @@ -247,9 +245,6 @@ def residuals( """ Fits model for each gene and returns residuals. - Relays to _fit with the same interface as the differential expression tests, ie. does design matrix and - constraint parsing. - :param data: Input data matrix (observations x features) or (cells x genes). :param formula_loc: formula model formula for location and scale parameter models. @@ -398,3 +393,419 @@ def residuals( ) residuals = estim.x - estim.model.location return residuals + + +def partition( + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + parts: Union[str, np.ndarray, list], + gene_names: Union[np.ndarray, list] = None, + sample_description: pd.DataFrame = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None +): + r""" + Perform differential expression test for each group. This class handles + the partitioning of the data set, the differential test callls and + the sumamry of the individual tests into one + DifferentialExpressionTestMulti object. All functions the yield + DifferentialExpressionTestSingle objects can be performed on each + partition. + + Wraps _Partition so that doc strings are nice. + + :param data: Array-like or anndata.Anndata object containing observations. + Input data matrix (observations x features) or (cells x genes). + :param parts: str, array + + - column in data.obs/sample_description which contains the split of observations into the two groups. + - array of length `num_observations` containing group labels + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations. + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + """ + return (_Partition( + data=data, + parts=parts, + gene_names=gene_names, + sample_description=sample_description, + dmat_loc=dmat_loc, + dmat_scale=dmat_scale, + size_factors=size_factors + )) + + +class _Partition: + """ + Perform model fit separately for each group. + """ + + def __init__( + self, + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], + parts: Union[str, np.ndarray, list], + gene_names: Union[np.ndarray, list] = None, + sample_description: pd.DataFrame = None, + dmat_loc: Union[patsy.design_info.DesignMatrix] = None, + dmat_scale: Union[patsy.design_info.DesignMatrix] = None, + size_factors: Union[np.ndarray, pd.core.series.Series, str] = None + ): + """ + :param data: Array-like or anndata.Anndata object containing observations. + Input data matrix (observations x features) or (cells x genes). + :param parts: str, array + + - column in data.obs/sample_description which contains the split of observations into the two groups. + - array of length `num_observations` containing group labels + :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these + :param sample_description: optional pandas.DataFrame containing sample annotations. + :param dmat_loc: Pre-built location model design matrix. + This over-rides formula_loc and sample description information given in + data or sample_description. + :param dmat_scale: Pre-built scale model design matrix. + This over-rides formula_scale and sample description information given in + data or sample_description. + :param size_factors: 1D array of transformed library size factors for each cell in the + same order as in data or string-type column identifier of size-factor containing + column in sample description. + """ + if isinstance(data, _InputDataBase): + self.x = data.x + elif isinstance(data, anndata.AnnData) or isinstance(data, Raw): + self.x = data.X + elif isinstance(data, np.ndarray): + self.x = data + else: + raise ValueError("data type %s not recognized" % type(data)) + self.gene_names = parse_gene_names(data, gene_names) + self.sample_description = parse_sample_description(data, sample_description) + self.dmat_loc = dmat_loc + self.dmat_scale = dmat_scale + self.size_factors = size_factors + self.partition = parse_grouping(data, sample_description, parts) + self.partitions = np.unique(self.partition) + self.partition_idx = [np.where(self.partition == x)[0] for x in self.partitions] + + def model( + self, + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs + ): + """ + Fit model for each gene and for each group. + + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + estims = [] + for i, idx in enumerate(self.partition_idx): + estims.append(model( + data=self.x[idx, :], + formula_loc=formula_loc, + formula_scale=formula_scale, + as_numeric=as_numeric, + init_a=init_a, + init_b=init_b, + gene_names=self.gene_names, + sample_description=self.sample_description.iloc[idx, :], + dmat_loc=self.dmat_loc[idx, :] if self.dmat_loc is not None else None, + dmat_scale=self.dmat_scale[idx, :] if self.dmat_scale is not None else None, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + noise_model=noise_model, + size_factors=self.size_factors[idx] if self.size_factors is not None else None, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype, + **kwargs + )) + return estims + + def residuals( + self, + formula_loc: Union[None, str] = None, + formula_scale: Union[None, str] = "~1", + as_numeric: Union[List[str], Tuple[str], str] = (), + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + constraints_loc: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + constraints_scale: Union[None, List[str], Tuple[str, str], dict, np.ndarray] = None, + noise_model: str = "nb", + batch_size: int = None, + training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + quick_scale: bool = False, + dtype="float64", + **kwargs + ): + """ + Fit model for each gene and for each group and computes residuals. + + :param formula_loc: formula + model formula for location and scale parameter models. + If not specified, `formula` will be used instead. + :param formula_scale: formula + model formula for scale parameter model. + If not specified, `formula` will be used instead. + :param as_numeric: + Which columns of sample_description to treat as numeric and + not as categorical. This yields columns in the design matrix + which do not correspond to one-hot encoded discrete factors. + This makes sense for number of genes, time, pseudotime or space + for example. + :param init_a: (Optional) Low-level initial values for a. + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize intercept with observed mean + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) Low-level initial values for b + Can be: + + - str: + * "auto": automatically choose best initialization + * "standard": initialize with zeros + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param constraints_loc: Constraints for location model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the location model, dmat_loc, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param constraints_scale: Constraints for scale model. Can be one of the following: + + - np.ndarray: + Array with constraints in rows and model parameters in columns. + Each constraint contains non-zero entries for the a of parameters that + has to sum to zero. This constraint is enforced by binding one parameter + to the negative sum of the other parameters, effectively representing that + parameter as a function of the other parameters. This dependent + parameter is indicated by a -1 in this array, the independent parameters + of that constraint (which may be dependent at an earlier constraint) + are indicated by a 1. You should only use this option + together with prebuilt design matrix for the scale model, dmat_scale, + for example via de.utils.setup_constrained(). + - dict: + Every element of the dictionary corresponds to one set of equality constraints. + Each set has to be be an entry of the form {..., x: y, ...} + where x is the factor to be constrained and y is a factor by which levels of x are grouped + and then constrained. Set y="1" to constrain all levels of x to sum to one, + a single equality constraint. + + E.g.: {"batch": "condition"} Batch levels within each condition are constrained to sum to + zero. This is applicable if repeats of a an experiment within each condition + are independent so that the set-up ~1+condition+batch is perfectly confounded. + + Can only group by non-constrained effects right now, use constraint_matrix_from_string + for other cases. + - list of strings or tuple of strings: + String encoded equality constraints. + + E.g. ["batch1 + batch2 + batch3 = 0"] + - None: + No constraints are used, this is equivalent to using an identity matrix as a + constraint matrix. + :param noise_model: str, noise model to use in model-based unit_test. Possible options: + + - 'nb': default + :param batch_size: The batch size to use for the estimator. + :param training_strategy: {str, function, list} training strategy to use. Can be: + + - str: will use Estimator.TrainingStrategy[training_strategy] to train + - function: Can be used to implement custom training function will be called as + `training_strategy(estimator)`. + - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of + method arguments. + :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate. + + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Allows specifying the precision which should be used to fit data. + + Should be "float32" for single precision or "float64" for double precision. + :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. + """ + residuals = [] + for i, idx in enumerate(self.partition_idx): + residuals.append(residuals( + data=self.x[idx, :], + formula_loc=formula_loc, + formula_scale=formula_scale, + as_numeric=as_numeric, + init_a=init_a, + init_b=init_b, + gene_names=self.gene_names, + sample_description=self.sample_description.iloc[idx, :], + dmat_loc=self.dmat_loc[idx, :] if self.dmat_loc is not None else None, + dmat_scale=self.dmat_scale[idx, :] if self.dmat_scale is not None else None, + constraints_loc=constraints_loc, + constraints_scale=constraints_scale, + noise_model=noise_model, + size_factors=self.size_factors[idx] if self.size_factors is not None else None, + batch_size=batch_size, + training_strategy=training_strategy, + quick_scale=quick_scale, + dtype=dtype, + **kwargs + )) + return residuals + + diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index f4d8107..bd7f85c 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -1325,7 +1325,7 @@ class _Partition: def __init__( self, - data: Union[anndata.AnnData, np.ndarray], + data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, _InputDataBase], parts: Union[str, np.ndarray, list], gene_names: Union[np.ndarray, list] = None, sample_description: pd.DataFrame = None @@ -1340,7 +1340,14 @@ def __init__( :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these :param sample_description: optional pandas.DataFrame containing sample annotations """ - self.x = data + if isinstance(data, _InputDataBase): + self.x = data.x + elif isinstance(data, anndata.AnnData) or isinstance(data, Raw): + self.x = data.X + elif isinstance(data, np.ndarray): + self.x = data + else: + raise ValueError("data type %s not recognized" % type(data)) self.gene_names = parse_gene_names(data, gene_names) self.sample_description = parse_sample_description(data, sample_description) self.partition = parse_grouping(data, sample_description, parts) diff --git a/diffxpy/unit_test/test_fit.py b/diffxpy/unit_test/test_fit.py index a2d65b4..80b177f 100644 --- a/diffxpy/unit_test/test_fit.py +++ b/diffxpy/unit_test/test_fit.py @@ -51,6 +51,52 @@ def _test_model_fit( ) return True + def _test_model_fit_partition( + self, + n_cells: int, + n_genes: int, + noise_model: str + ): + """ + Test if de.wald() generates a uniform p-value distribution + if it is given data simulated based on the null model. Returns the p-value + of the two-side Kolmgorov-Smirnov test for equality of the observed + p-value distribution and a uniform distribution. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + :param noise_model: Noise model to use for data fitting. + """ + if noise_model == "nb": + from batchglm.api.models.glm_nb import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif noise_model == "norm": + from batchglm.api.models.glm_norm import Simulator + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + else: + raise ValueError("noise model %s not recognized" % noise_model) + + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=0) + sim.generate_params(rand_fn_scale=rand_fn_scale) + sim.generate_data() + + random_sample_description = pd.DataFrame({ + "condition": np.random.randint(2, size=sim.nobs), + "batch": np.random.randint(2, size=sim.nobs) + }) + + partition = de.fit.partition( + data=sim.input_data, + sample_description=random_sample_description, + parts="condition" + ) + estim = partition.model( + formula_loc="~ 1 + batch", + noise_model=noise_model + ) + return True + def _test_residuals_fit( self, n_cells: int, @@ -103,7 +149,7 @@ def test_model_fit( n_genes: int = 2 ): """ - Test if wald() generates a uniform p-value distribution for "nb" noise model. + Test if model for "nb" noise model works. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). @@ -119,14 +165,35 @@ def test_model_fit( noise_model="nb" ) + def test_model_fit_partition( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if partitioned model for "nb" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit_partition( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb" + ) + def test_residuals_fit( self, n_cells: int = 2000, n_genes: int = 2 ): """ - Test if wald() generates a uniform p-value distribution for "nb" noise model - for multiple coefficients to test. + Test if residual fit for "nb" noise model works. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). @@ -154,7 +221,7 @@ def test_model_fit( n_genes: int = 2 ): """ - Test if wald() generates a uniform p-value distribution for "norm" noise model. + Test if model fit for "norm" noise model works. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). @@ -170,14 +237,35 @@ def test_model_fit( noise_model="norm" ) + def test_model_fit_partition( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if partitioned model fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit_partition( + n_cells=n_cells, + n_genes=n_genes, + noise_model="norm" + ) + def test_residuals_fit( self, n_cells: int = 2000, n_genes: int = 2 ): """ - Test if wald() generates a uniform p-value distribution for "norm" noise model - for multiple coefficients to test. + Test if residual fit for "norm" noise model works. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). From 02218cafd7e0d04b9c724c9405e25ec29cd14189 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 16:18:52 +0200 Subject: [PATCH 13/15] fixed plotting calls --- diffxpy/testing/det.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index d446713..94c6f99 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -305,6 +305,7 @@ def plot_volcano( plt.show() plt.close(fig) + plt.ion() if return_axs: return ax @@ -853,12 +854,19 @@ def summary( def plot_vs_ttest( self, log10=False, + show: bool = True, + save: Union[str, None] = None, + suffix: str = "_plot_vs_ttest.png", return_axs: bool = False ): """ Normalizes data by size factors if any were used in model. :param log10: + :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot. + :param save: Path+file name stem to save plots to. + File will be save+suffix. Does not save if save is None. + :param suffix: Suffix for file name to save plot to. Also use this to set the file type. :param return_axs: Whether to return axis objects. :return: @@ -867,12 +875,17 @@ def plot_vs_ttest( import seaborn as sns from .tests import t_test + plt.ioff() + grouping = np.asarray(self.model_estim.input_data.design_loc[:, self.coef_loc_totest]) # Normalize by size factors that were used in regression. - sf = np.broadcast_to(np.expand_dims(self.model_estim.input_data.size_factors, axis=1), - shape=self.model_estim.x.shape) + if self.model_estim.input_data.size_factors is not None: + sf = np.broadcast_to(np.expand_dims(self.model_estim.input_data.size_factors, axis=1), + shape=self.model_estim.x.shape) + else: + sf = np.ones(shape=(self.model_estim.x.shape[0], 1)) ttest = t_test( - data=self.model_estim.X.multiply(1 / sf, copy=True), + data=self.model_estim.x / sf, grouping=grouping, gene_names=self.gene_ids, ) @@ -889,6 +902,16 @@ def plot_vs_ttest( ax.set(xlabel="t-test", ylabel='wald test') + # Save, show and return figure. + if save is not None: + plt.savefig(save + suffix) + + if show: + plt.show() + + plt.close(fig) + plt.ion() + if return_axs: return ax else: @@ -930,6 +953,8 @@ def plot_comparison_ols_coef( from matplotlib import rcParams from batchglm.api.models.glm_norm import Estimator, InputDataGLM + plt.ioff() + # Run OLS model fit to have comparison coefficients. if self._store_ols is None: input_data_ols = InputDataGLM( @@ -1067,6 +1092,8 @@ def plot_comparison_ols_pred( from matplotlib import rcParams from batchglm.api.models.glm_norm import Estimator, InputDataGLM + plt.ioff() + # Run OLS model fit to have comparison coefficients. if self._store_ols is None: input_data_ols = InputDataGLM( From c2615a3be2982349143611300427ed2958ca9664 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 17:14:26 +0200 Subject: [PATCH 14/15] removed logging during fitting --- diffxpy/testing/tests.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index bd7f85c..0a87d2f 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -127,6 +127,7 @@ def _fit( } if isinstance(training_strategy, str) and training_strategy.lower() == 'bfgs': + assert False, "depreceated" lib_size = np.zeros(data.shape[0]) if noise_model == "nb" or noise_model == "negative_binomial": estim = Estim_BFGS(X=data, design_loc=design_loc, design_scale=design_scale, @@ -143,8 +144,6 @@ def _fit( else: raise ValueError('base.test(): `noise_model="%s"` not recognized.' % noise_model) - logging.getLogger("diffxpy").info("Fitting model...") - logging.getLogger("diffxpy").debug(" * Assembling input data...") input_data = InputDataGLM( data=data, design_loc=design_loc, @@ -155,7 +154,6 @@ def _fit( feature_names=gene_names, ) - logging.getLogger("diffxpy").debug(" * Set up Estimator...") constructor_args = {} if batch_size is not None: constructor_args["batch_size"] = batch_size @@ -173,12 +171,9 @@ def _fit( dtype=dtype, **constructor_args ) - - logging.getLogger("diffxpy").debug(" * Initializing Estimator...") estim.initialize() - logging.getLogger("diffxpy").debug(" * Run estimation...") - # training: + # Training: if callable(training_strategy): # call training_strategy if it is a function training_strategy(estim) @@ -186,9 +181,7 @@ def _fit( estim.train_sequence(training_strategy=training_strategy) if close_session: - logging.getLogger("diffxpy").debug(" * Finalize estimation...") estim.finalize() - logging.getLogger("diffxpy").debug(" * Model fitting done.") return estim From 3eb7dea6d66f0593c0337e2dad4d76c132168486 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 21 Aug 2019 17:17:29 +0200 Subject: [PATCH 15/15] incremented batchglm requirement to 0.6.3 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6ba511f..e8acae9 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ 'scipy>=1.2.1', 'pandas', 'patsy>=0.5.0', - 'batchglm>=0.6.1', + 'batchglm>=0.6.3', 'statsmodels', ], extras_require={