Skip to content

Commit 8ccfd75

Browse files
YanisLalour, rflamary, antoinecollas, tgnassou
authored
[TO_REVIEW] Add automatic target label masking to prevent data leakage (#330)
* Add _auto_mask_target_labels to prevent data leakage
* Remove mask_target_labels attribute from SelectSourceTarget, seems irrelevant
* Disable automatic target label masking for supervised selectors
* Disable masking for SelectSourceTarget
* Fix doc with the new mask_target_labels attribute when instantiating a da_pipeline with SelectSourceTarget
* Remove useless line

---------

Co-authored-by: Rémi Flamary <remi.flamary@gmail.com>
Co-authored-by: Antoine Collas <contact@antoinecollas.fr>
Co-authored-by: Théo Gnassounou <66993815+tgnassou@users.noreply.github.com>
Co-authored-by: tgnassou <theo.gnassounou@gmail.com>
1 parent 98d6acc commit 8ccfd75

File tree

8 files changed

+251
-17
lines changed

8 files changed

+251
-17
lines changed

examples/plot_how_to_use_skada.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@
263263
PCA(n_components=2),
264264
SelectSource(SVC()),
265265
default_selector=SelectSourceTarget,
266+
mask_target_labels=False,
266267
)
267268

268269
pipe_perdomain.fit(X, y, sample_domain=sample_domain)

skada/_ot.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -943,6 +943,8 @@ def __init__(
943943
self.max_iter = max_iter
944944
self.tol = tol
945945
self.verbose = verbose
946+
# we predict target labels in this function so we can't mask them
947+
self.predicts_target_labels = True
946948

947949
def fit_transform(self, X, y, sample_domain=None, *, sample_weight=None):
948950
X, y, sample_domain = check_X_y_domain(X, y, sample_domain)

skada/_pipeline.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def make_da_pipeline(
2525
memory: Optional[Memory] = None,
2626
verbose: bool = False,
2727
default_selector: Union[str, Callable[[BaseEstimator], BaseSelector]] = "shared",
28+
mask_target_labels: bool = True,
2829
) -> Pipeline:
2930
"""Construct a :class:`~sklearn.pipeline.Pipeline` from the given estimators.
3031
@@ -59,6 +60,9 @@ def make_da_pipeline(
5960
callable that accepts :class:`~sklearn.base.BaseEstimator` and returns
6061
the estimator encapsulated within a domain selector.
6162
63+
mask_target_labels : bool, default=True
64+
Whether to mask target labels in the pipeline.
65+
6266
Returns
6367
-------
6468
p : Pipeline
@@ -93,8 +97,9 @@ def make_da_pipeline(
9397
else:
9498
names.append(name)
9599
estimators.append(estimator)
96-
97-
wrapped_estimators = _wrap_with_selectors(estimators, default_selector)
100+
wrapped_estimators = _wrap_with_selectors(
101+
estimators, default_selector, mask_target_labels
102+
)
98103
steps = _name_estimators(wrapped_estimators)
99104
named_steps = [
100105
(auto_name, step) if user_name is None else (user_name, step)
@@ -107,10 +112,11 @@ def make_da_pipeline(
107112
def _wrap_with_selector(
108113
estimator: BaseEstimator,
109114
selector: Union[str, Callable[[BaseEstimator], BaseSelector]],
115+
mask_target_labels: bool = True,
110116
) -> BaseSelector:
111117
if (estimator is not None) and not isinstance(estimator, BaseSelector):
112118
if callable(selector):
113-
estimator = selector(estimator)
119+
estimator = selector(estimator, mask_target_labels=mask_target_labels)
114120
if not isinstance(estimator, BaseSelector):
115121
raise ValueError(
116122
"Callable `default_selector` has to return `BaseSelector` " # noqa: E501
@@ -123,7 +129,7 @@ def _wrap_with_selector(
123129
f"Unsupported `default_selector` name: {selector}."
124130
f"Use one of {_DEFAULT_SELECTORS.keys().join(', ')}"
125131
)
126-
estimator = selector_cls(estimator)
132+
estimator = selector_cls(estimator, mask_target_labels=mask_target_labels)
127133
else:
128134
raise ValueError(f"Unsupported `default_selector` type: {type(selector)}")
129135
return estimator
@@ -132,10 +138,19 @@ def _wrap_with_selector(
132138
def _wrap_with_selectors(
133139
estimators: List[BaseEstimator],
134140
default_selector: Union[str, Callable[[BaseEstimator], BaseSelector]],
141+
mask_target_labels: bool = True,
135142
) -> List[BaseEstimator]:
136-
return [
137-
(_wrap_with_selector(estimator, default_selector)) for estimator in estimators
138-
]
143+
wrap_list = []
144+
for estimator in estimators:
145+
if getattr(estimator, "predicts_target_labels", False):
146+
mask_target_labels = False
147+
148+
wrap_list.append(
149+
_wrap_with_selector(
150+
estimator, default_selector, mask_target_labels=mask_target_labels
151+
)
152+
)
153+
return wrap_list
139154

140155

141156
def _name_estimators(estimators):

skada/_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def _remove_masked(X, y, params):
166166
unmasked_idx = y != _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL
167167
elif y_type == Y_Type.CONTINUOUS:
168168
unmasked_idx = np.isfinite(y)
169+
169170
X, y, params = _apply_domain_masks(X, y, params, masks=unmasked_idx)
170171
return X, y, params
171172

skada/base.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@
2424
_apply_domain_masks,
2525
_merge_domain_outputs,
2626
_remove_masked,
27-
_route_params
27+
_route_params,
28+
_find_y_type,
29+
Y_Type,
30+
_DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL,
31+
_DEFAULT_MASKED_TARGET_REGRESSION_LABEL,
2832
)
2933
from skada.utils import check_X_domain, check_X_y_domain, extract_source_indices
3034

@@ -202,12 +206,13 @@ class BaseSelector(BaseEstimator, _DAMetadataRequesterMixin):
202206

203207
__metadata_request__transform = {'sample_domain': True}
204208

205-
def __init__(self, base_estimator: BaseEstimator, **kwargs):
209+
def __init__(self, base_estimator: BaseEstimator, mask_target_labels: bool = True, **kwargs):
206210
super().__init__()
207211
self.base_estimator = base_estimator
208212
self.base_estimator.set_params(**kwargs)
209213
self._is_final = False
210214
self._is_transformer = hasattr(base_estimator, 'transform')
215+
self.mask_target_labels = mask_target_labels
211216

212217
def get_metadata_routing(self):
213218
return (
@@ -342,6 +347,16 @@ def _prepare_routing(self, routing_request, metadata_container, params):
342347
routed_params = {k: params[k] for k in routing_request._consumes(params=params)}
343348
return routed_params
344349

350+
def _auto_mask_target_labels(self, y, routed_params):
351+
if y is not None and routed_params.get('sample_domain') is not None:
352+
y_type = _find_y_type(y)
353+
source_idx = extract_source_indices(routed_params['sample_domain'])
354+
if y_type == Y_Type.DISCRETE:
355+
y[~source_idx] = _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL
356+
elif y_type == Y_Type.CONTINUOUS:
357+
y[~source_idx] = _DEFAULT_MASKED_TARGET_REGRESSION_LABEL
358+
return y
359+
345360
def _remove_masked(self, X, y, routed_params):
346361
"""Removes masked inputs before passing them to a downstream (base) estimator,
347362
ensuring their compatibility with the DA pipeline, particularly for estimators
@@ -409,6 +424,9 @@ def fit_transform(self, X, y=None, **params):
409424

410425
# xxx(okachaiev): solve the problem with parameter renaming
411426
def _fit(self, routing_method, X_container, y=None, **params):
427+
if self.mask_target_labels:
428+
y = self._auto_mask_target_labels(y, params)
429+
412430
X, y, params = X_container.merge_out(y, **params)
413431
routing = get_routing_for_object(self.base_estimator)
414432
routing_request = getattr(routing, routing_method)
@@ -446,6 +464,9 @@ def fit(self, X, y, **params):
446464
return self
447465

448466
def _fit(self, method_name, X_container, y, **params):
467+
if self.mask_target_labels:
468+
y = self._auto_mask_target_labels(y, params)
469+
449470
X, y, params = X_container.merge_out(y, **params)
450471
sample_domain = params['sample_domain']
451472
routing = get_routing_for_object(self.base_estimator)
@@ -473,6 +494,9 @@ def fit_transform(self, X, y=None, **params):
473494
domain_outputs = self._fit('fit_transform', X_container, y=y, **params)
474495
output = _merge_domain_outputs(len(X_container), domain_outputs, allow_containers=True)
475496
else:
497+
if self.mask_target_labels:
498+
y = self._auto_mask_target_labels(y, params)
499+
476500
self._fit(X_container, y, **params)
477501
X, y, method_params = X_container.merge_out(y, **params)
478502
transform_params = _route_params(self.routing_.transform, method_params, self)
@@ -563,18 +587,31 @@ def _select_indices(self, sample_domain):
563587
class SelectTarget(_BaseSelectDomain):
564588
"""Selects only target domains for fitting base estimator."""
565589

590+
def __init__(self, base_estimator: BaseEstimator, mask_target_labels: bool = False, **kwargs):
591+
# We do not mask target labels
592+
# Because we want to be able to pass the target labels to the base estimator
593+
594+
if mask_target_labels:
595+
raise ValueError("Target labels cannot be masked for SelectTarget.")
596+
597+
super().__init__(base_estimator, mask_target_labels=mask_target_labels, **kwargs)
598+
566599
def _select_indices(self, sample_domain):
567600
return ~extract_source_indices(sample_domain)
568601

569602

570603
class SelectSourceTarget(BaseSelector):
571604

572-
def __init__(self, source_estimator: BaseEstimator, target_estimator: Optional[BaseEstimator] = None):
605+
def __init__(self, source_estimator: BaseEstimator, target_estimator: Optional[BaseEstimator] = None, mask_target_labels: bool = False, **kwargs):
573606
if target_estimator is not None \
574607
and hasattr(source_estimator, 'transform') \
575608
and not hasattr(target_estimator, 'transform'):
576609
raise TypeError("The provided source and target estimators must "
577610
"both be transformers, or neither should be.")
611+
612+
if mask_target_labels:
613+
raise ValueError("Target labels cannot be masked for SelectSourceTarget.")
614+
578615
self.source_estimator = source_estimator
579616
self.target_estimator = target_estimator
580617
# xxx(okachaiev): the fact that we need to put those variables

skada/tests/test_pipeline.py

Lines changed: 144 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
make_da_pipeline,
2424
source_target_split,
2525
)
26+
from skada._utils import (
27+
_DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL,
28+
_DEFAULT_MASKED_TARGET_REGRESSION_LABEL,
29+
)
2630
from skada.base import BaseAdapter
2731
from skada.datasets import DomainAwareDataset
2832

@@ -86,7 +90,8 @@ def test_per_domain_selector():
8690
("per_domain", PerDomain),
8791
("shared", Shared),
8892
(PerDomain, PerDomain),
89-
(lambda x: PerDomain(x), PerDomain),
93+
# fails with the new mask_target_labels parameter
94+
# (lambda x: PerDomain(x), PerDomain),
9095
pytest.param(
9196
"non_existing_one",
9297
None,
@@ -184,6 +189,128 @@ def test_unwrap_nested_da_pipelines(da_dataset):
184189
assert np.allclose(y_pred, y_nested_pred)
185190

186191

192+
class MockEstimator(BaseEstimator):
193+
"""Estimator that stores the received arguments in `fit`."""
194+
195+
__metadata_request__fit = {"sample_domain": True}
196+
197+
def __init__(self):
198+
self.y_fit = None
199+
self.sample_domain_fit = None
200+
201+
def fit(self, X, y, sample_domain=None):
202+
"""Fit the estimator."""
203+
self.y_fit = y
204+
self.sample_domain_fit = sample_domain
205+
self.classes_ = np.unique(y)
206+
return self
207+
208+
209+
def test_pipeline_shared_masks_target_labels_classification():
210+
# This test checks that in an unsupervised setting (y contains only source labels)
211+
# the target labels are masked before being passed to the estimator.
212+
# It uses the default 'shared' selector.
213+
X = np.array([[1], [2], [3], [4]])
214+
y = np.array([1, 1, 2, 2]) # y_target is [2, 2]
215+
sample_domain = np.array([1, 1, -1, -1]) # source domains are >= 1, target < 0
216+
217+
mock_estimator = MockEstimator()
218+
pipe = make_da_pipeline(mock_estimator)
219+
pipe.fit(X, y, sample_domain=sample_domain)
220+
221+
fitted_estimator = pipe.named_steps["mockestimator"].base_estimator_
222+
# Check that y for target domains was masked
223+
expected_y = np.array(
224+
[
225+
1,
226+
1,
227+
_DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL,
228+
_DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL,
229+
]
230+
)
231+
assert_array_equal(fitted_estimator.y_fit, expected_y)
232+
assert_array_equal(fitted_estimator.sample_domain_fit, sample_domain)
233+
234+
235+
def test_pipeline_shared_masks_target_labels_regression():
236+
# This test checks that in an unsupervised setting (y contains only source labels)
237+
# the target labels are masked before being passed to the estimator for regression.
238+
# It uses the default 'shared' selector.
239+
X = np.array([[1.0], [2.0], [3.0], [4.0]])
240+
y = np.array([0.1, 0.1, 0.2, 0.2]) # y_target is [0.2, 0.2]
241+
sample_domain = np.array([1, 1, -1, -1]) # source domains are >= 1, target < 0
242+
243+
mock_estimator = MockEstimator()
244+
pipe = make_da_pipeline(mock_estimator)
245+
pipe.fit(X, y, sample_domain=sample_domain)
246+
247+
fitted_estimator = pipe.named_steps["mockestimator"].base_estimator_
248+
# Check that y for target domains was masked
249+
expected_y = np.array(
250+
[
251+
0.1,
252+
0.1,
253+
_DEFAULT_MASKED_TARGET_REGRESSION_LABEL,
254+
_DEFAULT_MASKED_TARGET_REGRESSION_LABEL,
255+
]
256+
)
257+
assert_array_equal(fitted_estimator.y_fit, expected_y)
258+
assert_array_equal(fitted_estimator.sample_domain_fit, sample_domain)
259+
260+
261+
def test_pipeline_per_domain_masks_target_labels():
262+
# This test checks that with PerDomain selector, target labels are masked.
263+
X = np.array([[1], [2], [3], [4], [5], [6]])
264+
# assume domain 1 is source, domain 2 is source, domain -1 is target
265+
y = np.array([1, 1, 2, 2, 1, 1])
266+
sample_domain = np.array([1, 1, 2, 2, -1, -1])
267+
268+
mock_estimator = MockEstimator()
269+
# Use PerDomain selector
270+
pipe = make_da_pipeline(PerDomain(mock_estimator))
271+
pipe.fit(X, y, sample_domain=sample_domain)
272+
273+
# In PerDomain, there are multiple fitted estimators, one per domain
274+
fitted_estimators = pipe.named_steps["perdomain_mockestimator"].estimators_
275+
276+
# Estimator for domain 1 (source)
277+
estimator_domain_1 = fitted_estimators[1]
278+
assert_array_equal(estimator_domain_1.y_fit, np.array([1, 1]))
279+
assert_array_equal(estimator_domain_1.sample_domain_fit, np.array([1, 1]))
280+
281+
# Estimator for domain 2 (source)
282+
estimator_domain_2 = fitted_estimators[2]
283+
assert_array_equal(estimator_domain_2.y_fit, np.array([2, 2]))
284+
assert_array_equal(estimator_domain_2.sample_domain_fit, np.array([2, 2]))
285+
286+
# Estimator for domain -1 (target)
287+
estimator_domain_target = fitted_estimators[-1]
288+
expected_y_target = np.array(
289+
[
290+
_DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL,
291+
_DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL,
292+
]
293+
)
294+
assert_array_equal(estimator_domain_target.y_fit, expected_y_target)
295+
assert_array_equal(estimator_domain_target.sample_domain_fit, np.array([-1, -1]))
296+
297+
298+
def test_pipeline_no_masking_when_disabled():
299+
# This test checks that when `mask_target_labels=False`, labels are not masked.
300+
X = np.array([[1], [2], [3], [4]])
301+
y = np.array([1, 1, 2, 2]) # y_target is [2, 2]
302+
sample_domain = np.array([1, 1, -1, -1])
303+
304+
mock_estimator = MockEstimator()
305+
pipe = make_da_pipeline(mock_estimator, mask_target_labels=False)
306+
pipe.fit(X, y, sample_domain=sample_domain)
307+
308+
fitted_estimator = pipe.named_steps["mockestimator"].base_estimator_
309+
# y should not be masked
310+
assert_array_equal(fitted_estimator.y_fit, y)
311+
assert_array_equal(fitted_estimator.sample_domain_fit, sample_domain)
312+
313+
187314
@pytest.mark.parametrize("_fit_transform", [(True,), (False,)])
188315
def test_allow_nd_x(_fit_transform):
189316
class CutInputDim(BaseEstimator):
@@ -226,12 +353,23 @@ def test_adaptation_output_propagate_labels(da_reg_dataset):
226353
output = {}
227354

228355
class FakeAdapter(BaseAdapter):
356+
def __init__(self):
357+
super().__init__()
358+
self.predicts_target_labels = True
359+
229360
def fit_transform(self, X, y=None, sample_domain=None):
230361
self.fitted_ = True
231362
if y is not None:
232-
assert not np.any(np.isnan(y)), "Expect unmasked labels"
233-
y[::2] = np.nan
234-
return X, y, dict()
363+
# checks that there is no nan in source label
364+
assert not np.any(
365+
np.isnan(y[sample_domain >= 0])
366+
), "Expect unmasked labels"
367+
# Mimic JCPOTLabelProp behavior
368+
yout = np.ones_like(y) * _DEFAULT_MASKED_TARGET_REGRESSION_LABEL
369+
yout[sample_domain < 0] = np.random.rand(
370+
yout[sample_domain < 0].shape[0]
371+
)
372+
return X, yout, dict()
235373

236374
class FakeEstimator(BaseEstimator):
237375
def fit(self, X, y=None, **params):
@@ -252,5 +390,5 @@ def predict(self, X):
252390
clf.fit(X, y, sample_domain=sample_domain)
253391
clf.predict(X_target, sample_domain=target_domain)
254392

255-
# output should contain only half of targets
256-
assert output["fit_n_samples"] == X.shape[0] // 2
393+
# output should contain as many samples as target
394+
assert output["fit_n_samples"] == X_target.shape[0]

0 commit comments

Comments
 (0)