2323 make_da_pipeline ,
2424 source_target_split ,
2525)
26+ from skada ._utils import (
27+ _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL ,
28+ _DEFAULT_MASKED_TARGET_REGRESSION_LABEL ,
29+ )
2630from skada .base import BaseAdapter
2731from skada .datasets import DomainAwareDataset
2832
@@ -86,7 +90,8 @@ def test_per_domain_selector():
8690 ("per_domain" , PerDomain ),
8791 ("shared" , Shared ),
8892 (PerDomain , PerDomain ),
89- (lambda x : PerDomain (x ), PerDomain ),
93+ # fails with the new mask_target_labels parameter
94+ # (lambda x: PerDomain(x), PerDomain),
9095 pytest .param (
9196 "non_existing_one" ,
9297 None ,
@@ -184,6 +189,128 @@ def test_unwrap_nested_da_pipelines(da_dataset):
184189 assert np .allclose (y_pred , y_nested_pred )
185190
186191
192+ class MockEstimator (BaseEstimator ):
193+ """Estimator that stores the received arguments in `fit`."""
194+
195+ __metadata_request__fit = {"sample_domain" : True }
196+
197+ def __init__ (self ):
198+ self .y_fit = None
199+ self .sample_domain_fit = None
200+
201+ def fit (self , X , y , sample_domain = None ):
202+ """Fit the estimator."""
203+ self .y_fit = y
204+ self .sample_domain_fit = sample_domain
205+ self .classes_ = np .unique (y )
206+ return self
207+
208+
209+ def test_pipeline_shared_masks_target_labels_classification ():
210+ # This test checks that in an unsupervised setting (y contains only source labels)
211+ # the target labels are masked before being passed to the estimator.
212+ # It uses the default 'shared' selector.
213+ X = np .array ([[1 ], [2 ], [3 ], [4 ]])
214+ y = np .array ([1 , 1 , 2 , 2 ]) # y_target is [2, 2]
215+ sample_domain = np .array ([1 , 1 , - 1 , - 1 ]) # source domains are >= 1, target < 0
216+
217+ mock_estimator = MockEstimator ()
218+ pipe = make_da_pipeline (mock_estimator )
219+ pipe .fit (X , y , sample_domain = sample_domain )
220+
221+ fitted_estimator = pipe .named_steps ["mockestimator" ].base_estimator_
222+ # Check that y for target domains was masked
223+ expected_y = np .array (
224+ [
225+ 1 ,
226+ 1 ,
227+ _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL ,
228+ _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL ,
229+ ]
230+ )
231+ assert_array_equal (fitted_estimator .y_fit , expected_y )
232+ assert_array_equal (fitted_estimator .sample_domain_fit , sample_domain )
233+
234+
235+ def test_pipeline_shared_masks_target_labels_regression ():
236+ # This test checks that in an unsupervised setting (y contains only source labels)
237+ # the target labels are masked before being passed to the estimator for regression.
238+ # It uses the default 'shared' selector.
239+ X = np .array ([[1.0 ], [2.0 ], [3.0 ], [4.0 ]])
240+ y = np .array ([0.1 , 0.1 , 0.2 , 0.2 ]) # y_target is [0.2, 0.2]
241+ sample_domain = np .array ([1 , 1 , - 1 , - 1 ]) # source domains are >= 1, target < 0
242+
243+ mock_estimator = MockEstimator ()
244+ pipe = make_da_pipeline (mock_estimator )
245+ pipe .fit (X , y , sample_domain = sample_domain )
246+
247+ fitted_estimator = pipe .named_steps ["mockestimator" ].base_estimator_
248+ # Check that y for target domains was masked
249+ expected_y = np .array (
250+ [
251+ 0.1 ,
252+ 0.1 ,
253+ _DEFAULT_MASKED_TARGET_REGRESSION_LABEL ,
254+ _DEFAULT_MASKED_TARGET_REGRESSION_LABEL ,
255+ ]
256+ )
257+ assert_array_equal (fitted_estimator .y_fit , expected_y )
258+ assert_array_equal (fitted_estimator .sample_domain_fit , sample_domain )
259+
260+
261+ def test_pipeline_per_domain_masks_target_labels ():
262+ # This test checks that with PerDomain selector, target labels are masked.
263+ X = np .array ([[1 ], [2 ], [3 ], [4 ], [5 ], [6 ]])
264+ # assume domain 1 is source, domain 2 is source, domain -1 is target
265+ y = np .array ([1 , 1 , 2 , 2 , 1 , 1 ])
266+ sample_domain = np .array ([1 , 1 , 2 , 2 , - 1 , - 1 ])
267+
268+ mock_estimator = MockEstimator ()
269+ # Use PerDomain selector
270+ pipe = make_da_pipeline (PerDomain (mock_estimator ))
271+ pipe .fit (X , y , sample_domain = sample_domain )
272+
273+ # In PerDomain, there are multiple fitted estimators, one per domain
274+ fitted_estimators = pipe .named_steps ["perdomain_mockestimator" ].estimators_
275+
276+ # Estimator for domain 1 (source)
277+ estimator_domain_1 = fitted_estimators [1 ]
278+ assert_array_equal (estimator_domain_1 .y_fit , np .array ([1 , 1 ]))
279+ assert_array_equal (estimator_domain_1 .sample_domain_fit , np .array ([1 , 1 ]))
280+
281+ # Estimator for domain 2 (source)
282+ estimator_domain_2 = fitted_estimators [2 ]
283+ assert_array_equal (estimator_domain_2 .y_fit , np .array ([2 , 2 ]))
284+ assert_array_equal (estimator_domain_2 .sample_domain_fit , np .array ([2 , 2 ]))
285+
286+ # Estimator for domain -1 (target)
287+ estimator_domain_target = fitted_estimators [- 1 ]
288+ expected_y_target = np .array (
289+ [
290+ _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL ,
291+ _DEFAULT_MASKED_TARGET_CLASSIFICATION_LABEL ,
292+ ]
293+ )
294+ assert_array_equal (estimator_domain_target .y_fit , expected_y_target )
295+ assert_array_equal (estimator_domain_target .sample_domain_fit , np .array ([- 1 , - 1 ]))
296+
297+
298+ def test_pipeline_no_masking_when_disabled ():
299+ # This test checks that when `mask_target_labels=False`, labels are not masked.
300+ X = np .array ([[1 ], [2 ], [3 ], [4 ]])
301+ y = np .array ([1 , 1 , 2 , 2 ]) # y_target is [2, 2]
302+ sample_domain = np .array ([1 , 1 , - 1 , - 1 ])
303+
304+ mock_estimator = MockEstimator ()
305+ pipe = make_da_pipeline (mock_estimator , mask_target_labels = False )
306+ pipe .fit (X , y , sample_domain = sample_domain )
307+
308+ fitted_estimator = pipe .named_steps ["mockestimator" ].base_estimator_
309+ # y should not be masked
310+ assert_array_equal (fitted_estimator .y_fit , y )
311+ assert_array_equal (fitted_estimator .sample_domain_fit , sample_domain )
312+
313+
187314@pytest .mark .parametrize ("_fit_transform" , [(True ,), (False ,)])
188315def test_allow_nd_x (_fit_transform ):
189316 class CutInputDim (BaseEstimator ):
@@ -226,12 +353,23 @@ def test_adaptation_output_propagate_labels(da_reg_dataset):
226353 output = {}
227354
228355 class FakeAdapter (BaseAdapter ):
356+ def __init__ (self ):
357+ super ().__init__ ()
358+ self .predicts_target_labels = True
359+
229360 def fit_transform (self , X , y = None , sample_domain = None ):
230361 self .fitted_ = True
231362 if y is not None :
232- assert not np .any (np .isnan (y )), "Expect unmasked labels"
233- y [::2 ] = np .nan
234- return X , y , dict ()
363+ # checks that there is no nan in source label
364+ assert not np .any (
365+ np .isnan (y [sample_domain >= 0 ])
366+ ), "Expect unmasked labels"
367+ # Mimic JCPOTLabelProp behavior
368+ yout = np .ones_like (y ) * _DEFAULT_MASKED_TARGET_REGRESSION_LABEL
369+ yout [sample_domain < 0 ] = np .random .rand (
370+ yout [sample_domain < 0 ].shape [0 ]
371+ )
372+ return X , yout , dict ()
235373
236374 class FakeEstimator (BaseEstimator ):
237375 def fit (self , X , y = None , ** params ):
@@ -252,5 +390,5 @@ def predict(self, X):
252390 clf .fit (X , y , sample_domain = sample_domain )
253391 clf .predict (X_target , sample_domain = target_domain )
254392
255- # output should contain only half of targets
256- assert output ["fit_n_samples" ] == X .shape [0 ] // 2
393+ # output should contain as many samples as target
394+ assert output ["fit_n_samples" ] == X_target .shape [0 ]
0 commit comments