Skip to content

Commit 7b096f6

Browse files
committed
add dropout for simplelstm; add scalers from sklearn for torchhydro
1 parent f6651f4 commit 7b096f6

File tree

6 files changed

+293
-119
lines changed

6 files changed

+293
-119
lines changed

tests/test_data_scalers.py

Lines changed: 82 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
import tempfile
21
import pandas as pd
32
import pytest
43
import numpy as np
4+
from sklearn.discriminant_analysis import StandardScaler
55
import xarray as xr
6-
import json
76
import os
8-
from torchhydro.datasets.data_scalers import DapengScaler
7+
import pickle as pkl
8+
9+
from torchhydro.datasets.data_scalers import DapengScaler, ScalerHub
910
from hydrodatasource.reader.data_source import SelfMadeHydroDataset
1011

1112

@@ -128,3 +129,81 @@ def test_dapeng_scaler_load_data_and_denorm(sample_data):
128129
denorm_y.coords[coord].values,
129130
err_msg=f"{coord} is inconsistent",
130131
)
132+
133+
134+
def test_sklearn_scale_train_mode(sample_data):
135+
target_vars, relevant_vars, constant_vars, data_cfgs = sample_data
136+
scaler_hub = ScalerHub(
137+
target_vars=target_vars,
138+
relevant_vars=relevant_vars,
139+
constant_vars=constant_vars,
140+
data_cfgs=data_cfgs,
141+
is_tra_val_te="train",
142+
)
143+
norm_key = "target_vars"
144+
scaler = StandardScaler()
145+
data_tmp = target_vars.to_numpy().reshape(-1, target_vars.shape[-1])
146+
147+
# Call the _sklearn_scale method
148+
scaler, data_norm = scaler_hub._sklearn_scale(
149+
data_cfgs, "train", norm_key, scaler, data_tmp
150+
)
151+
152+
# Check if the scaler is fitted and data is normalized
153+
assert hasattr(scaler, "mean_"), "Scaler is not fitted"
154+
assert data_norm.shape == data_tmp.shape, "Normalized data shape mismatch"
155+
156+
# Check if the scaler file is saved
157+
save_file = os.path.join(data_cfgs["case_dir"], f"{norm_key}_scaler.pkl")
158+
assert os.path.isfile(save_file), "Scaler file was not saved"
159+
160+
161+
def test_sklearn_scale_test_mode_with_existing_scaler(sample_data):
162+
target_vars, relevant_vars, constant_vars, data_cfgs = sample_data
163+
scaler_hub = ScalerHub(
164+
target_vars=target_vars,
165+
relevant_vars=relevant_vars,
166+
constant_vars=constant_vars,
167+
data_cfgs=data_cfgs,
168+
is_tra_val_te="train",
169+
)
170+
norm_key = "target_vars"
171+
scaler = StandardScaler()
172+
data_tmp = target_vars.to_numpy().reshape(-1, target_vars.shape[-1])
173+
174+
# Save a pre-fitted scaler for testing
175+
save_file = os.path.join(data_cfgs["case_dir"], f"{norm_key}_scaler.pkl")
176+
with open(save_file, "wb") as outfile:
177+
pkl.dump(scaler.fit(data_tmp), outfile)
178+
179+
# Call the _sklearn_scale method in test mode
180+
scaler, data_norm = scaler_hub._sklearn_scale(
181+
data_cfgs, "test", norm_key, scaler, data_tmp
182+
)
183+
184+
# Check if the scaler is loaded and data is normalized
185+
assert hasattr(scaler, "mean_"), "Scaler is not loaded correctly"
186+
assert data_norm.shape == data_tmp.shape, "Normalized data shape mismatch"
187+
188+
189+
def test_sklearn_scale_test_mode_without_scaler_file(sample_data):
190+
target_vars, relevant_vars, constant_vars, data_cfgs = sample_data
191+
scaler_hub = ScalerHub(
192+
target_vars=target_vars,
193+
relevant_vars=relevant_vars,
194+
constant_vars=constant_vars,
195+
data_cfgs=data_cfgs,
196+
is_tra_val_te="test",
197+
)
198+
norm_key = "target_vars"
199+
scaler = StandardScaler()
200+
data_tmp = target_vars.to_numpy().reshape(-1, target_vars.shape[-1])
201+
202+
# Ensure no scaler file exists
203+
save_file = os.path.join(data_cfgs["case_dir"], f"{norm_key}_scaler.pkl")
204+
if os.path.isfile(save_file):
205+
os.remove(save_file)
206+
207+
# Expect a FileNotFoundError
208+
with pytest.raises(FileNotFoundError):
209+
scaler_hub._sklearn_scale(data_cfgs, "test", norm_key, scaler, data_tmp)

torchhydro/configs/config.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""
22
Author: Wenyu Ouyang
33
Date: 2021-12-31 11:08:29
4-
LastEditTime: 2025-01-12 10:12:48
4+
LastEditTime: 2025-04-17 10:11:03
55
LastEditors: Wenyu Ouyang
66
Description: Config for hydroDL
7-
FilePath: \torchhydro\torchhydro\configs\config.py
7+
FilePath: /torchhydro/torchhydro/configs/config.py
88
Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved.
99
"""
1010

@@ -207,6 +207,10 @@ def default_config_file():
207207
# NOTE: pbm_norm is True means norm and denorm for differentiable models; if you use pure data-driven models, you should set it as False
208208
"pbm_norm": False,
209209
},
210+
# For scaler from sklearn, we need to specify the stat_dict_file for three different parts:
211+
# target_vars, relevant_vars and constant_vars, and the sequence must be target_vars, relevant_vars, constant_vars
212+
# the separator of the three stat_dict_file paths is ";"
213+
# for example: "stat_dict_file": "target_stat_dict_file;relevant_stat_dict_file;constant_stat_dict_file"
210214
"stat_dict_file": None,
211215
# dataset for pytorch dataset
212216
"dataset": "StreamflowDataset",
@@ -232,13 +236,13 @@ def default_config_file():
232236
# start from 0, each value means the decay rate
233237
# if initial lr is 0.001, then 0: 0.5 means the lr of epoch 0 is 0.001*0.5=0.0005
234238
# "lr_scheduler": {0: 1, 1: 0.5, 2: 0.2},
235-
# 3rd opt config, lr as an initial value (will cover the lr setting in "optim_params")
239+
# 3rd opt config, initial lr needs to be set in "optim_params" or it will use the default one
236240
# lr_factor as an exponential decay factor
237-
# "lr": 0.001, "lr_factor": 0.1,
238-
# 4th opt config, lr as a initial value, it will cover the lr setting in "optim_params"
241+
# "lr_factor": 0.1,
242+
# 4th opt config, initial lr needs to be set in "optim_params" or it will use the default one
239243
# lr_patience represent how many epochs without opt (we watch val_loss) could be tolerated
240244
# if lr_patience is satisfied, then lr will be decayed by lr_factor by a linear way
241-
# "lr": 0.001, "lr_factor": 0.1, "lr_patience": 1,
245+
# "lr_factor": 0.1, "lr_patience": 1,
242246
},
243247
"early_stopping": False,
244248
"patience": 1,
@@ -283,7 +287,7 @@ def default_config_file():
283287
"model_loader": {"load_way": "specified", "test_epoch": 20},
284288
# "model_loader": {"load_way": "best"},
285289
# "model_loader": {"load_way": "latest"},
286-
# "model_loader": {"load_way": "pth", "pth": "path/to/weights"},
290+
# "model_loader": {"load_way": "pth", "pth_path": "path/to/weights"},
287291
"metrics": ["NSE", "RMSE", "R2", "KGE", "FHV", "FLV"],
288292
"fill_nan": "no",
289293
"explainer": None,
@@ -1087,7 +1091,15 @@ def update_cfg(cfg_file, new_args):
10871091
if new_args.start_epoch > 1:
10881092
cfg_file["training_cfgs"]["start_epoch"] = new_args.start_epoch
10891093
if new_args.stat_dict_file is not None:
1090-
cfg_file["data_cfgs"]["stat_dict_file"] = new_args.stat_dict_file
1094+
stat_dict_file = new_args.stat_dict_file
1095+
if len(stat_dict_file.split(";")) > 1:
1096+
target_, relevant_, constant_ = stat_dict_file.split(";")
1097+
stat_dict_file = {
1098+
"target_vars": target_,
1099+
"relevant_vars": relevant_,
1100+
"constant_vars": constant_,
1101+
}
1102+
cfg_file["data_cfgs"]["stat_dict_file"] = stat_dict_file
10911103
if new_args.num_workers is not None and new_args.num_workers > 0:
10921104
cfg_file["training_cfgs"]["num_workers"] = new_args.num_workers
10931105
if new_args.which_first_tensor is not None:

0 commit comments

Comments
 (0)