Skip to content

Commit e5c9bcf

Browse files
committed
refactor evaluation part
1 parent a9f3042 commit e5c9bcf

File tree

4 files changed

+177
-143
lines changed

4 files changed

+177
-143
lines changed

torchhydro/configs/config.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,26 @@ def default_config_file():
332332
# 0 means all testing periods belong to forecast periods without hindcast part
333333
"current_idx": 0,
334334
"calc_metrics": True,
335+
# we provide some different evaluators:
336+
# 1st -- once: for each time each var and each basin, only one result is evaluated
337+
# stride means if rolling is true, after evaluating, we need a stride to skip some periods
338+
# 2nd -- 1pace: we only choose one pace from results to evaluate
339+
# -1 means we choose the final result of each sample which will be used in hindcast-only/forecast-only model inference
340+
# 1 means we choose the first result of each sample which will be used in hindcast-forecast model inference
341+
# 3rd -- rolling: we perform evaluation for each sample of each basin,
342+
# stride means we will perform evaluation for each sample after stride periods
343+
"evaluator": {
344+
"eval_way": "once",
345+
"stride": 0,
346+
},
347+
# "evaluator": {
348+
# "eval_way": "1pace",
349+
# "pace_idx": -1,
350+
# },
351+
# "evaluator": {
352+
# "eval_way": "rolling",
353+
# "stride": 1,
354+
# },
335355
},
336356
}
337357

@@ -418,6 +438,7 @@ def cmd(
418438
min_time_unit=None,
419439
min_time_interval=None,
420440
valid_batch_mode=None,
441+
evaluator=None,
421442
):
422443
"""input args from cmd"""
423444
parser = argparse.ArgumentParser(
@@ -958,6 +979,13 @@ def cmd(
958979
help="The batch organization mode of valid data, train means same as train; test means same as test",
959980
default=valid_batch_mode,
960981
)
982+
parser.add_argument(
983+
"--evaluator",
984+
dest="evaluator",
985+
help="evaluation way",
986+
default=evaluator,
987+
type=json.loads,
988+
)
961989
# To make pytest work in PyCharm, here we use the following code instead of "args = parser.parse_args()":
962990
# https://blog.csdn.net/u014742995/article/details/100119905
963991
args, unknown = parser.parse_known_args()

torchhydro/datasets/data_sets.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,16 @@ def ngrid(self):
219219
"""
220220
return len(self.basins)
221221

222+
@property
223+
def noutputvar(self):
224+
"""How many output variables in the dataset
225+
Returns
226+
-------
227+
int
228+
number of variables
229+
"""
230+
return len(self.data_cfgs["target_cols"])
231+
222232
@property
223233
def nt(self):
224234
"""length of longest time series in all basins
@@ -377,15 +387,17 @@ def _normalize(
377387
self.target_scaler = scaler_hub.target_scaler
378388
return scaler_hub.norm_data
379389

380-
def denormalize(self, norm_data, rolling=0):
390+
def denormalize(self, norm_data, is_real_time=True):
381391
"""Denormalize the norm_data
382392
383393
Parameters
384394
----------
385395
norm_data : np.ndarray
386396
batch-first data
387-
rolling: int
388-
default 0, if rolling is used, perform forecasting using rolling window size
397+
is_real_time : bool, optional
398+
whether the data is real time data, by default True
399+
sometimes we may have multiple results for one time period and we flatten them
400+
so we need a temp time to replace real one
389401
390402
Returns
391403
-------
@@ -398,17 +410,8 @@ def denormalize(self, norm_data, rolling=0):
398410
units = {k: "dimensionless" for k in target_data.attrs["units"].keys()}
399411
if target_scaler.pbm_norm:
400412
units = {**units, **target_data.attrs["units"]}
401-
if rolling > 0:
402-
hindcast_output_window = target_scaler.data_cfgs["hindcast_output_window"]
403-
rho = target_scaler.training_cfgs["hindcast_length"]
404-
# TODO: -1 because seq2seqdataset has one more time, hence we need to cut it, as rolling will be refactored, we will modify it later
405-
selected_time_points = target_data.coords["time"][
406-
rho - hindcast_output_window : -1
407-
]
408-
else:
409-
warmup_length = self.warmup_length
410-
selected_time_points = target_data.coords["time"][warmup_length:]
411-
413+
warmup_length = self.warmup_length
414+
selected_time_points = target_data.coords["time"][warmup_length:]
412415
selected_data = target_data.sel(time=selected_time_points)
413416
denorm_xr_ds = target_scaler.inverse_transform(
414417
xr.DataArray(

torchhydro/trainers/deep_hydro.py

Lines changed: 12 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,13 @@
4343
from torchhydro.trainers.train_logger import TrainLogger
4444
from torchhydro.trainers.train_utils import (
4545
EarlyStopper,
46-
rolling_evaluate,
4746
average_weights,
4847
evaluate_validation,
4948
compute_validation,
5049
model_infer,
5150
read_pth_from_model_loader,
5251
torch_single_train,
52+
get_evaluation,
5353
)
5454

5555

@@ -377,7 +377,6 @@ def inference(self) -> Tuple[xr.Dataset, xr.Dataset]:
377377
"""infer using trained model and unnormalized results"""
378378
data_cfgs = self.cfgs["data_cfgs"]
379379
training_cfgs = self.cfgs["training_cfgs"]
380-
evaluation_cfgs = self.cfgs["evaluation_cfgs"]
381380
device = get_the_device(self.cfgs["training_cfgs"]["device"])
382381
test_dataloader = self._get_dataloader(training_cfgs, data_cfgs, mode="infer")
383382
seq_first = training_cfgs["which_first_tensor"] == "sequence"
@@ -404,33 +403,13 @@ def inference(self) -> Tuple[xr.Dataset, xr.Dataset]:
404403
# params of reshape should be (basin size, time length)
405404
pred = pred.flatten().reshape(test_dataloader.test_data.y.shape[0], -1, 1)
406405
obs = obs.flatten().reshape(test_dataloader.test_data.y.shape[0], -1, 1)
407-
408-
if evaluation_cfgs["rolling"] > 0:
409-
ngrid = self.testdataset.ngrid
410-
nt = self.testdataset.nt
411-
nf = len(data_cfgs["target_cols"])
412-
rolling = evaluation_cfgs["rolling"]
413-
forecast_length = training_cfgs["forecast_length"]
414-
hindcast_output_window = data_cfgs["hindcast_output_window"]
415-
rho = training_cfgs["hindcast_length"]
416-
pred = rolling_evaluate(
417-
(ngrid, nt, nf),
418-
rho,
419-
forecast_length,
420-
rolling,
421-
hindcast_output_window,
422-
pred,
423-
)
424-
obs = rolling_evaluate(
425-
(ngrid, nt, nf),
426-
rho,
427-
forecast_length,
428-
rolling,
429-
hindcast_output_window,
430-
obs,
431-
)
432-
pred_xr = self.testdataset.denormalize(pred, rolling=evaluation_cfgs["rolling"])
433-
obs_xr = self.testdataset.denormalize(obs, rolling=evaluation_cfgs["rolling"])
406+
evaluation_cfgs = self.cfgs["evaluation_cfgs"]
407+
obs_xr, pred_xr = get_evaluation(
408+
test_dataloader,
409+
evaluation_cfgs,
410+
pred,
411+
obs,
412+
)
434413
return pred_xr, obs_xr
435414

436415
def _get_optimizer(self, training_cfgs):
@@ -457,26 +436,15 @@ def _get_loss_func(self, training_cfgs):
457436

458437
def _get_dataloader(self, training_cfgs, data_cfgs, mode="train"):
459438
if mode == "infer":
460-
ngrid = self.testdataset.ngrid
461-
if data_cfgs["sampler"] != "BasinBatchSampler":
462-
# TODO: this case should be tested more
463-
return DataLoader(
464-
self.testdataset,
465-
batch_size=training_cfgs["batch_size"],
466-
shuffle=False,
467-
sampler=None,
468-
batch_sampler=None,
469-
drop_last=False,
470-
timeout=0,
471-
worker_init_fn=None,
472-
)
473-
test_num_samples = self.testdataset.num_samples
474439
return DataLoader(
475440
self.testdataset,
476-
batch_size=test_num_samples // ngrid,
441+
batch_size=training_cfgs["batch_size"],
477442
shuffle=False,
443+
sampler=None,
444+
batch_sampler=None,
478445
drop_last=False,
479446
timeout=0,
447+
worker_init_fn=None,
480448
)
481449
worker_num = 0
482450
pin_memory = False

0 commit comments

Comments
 (0)