Skip to content

Commit fb16bf6

Browse files
committed
refactor rolling evaluate func; refactor denormalize -- set it into dataset
1 parent d23d905 commit fb16bf6

File tree

6 files changed

+173
-114
lines changed

6 files changed

+173
-114
lines changed

torchhydro/datasets/data_scalers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Author: Wenyu Ouyang
33
Date: 2024-04-08 18:17:44
4-
LastEditTime: 2024-11-05 09:21:24
4+
LastEditTime: 2025-01-12 15:23:28
55
LastEditors: Wenyu Ouyang
66
Description: normalize the data
77
FilePath: \torchhydro\torchhydro\datasets\data_scalers.py

torchhydro/datasets/data_sets.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""
22
Author: Wenyu Ouyang
33
Date: 2024-04-08 18:16:53
4-
LastEditTime: 2025-01-02 14:34:59
4+
LastEditTime: 2025-01-12 15:16:28
55
LastEditors: Wenyu Ouyang
66
Description: A pytorch dataset class; references to https://github.com/neuralhydrology/neuralhydrology
7-
FilePath: /torchhydro/torchhydro/datasets/data_sets.py
7+
FilePath: \torchhydro\torchhydro\datasets\data_sets.py
88
Copyright (c) 2024-2024 Wenyu Ouyang. All rights reserved.
99
"""
1010

@@ -26,6 +26,7 @@
2626
from torchhydro.datasets.data_sources import data_sources_dict
2727

2828
from torchhydro.datasets.data_utils import (
29+
set_unit_to_var,
2930
warn_if_nan,
3031
wrap_t_s_dict,
3132
)
@@ -279,6 +280,49 @@ def _normalize(self):
279280
self.target_scaler = scaler_hub.target_scaler
280281
return scaler_hub.x, scaler_hub.y, scaler_hub.c
281282

283+
def denormalize(self, norm_data, rolling=0):
284+
"""Denormalize the norm_data
285+
286+
Parameters
287+
----------
288+
norm_data : np.ndarray
289+
batch-first data
290+
rolling: int
291+
default 0; if rolling > 0, forecasting is performed with a rolling window of this size
292+
293+
Returns
294+
-------
295+
xr.Dataset
296+
denormlized data
297+
"""
298+
target_scaler = self.target_scaler
299+
target_data = target_scaler.data_target
300+
# the units are dimensionless for pure DL models
301+
units = {k: "dimensionless" for k in target_data.attrs["units"].keys()}
302+
if target_scaler.pbm_norm:
303+
units = {**units, **target_data.attrs["units"]}
304+
if rolling > 0:
305+
hindcast_output_window = target_scaler.data_cfgs["hindcast_output_window"]
306+
rho = target_scaler.data_cfgs["hindcast_length"]
307+
# TODO: -1 because seq2seqdataset has one more time, hence we need to cut it, as rolling will be refactored, we will modify it later
308+
selected_time_points = target_data.coords["time"][
309+
rho - hindcast_output_window : -1
310+
]
311+
else:
312+
warmup_length = self.warmup_length
313+
selected_time_points = target_data.coords["time"][warmup_length:]
314+
315+
selected_data = target_data.sel(time=selected_time_points)
316+
denorm_xr_ds = target_scaler.inverse_transform(
317+
xr.DataArray(
318+
norm_data.transpose(2, 0, 1),
319+
dims=selected_data.dims,
320+
coords=selected_data.coords,
321+
attrs={"units": units},
322+
)
323+
)
324+
return set_unit_to_var(denorm_xr_ds)
325+
282326
def _to_dataarray_with_unit(self, data_forcing_ds, data_output_ds, data_attr_ds):
283327
# trans to dataarray to better use xbatch
284328
if data_output_ds is not None:

torchhydro/datasets/data_utils.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""
22
Author: Wenyu Ouyang
33
Date: 2023-09-21 15:37:58
4-
LastEditTime: 2025-01-02 14:06:24
4+
LastEditTime: 2025-01-12 15:31:29
55
LastEditors: Wenyu Ouyang
66
Description: Some basic functions for dealing with data
7-
FilePath: /torchhydro/torchhydro/datasets/data_utils.py
7+
FilePath: \torchhydro\torchhydro\datasets\data_utils.py
88
Copyright (c) 2023-2024 Wenyu Ouyang. All rights reserved.
99
"""
1010

@@ -390,3 +390,26 @@ def dam_num_chosen(gages, usgs_id, dam_num):
390390
usgs_id[i] for i in range(data_attr.size) if data_attr[:, 0][i] == dam_num
391391
]
392392
)
393+
394+
395+
def set_unit_to_var(ds):
396+
"""returned xa.Dataset need has units for each variable -- xr.DataArray
397+
or the dataset cannot be saved to a netCDF file
398+
399+
Parameters
400+
----------
401+
ds : xr.Dataset
402+
the dataset with units as attributes
403+
404+
Returns
405+
-------
406+
ds : xr.Dataset
407+
unit attrs are set on each variable's DataArray
408+
"""
409+
units_dict = ds.attrs["units"]
410+
for var_name, units in units_dict.items():
411+
if var_name in ds:
412+
ds[var_name].attrs["units"] = units
413+
if "units" in ds.attrs:
414+
del ds.attrs["units"]
415+
return ds

torchhydro/trainers/deep_hydro.py

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""
22
Author: Wenyu Ouyang
33
Date: 2024-04-08 18:15:48
4-
LastEditTime: 2025-01-09 12:17:20
4+
LastEditTime: 2025-01-12 14:57:18
55
LastEditors: Wenyu Ouyang
66
Description: HydroDL model class
7-
FilePath: /torchhydro/torchhydro/trainers/deep_hydro.py
7+
FilePath: \torchhydro\torchhydro\trainers\deep_hydro.py
88
Copyright (c) 2024-2024 Wenyu Ouyang. All rights reserved.
99
"""
1010

@@ -42,8 +42,8 @@
4242
from torchhydro.trainers.train_logger import TrainLogger
4343
from torchhydro.trainers.train_utils import (
4444
EarlyStopper,
45+
rolling_evaluate,
4546
average_weights,
46-
denormalize4eval,
4747
evaluate_validation,
4848
compute_validation,
4949
model_infer,
@@ -399,37 +399,31 @@ def inference(self) -> Tuple[xr.Dataset, xr.Dataset]:
399399
obs = obs.flatten().reshape(test_dataloader.test_data.y.shape[0], -1, 1)
400400

401401
if evaluation_cfgs["rolling"] > 0:
402-
if evaluation_cfgs["rolling"] != data_cfgs["forecast_length"]:
403-
raise NotImplementedError(
404-
"rolling should be equal to forecast_length in data_cfgs now, others are not supported yet"
405-
)
406-
# TODO: now we only guarantee each time has only one value,
407-
# so we directly reshape the data rather than a real rolling
408402
ngrid = self.testdataset.ngrid
409403
nt = self.testdataset.nt
410-
target_len = len(data_cfgs["target_cols"])
411-
hindcast_output_window = data_cfgs["hindcast_output_window"]
404+
nf = len(data_cfgs["target_cols"])
405+
rolling = evaluation_cfgs["rolling"]
412406
forecast_length = data_cfgs["forecast_length"]
413-
window_size = hindcast_output_window + forecast_length
407+
hindcast_output_window = data_cfgs["hindcast_output_window"]
414408
rho = data_cfgs["hindcast_length"]
415-
recover_len = nt - rho + hindcast_output_window
416-
samples = int(pred.shape[0] / ngrid)
417-
pred_ = np.full((ngrid, recover_len, target_len), np.nan)
418-
obs_ = np.full((ngrid, recover_len, target_len), np.nan)
419-
# recover pred to pred_ and obs to obs_
420-
pred_4d = pred.reshape(ngrid, samples, window_size, target_len)
421-
obs_4d = obs.reshape(ngrid, samples, window_size, target_len)
422-
for i in range(ngrid):
423-
for j in range(0, recover_len - window_size + 1, window_size):
424-
pred_[i, j : j + window_size, :] = pred_4d[i, j, :, :]
425-
for i in range(ngrid):
426-
for j in range(0, recover_len - window_size + 1, window_size):
427-
obs_[i, j : j + window_size, :] = obs_4d[i, j, :, :]
428-
pred = pred_.reshape(ngrid, recover_len, target_len)
429-
obs = obs_.reshape(ngrid, recover_len, target_len)
430-
pred_xr, obs_xr = denormalize4eval(
431-
test_dataloader, pred, obs, rolling=evaluation_cfgs["rolling"]
432-
)
409+
pred = rolling_evaluate(
410+
(ngrid, nt, nf),
411+
rho,
412+
forecast_length,
413+
rolling,
414+
hindcast_output_window,
415+
pred,
416+
)
417+
obs = rolling_evaluate(
418+
(ngrid, nt, nf),
419+
rho,
420+
forecast_length,
421+
rolling,
422+
hindcast_output_window,
423+
obs,
424+
)
425+
pred_xr = self.testdataset.denormalize(pred, rolling=evaluation_cfgs["rolling"])
426+
obs_xr = self.testdataset.denormalize(obs, rolling=evaluation_cfgs["rolling"])
433427
return pred_xr, obs_xr
434428

435429
def _get_optimizer(self, training_cfgs):

torchhydro/trainers/resulter.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,6 @@
3131
from torchhydro.trainers.deep_hydro import DeepHydro
3232

3333

34-
def set_unit_to_var(ds):
35-
units_dict = ds.attrs["units"]
36-
for var_name, units in units_dict.items():
37-
if var_name in ds:
38-
ds[var_name].attrs["units"] = units
39-
if "units" in ds.attrs:
40-
del ds.attrs["units"]
41-
return ds
42-
43-
4434
class Resulter:
4535
def __init__(self, cfgs) -> None:
4636
self.cfgs = cfgs
@@ -112,8 +102,6 @@ def save_result(self, pred, obs):
112102
save_dir = self.result_dir
113103
flow_pred_file = os.path.join(save_dir, self.pred_name)
114104
flow_obs_file = os.path.join(save_dir, self.obs_name)
115-
pred = set_unit_to_var(pred)
116-
obs = set_unit_to_var(obs)
117105
pred.to_netcdf(flow_pred_file + ".nc")
118106
obs.to_netcdf(flow_obs_file + ".nc")
119107

0 commit comments

Comments
 (0)