Skip to content

Commit 4e8dc5a

Browse files
committed
Merge remote-tracking branch 'downstream/dev' into dev
2 parents da14708 + 81ffaa6 commit 4e8dc5a

File tree

6 files changed

+109
-60
lines changed

6 files changed

+109
-60
lines changed

torchhydro/datasets/data_dict.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from torchhydro.datasets.data_sets import (
1212
BaseDataset,
13-
BaseDatasetValidSame,
13+
ForecastDataset,
1414
HFDataset,
1515
BasinSingleFlowDataset,
1616
DplDataset,
@@ -24,7 +24,7 @@
2424

2525
datasets_dict = {
2626
"StreamflowDataset": BaseDataset,
27-
"BaseDatasetValidSame": BaseDatasetValidSame,
27+
"ForecastDataset": ForecastDataset,
2828
"HFDataset": HFDataset,
2929
"SingleflowDataset": BasinSingleFlowDataset,
3030
"DplDataset": DplDataset,

torchhydro/datasets/data_sets.py

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -976,65 +976,100 @@ def _read_xyc(self):
976976
time_unit = self.data_cfgs["min_time_unit"]
977977

978978
# Determine the date format
979-
date_format = detect_date_format(end_date)
979+
date_format = detect_date_format(start_date)
980980

981981
# Adjust the end date based on the time unit
982-
end_date_dt = datetime.strptime(end_date, date_format)
982+
start_date_dt = datetime.strptime(start_date, date_format)
983983
if time_unit == "h":
984-
adjusted_end_date = (end_date_dt + timedelta(hours=interval)).strftime(
984+
adjusted_start_date = (start_date_dt - timedelta(hours=interval)).strftime(
985985
date_format
986986
)
987987
elif time_unit == "D":
988-
adjusted_end_date = (end_date_dt + timedelta(days=interval)).strftime(
988+
adjusted_start_date = (start_date_dt - timedelta(days=interval)).strftime(
989989
date_format
990990
)
991991
else:
992992
raise ValueError(f"Unsupported time unit: {time_unit}")
993-
return self._read_xyc_specified_time(start_date, adjusted_end_date)
993+
return self._read_xyc_specified_time(adjusted_start_date, end_date)
994994

995-
def _normalize(self):
996-
x, y, c = super()._normalize()
997-
# TODO: this work for minio? maybe better to move to basedataset
998-
return x.compute(), y.compute(), c.compute()
995+
def denormalize(self, norm_data, is_real_time=True):
996+
"""Denormalize the norm_data
997+
998+
Parameters
999+
----------
1000+
norm_data : np.ndarray
1001+
batch-first data
1002+
is_real_time : bool, optional
1003+
whether the data is real time data, by default True
1004+
sometimes we may have multiple results for one time period and we flatten them
1005+
so we need a temp time to replace real one
1006+
1007+
Returns
1008+
-------
1009+
xr.Dataset
1010+
denormalized data
1011+
"""
1012+
target_scaler = self.target_scaler
1013+
target_data = target_scaler.data_target
1014+
# the units are dimensionless for pure DL models
1015+
units = {k: "dimensionless" for k in target_data.attrs["units"].keys()}
1016+
if target_scaler.pbm_norm:
1017+
units = {**units, **target_data.attrs["units"]}
1018+
warmup_length = self.warmup_length
1019+
selected_time_points = target_data.coords["time"][warmup_length:-1]
1020+
selected_data = target_data.sel(time=selected_time_points)
1021+
denorm_xr_ds = target_scaler.inverse_transform(
1022+
xr.DataArray(
1023+
norm_data,
1024+
dims=selected_data.dims,
1025+
coords=selected_data.coords,
1026+
attrs={"units": units},
1027+
)
1028+
)
1029+
return set_unit_to_var(denorm_xr_ds)
9991030

10001031
def __getitem__(self, item: int):
10011032
basin, time = self.lookup_table[item]
10021033
rho = self.rho
10031034
horizon = self.horizon
10041035
hindcast_output_window = self.data_cfgs.get("hindcast_output_window", 0)
10051036
# p cover all encoder-decoder periods; +1 means the period while +0 means start of the current period
1006-
p = self.x[basin, time + 1 : time + rho + horizon + 1, 0].reshape(-1, 1)
1037+
p = self.x[basin, time + 1 : time + rho + horizon + 1, :1]
10071038
# s only cover encoder periods
10081039
s = self.x[basin, time : time + rho, 1:]
1009-
x = np.concatenate((p[:rho], s), axis=1)
1040+
# xe = np.concatenate((p[:rho], s), axis=1)
10101041

10111042
if self.c is None or self.c.shape[-1] == 0:
1012-
xc = x
1043+
pc = p
10131044
else:
10141045
c = self.c[basin, :]
10151046
c = np.tile(c, (rho + horizon, 1))
1016-
xc = np.concatenate((x, c[:rho]), axis=1)
1047+
pc = np.concatenate((p[:rho], c[:rho]), axis=1)
1048+
xe = np.concatenate((pc[:rho], s), axis=1)
10171049
# xh cover decoder periods
10181050
try:
1019-
xh = np.concatenate((p[rho:], c[rho:]), axis=1)
1051+
xd = np.concatenate((p[rho:], c[rho:]), axis=1)
10201052
except ValueError as e:
10211053
print(f"Error in np.concatenate: {e}")
10221054
print(f"p[rho:].shape: {p[rho:].shape}, c[rho:].shape: {c[rho:].shape}")
10231055
raise
10241056
# y cover specified encoder size (hindcast_output_window) and all decoder periods
10251057
y = self.y[
10261058
basin, time + rho - hindcast_output_window + 1 : time + rho + horizon + 1, :
1027-
]
1059+
] # qs
1060+
# y_q = y[:, :1]
1061+
# y_s = y[:, 1:]
1062+
# y = np.concatenate((y_s, y_q), axis=1)
10281063

10291064
if self.is_tra_val_te == "train":
10301065
return [
1031-
torch.from_numpy(xc).float(),
1032-
torch.from_numpy(xh).float(),
1066+
torch.from_numpy(xe).float(),
1067+
torch.from_numpy(xd).float(),
10331068
torch.from_numpy(y).float(),
10341069
], torch.from_numpy(y).float()
10351070
return [
1036-
torch.from_numpy(xc).float(),
1037-
torch.from_numpy(xh).float(),
1071+
torch.from_numpy(xe).float(),
1072+
torch.from_numpy(xd).float(),
10381073
], torch.from_numpy(y).float()
10391074

10401075

@@ -1086,15 +1121,15 @@ def __getitem__(self, item: int):
10861121
], torch.from_numpy(y).float()
10871122

10881123

1089-
class BaseDatasetValidSame(BaseDataset):
1124+
class ForecastDataset(BaseDataset):
10901125
def __init__(self, data_cfgs: dict, is_tra_val_te: str):
1091-
super(BaseDatasetValidSame, self).__init__(data_cfgs, is_tra_val_te)
1126+
super(ForecastDataset, self).__init__(data_cfgs, is_tra_val_te)
10921127

10931128
def __getitem__(self, item):
10941129
basin, idx = self.lookup_table[item]
10951130
warmup_length = self.warmup_length
10961131
x = self.x[basin, idx - warmup_length : idx + self.rho + self.horizon, :]
1097-
y = self.y[basin, idx : idx + self.rho + self.horizon, :]
1132+
y = self.y[basin, idx + self.rho : idx + self.rho + self.horizon, :]
10981133
if self.c is None or self.c.shape[-1] == 0:
10991134
return torch.from_numpy(x).float(), torch.from_numpy(y).float()
11001135
c = self.c[basin, :]

torchhydro/models/seq2seq.py

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.3)
6565
self.hidden_dim = hidden_dim
6666
self.pre_fc = nn.Linear(input_dim, hidden_dim)
6767
self.pre_relu = nn.ReLU()
68-
self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
68+
self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers)
6969
self.dropout = nn.Dropout(dropout)
7070
self.fc = nn.Linear(hidden_dim, output_dim)
7171

@@ -88,7 +88,7 @@ def __init__(self, input_dim, output_dim, hidden_dim, num_layers=1, dropout=0.3)
8888
self.hidden_dim = hidden_dim
8989
self.pre_fc = nn.Linear(input_dim, hidden_dim)
9090
self.pre_relu = nn.ReLU()
91-
self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
91+
self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers)
9292
self.dropout = nn.Dropout(dropout)
9393
self.fc_out = nn.Linear(hidden_dim, output_dim)
9494

@@ -157,6 +157,12 @@ def __init__(
157157
)
158158
self.transfer = StateTransferNetwork(hidden_dim=hidden_size)
159159

160+
def _teacher_forcing_preparation(self, trgs):
161+
# teacher forcing preparation
162+
valid_mask = ~torch.isnan(trgs)
163+
random_vals = torch.rand_like(valid_mask, dtype=torch.float)
164+
return (random_vals < self.teacher_forcing_ratio) * valid_mask
165+
160166
def forward(self, *src):
161167
if len(src) == 3:
162168
encoder_input, decoder_input, trgs = src
@@ -165,40 +171,43 @@ def forward(self, *src):
165171
device = decoder_input.device
166172
trgs = torch.full(
167173
(
168-
decoder_input.shape[0], # batch_size
169174
self.hindcast_output_window + self.trg_len, # seq
175+
decoder_input.shape[1], # batch_size
170176
self.output_size, # features
171177
),
172178
float("nan"),
173179
).to(device)
174-
encoder_outputs, hidden_, cell_ = self.encoder(encoder_input)
180+
trgs_q = trgs[:, :, :1]
181+
trgs_s = trgs[:, :, 1:]
182+
trgs = torch.cat((trgs_s, trgs_q), dim=2) # sq
183+
encoder_outputs, hidden_, cell_ = self.encoder(encoder_input) # sq
175184
hidden, cell = self.transfer(hidden_, cell_)
176185
outputs = []
177-
current_input = encoder_outputs[:, -1, :].unsqueeze(1)
186+
prev_output = encoder_outputs[-1, :, :].unsqueeze(0) # sq
187+
_, batch_size, _ = decoder_input.size()
178188

189+
outputs = torch.zeros(self.trg_len, batch_size, self.output_size).to(
190+
decoder_input.device
191+
)
192+
use_teacher_forcing = self._teacher_forcing_preparation(trgs)
179193
for t in range(self.trg_len):
180-
p = decoder_input[:, t, :].unsqueeze(1)
181-
current_input = torch.cat((current_input, p), dim=2)
182-
output, hidden, cell = self.decoder(current_input, hidden, cell)
183-
outputs.append(output.squeeze(1))
184-
trg = trgs[:, (self.hindcast_output_window + t), :].unsqueeze(1)
185-
valid_mask = ~torch.isnan(trg)
186-
random_vals = torch.rand_like(valid_mask, dtype=torch.float)
187-
use_teacher_forcing = (
188-
random_vals < self.teacher_forcing_ratio
189-
) * valid_mask
190-
current_input = torch.where(
191-
torch.isnan(trg), # if trg is nan
192-
output, # then use output
193-
trg * use_teacher_forcing
194-
+ output
195-
* (~use_teacher_forcing), # else calculate with teacher forcing
194+
pc = decoder_input[t : t + 1, :, :] # sq
195+
obs = trgs[self.hindcast_output_window + t, :, :].unsqueeze(0) # sq
196+
safe_obs = torch.where(torch.isnan(obs), torch.zeros_like(obs), obs)
197+
prev_output = torch.where( # sq
198+
use_teacher_forcing[t : t + 1, :, :],
199+
safe_obs,
200+
prev_output,
196201
)
197-
198-
outputs = torch.stack(outputs, dim=1)
202+
current_input = torch.cat((pc, prev_output), dim=2) # pcsq
203+
output, hidden, cell = self.decoder(current_input, hidden, cell)
204+
outputs[t, :, :] = output.squeeze(0) # sq
199205
if self.hindcast_output_window > 0:
200-
prec_outputs = encoder_outputs[:, -self.hindcast_output_window :, :]
201-
outputs = torch.cat((prec_outputs, outputs), dim=1)
206+
prec_outputs = encoder_outputs[-self.hindcast_output_window :, :, :]
207+
outputs = torch.cat((prec_outputs, outputs), dim=0)
208+
outputs_s = outputs[:, :, :1]
209+
outputs_q = outputs[:, :, 1:]
210+
outputs = torch.cat((outputs_q, outputs_s), dim=2) # qs
202211
return outputs
203212

204213

torchhydro/models/simple_lstm.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,23 +92,26 @@ def forward(self, *x):
9292
xfc_rho, xfc_hor, xq_rho, xq_hor = x
9393

9494
x_rho = torch.cat((xfc_rho, xq_rho), dim=-1)
95-
seq_len, batch_size, _ = xfc_hor.size()
96-
97-
use_teacher_forcing = self._teacher_forcing_preparation(xq_hor)
95+
hor_len, batch_size, _ = xfc_hor.size()
9896

9997
# hindcast-forecast, we do not have forecast-hindcast situation
10098
# do rho forward first, prev_output is the last output of rho (seq_length = 1, batch_size, feature = output_size)
10199
if self.hindcast_with_output:
102100
_, h_n, c_n, prev_output = self._rho_forward(x_rho)
101+
seq_len = hor_len
103102
else:
104103
# TODO: need more test
105-
seq_len = xfc_rho.shape[0] + seq_len
104+
seq_len = xfc_rho.shape[0] + hor_len
105+
xfc_hor = torch.cat((xfc_rho, xfc_hor), dim=0)
106+
xq_hor = torch.cat((xq_rho, xq_hor), dim=0)
106107
h_n = torch.randn(1, batch_size, self.hidden_size).to(xfc_rho.device) * 0.1
107108
c_n = torch.randn(1, batch_size, self.hidden_size).to(xfc_rho.device) * 0.1
108109
prev_output = (
109-
torch.randn(1, batch_size, self.output_size).to(x.device) * 0.1
110+
torch.randn(1, batch_size, self.output_size).to(xfc_rho.device) * 0.1
110111
)
111112

113+
use_teacher_forcing = self._teacher_forcing_preparation(xq_hor)
114+
112115
# do hor forward
113116
outputs = torch.zeros(seq_len, batch_size, self.output_size).to(xfc_rho.device)
114117
# TODO: too slow here when seq_len is large, need to optimize
@@ -131,7 +134,7 @@ def forward(self, *x):
131134
prev_output = self.linearOut(out_lstm)
132135
outputs[t, :, :] = prev_output.squeeze(0)
133136
# Return the outputs
134-
return outputs
137+
return outputs[-hor_len:, :, :]
135138

136139

137140
class MultiLayerLSTM(nn.Module):

torchhydro/trainers/deep_hydro.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,9 @@ def inference(self) -> Tuple[xr.Dataset, xr.Dataset]:
385385
test_preds = []
386386
obss = []
387387
with torch.no_grad():
388-
for xs, ys in test_dataloader:
388+
for xs, ys in tqdm(
389+
test_dataloader, desc="Processing", total=len(test_dataloader)
390+
):
389391
# here the a batch doesn't mean a basin; it is only an index in lookup table
390392
# for NtoN mode, only basin is index in lookup table, so the batch is same as basin
391393
# for Nto1 mode, batch is only an index

torchhydro/trainers/train_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -392,13 +392,13 @@ def _recover_samples_to_basin(arr_3d, valorte_data_loader, pace_idx):
392392
for sample_idx in range(arr_3d.shape[0]):
393393
# Get the basin and start time index corresponding to this sample
394394
basin, start_time = dataset.lookup_table[sample_idx]
395-
# Take the value at the last time step of this sample (at the position of rho + horizon)
396-
value = arr_3d[sample_idx, pace_idx, :]
397395
# Calculate the time position in the result array
398396
if pace_idx < 0:
397+
value = arr_3d[sample_idx, pace_idx, :]
399398
result_time_idx = start_time + warmup_len + rho + horizon + pace_idx
400399
else:
401-
result_time_idx = start_time + warmup_len + rho + pace_idx
400+
value = arr_3d[sample_idx, pace_idx - 1, :]
401+
result_time_idx = start_time + warmup_len + rho + pace_idx - 1
402402
# Fill in the corresponding position
403403
basin_array[basin, result_time_idx, :] = value
404404

@@ -609,7 +609,7 @@ def compute_validation(
609609
pred_final = None
610610
with torch.no_grad():
611611
iter_num = 0
612-
for src, trg in data_loader:
612+
for src, trg in tqdm(data_loader, desc="Processing", total=len(data_loader)):
613613
trg, output = model_infer(seq_first, device, model, src, trg)
614614
obs.append(trg)
615615
preds.append(output)

0 commit comments

Comments
 (0)