Release GPU memory during validation and skip batches with NaN loss

OuyangWenyu · OuyangWenyu · commit 69cb8a011210 · 2025-04-21T13:47:05.000Z
diff --git a/torchhydro/datasets/data_sets.py b/torchhydro/datasets/data_sets.py
@@ -495,24 +495,24 @@ def _read_xyc(self):
         end_date = self.t_s_dict["t_final_range"][1]
         return self._read_xyc_specified_time(start_date, end_date)
 
-    def _rm_timeunit_key(self, data_output_ds_):
+    def _rm_timeunit_key(self, ds_):
         """this means the data source return a dict with key as time_unit
             in this BaseDataset, we only support unified time range for all basins, so we chose the first key
             TODO: maybe this could be refactored better
 
         Parameters
         ----------
-        data_output_ds_ : dict
-            the output data with time_unit as key
+        ds_ : dict or xr.Dataset
+            the xarray data, possibly wrapped in a dict with time_unit as key
 
         Returns
         ----------
-        data_output_ds_ : xr.Dataset
+        ds_ : xr.Dataset
             the output data without time_unit
         """
-        if isinstance(data_output_ds_, dict):
-            data_output_ds_ = data_output_ds_[list(data_output_ds_.keys())[0]]
-        return data_output_ds_
+        if isinstance(ds_, dict):
+            ds_ = ds_[list(ds_.keys())[0]]
+        return ds_
 
     def _read_xyc_specified_time(self, start_date, end_date):
         """Read x, y, c data from data source with specified time range
diff --git a/torchhydro/trainers/train_utils.py b/torchhydro/trainers/train_utils.py
@@ -532,7 +532,7 @@ def compute_validation(
     data_loader: DataLoader,
     device: torch.device = None,
     **kwargs,
-) -> float:
+):
     """
     Function to compute the validation loss metrics
 
@@ -557,21 +557,34 @@ def compute_validation(
     obs = []
     preds = []
     valid_loss = 0.0
+    obs_final = None
+    pred_final = None
     with torch.no_grad():
+        iter_num = 0
         for src, trg in data_loader:
             trg, output = model_infer(seq_first, device, model, src, trg)
             obs.append(trg)
             preds.append(output)
             valid_loss_ = compute_loss(trg, output, criterion)
+            if torch.isnan(valid_loss_):
+                # outside training mode, trg may be all NaN for a batch;
+                # when the loss comes out NaN we skip this batch
+                continue
             valid_loss = valid_loss + valid_loss_.item()
+            iter_num = iter_num + 1
             # clear memory to save GPU memory
+            if obs_final is None:
+                obs_final = trg.detach().cpu()
+                pred_final = output.detach().cpu()
+            else:
+                obs_final = torch.cat([obs_final, trg.detach().cpu()], dim=0)
+                pred_final = torch.cat([pred_final, output.detach().cpu()], dim=0)
+            del trg, output
             torch.cuda.empty_cache()
         # first dim is batch
-        obs_final = torch.cat(obs, dim=0)
-        pred_final = torch.cat(preds, dim=0)
-    valid_loss = valid_loss / len(data_loader)
-    y_obs = obs_final.detach().cpu().numpy()
-    y_pred = pred_final.detach().cpu().numpy()
+    valid_loss = valid_loss / iter_num
+    y_obs = obs_final.numpy()
+    y_pred = pred_final.numpy()
     return y_obs, y_pred, valid_loss