add hurs calculation from t2m and d2m

chantreux · chantreux · commit 1da96b370330 · 2025-11-21T11:10:48.000+01:00
diff --git a/scripts/derived/derived-era5-single-levels-daily-statistics.py b/scripts/derived/derived-era5-single-levels-daily-statistics.py
@@ -0,0 +1,62 @@
+import operations
+import pandas as pd
+import xarray as xr
+import glob
+import os
+import logging
+from pathlib import Path
+import sys
+sys.path.append('../utilities')
+from utils import  load_output_path_from_row, require_single_row
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')  # root logger: INFO and above, timestamped
+
+def main():
+    """Compute the derived variables listed in the request CSV (only ``hurs`` so far).
+
+    Raises FileNotFoundError when the d2m or t2m input file for a year is missing.
+    """
+    dataset = "derived-era5-single-levels-daily-statistics"
+    df_parameters = pd.read_csv(f"../../requests/{dataset}.csv")
+    is_derived = df_parameters['product_type'] == 'derived'
+    derived_variables_list = df_parameters[is_derived]['filename_variable'].tolist()
+    logging.info(f"Derived variables to process: {derived_variables_list}")
+    for var in derived_variables_list:
+        logging.info(f"Calculating {var}")
+        mask_var = (df_parameters['filename_variable'] == var) & is_derived
+        var_row = require_single_row(df_parameters, mask_var, f"{var}/derived")
+        if var != "hurs":
+            # No recipe exists for other variables; report it instead of looping over years for nothing.
+            logging.warning(f"No calculation implemented for {var}. Skipping...")
+            continue
+        # Row lookups and directories do not depend on the year, so resolve them once per variable.
+        is_raw = df_parameters['product_type'] == 'raw'
+        input_dirs = {}
+        for name in ("d2m", "t2m"):
+            row = require_single_row(df_parameters, (df_parameters['filename_variable'] == name) & is_raw, f"{name}/raw")
+            input_dirs[name] = load_output_path_from_row(row, dataset)
+        dest_dir = load_output_path_from_row(var_row, dataset)
+        os.makedirs(dest_dir, exist_ok=True)
+        for year in range(var_row["cds_years_start"].squeeze(), var_row["cds_years_end"].squeeze() + 1):  # inclusive
+            inputs = {}
+            for name, directory in input_dirs.items():
+                matches = sorted(glob.glob(f"{directory}/*{year}*.nc"))  # sorted: deterministic pick
+                if not matches:
+                    # Indexing an empty glob result would only surface as a bare IndexError.
+                    raise FileNotFoundError(f"No {name} file found for {year} in {directory}")
+                inputs[name] = matches[0]
+            d2m_file, t2m_file = inputs["d2m"], inputs["t2m"]
+            output_file = Path(dest_dir) / os.path.basename(d2m_file).replace("d2m", "hurs")
+            logging.info(f"output_file: {output_file}")
+            if output_file.exists():
+                logging.info(f"File {output_file} already exists. Skipping...")
+                continue
+            logging.info(f"Calculating hurs from {d2m_file} and {t2m_file}")
+            # Context managers close both inputs even if the computation or the write fails.
+            with xr.open_dataset(d2m_file) as ds_d2m, xr.open_dataset(t2m_file) as ds_t2m:
+                hurs = operations.rh_from_thermofeel(xr.merge([ds_d2m, ds_t2m]), "d2m", "t2m")
+                logging.info(f"Saving calculated hurs to {dest_dir}")
+                hurs.to_netcdf(output_file)
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/scripts/derived/operations.py b/scripts/derived/operations.py
@@ -3,8 +3,6 @@
 import numpy as np
 from thermofeel.thermofeel import calculate_relative_humidity_percent
 
-# Import the original computation (the function lives in the same package)
-
 
 def rh_from_thermofeel(ds: xr.Dataset, td_var: str, t2_var: str) -> xr.Dataset:
     """
@@ -47,17 +45,18 @@ def rh_from_thermofeel(ds: xr.Dataset, td_var: str, t2_var: str) -> xr.Dataset:
         dask="parallelized",
         output_dtypes=[float],
     )
-
+    # Ensure that RH values are within physical bounds [0, 100]
+    rh_da = rh_da.clip(min=0.0, max=100.0)
     # Name and attributes for the output
-    rh_da.name = "relative_humidity"
+    rh_da.name = "hurs"
     rh_da.attrs["units"] = "%"
     rh_da.attrs["long_name"] = "Relative Humidity"
 
     # Build output dataset (copy ds so we keep coords and any ancillary variables)
     ds_out = ds.copy()
 
     # Add the new variable and remove the original ones
-    ds_out["relative_humidity"] = rh_da
+    ds_out["hurs"] = rh_da
     ds_out = ds_out.drop_vars([td_var, t2_var])
 
     return ds_out
diff --git a/scripts/derived/reanalysis-cerra-land_accumulation.py b/scripts/derived/reanalysis-cerra-land_accumulation.py
@@ -165,11 +165,15 @@ def accumulation(ds,var):
             if output_file.exists():
                 logging.info(f"File {output_file} already exists. Skipping...")
                 continue
-            next_file=var_files[i+1] if i+1 < len(var_files) else None
-            logging.info(f"Processing file {file} and next file {next_file}")
-
-            check_time_gap(file, next_file, expected_timestep='1h')
-            ds_var = xr.open_mfdataset([file,next_file],concat_dim='valid_time', combine='nested') 
+            if i+2 < len(var_files):
+                next_file=var_files[i+1]
+                logging.info(f"Processing file {file} and next file {next_file}")
+                check_time_gap(file, next_file, expected_timestep='1h')
+                file_list=[file,next_file]
+            else:
+                file_list=[file]
+
+            ds_var = xr.open_mfdataset(file_list,concat_dim='valid_time', combine='nested') 
             ds_accumulated=accumulation(ds_var,var)
             first_month_data = get_first_month_accumulated(ds_accumulated)