diff --git a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py index 61f76016e7..0a7e57a2cb 100644 --- a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py +++ b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py @@ -180,6 +180,20 @@ def clean_data(preprocessed_df, output_path): # number of columns should be 2 + 2X, we want the first 2 + X try: logging.info('Cleaning process start. ') + + # Skip redundant IDs + ids_to_skip = [ + 'HUX', 'HUXX', 'HUXXX', 'FRX', 'FRXX', 'FRXXX', 'EEXX', 'EEXXX' + ] + mask_to_skip = preprocessed_df['geo'].isin(ids_to_skip) + if mask_to_skip.any(): + skipped_data = preprocessed_df[mask_to_skip] + logging.info( + f"Skipping redundant IDs: {skipped_data['geo'].unique().tolist()}" + ) + + preprocessed_df = preprocessed_df[~mask_to_skip].copy() + num_clean_columns = len(preprocessed_df.columns) // 2 + 1 # drop unused columns clean_df = preprocessed_df.iloc[:, :num_clean_columns]