From c4a3343a0b45ff7c5395042ea8297e6595e86957 Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Thu, 23 Apr 2026 06:52:35 +0000 Subject: [PATCH 1/2] skipped the redundant ids coming in output --- .../birth_death_migration/import_data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py index 61f76016e7..6965a93c53 100644 --- a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py +++ b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py @@ -180,6 +180,18 @@ def clean_data(preprocessed_df, output_path): # number of columns should be 2 + 2X, we want the first 2 + X try: logging.info('Cleaning process start. ') + + # Skip redundant IDs + ids_to_skip = ['HUX', 'HUXX', 'HUXXX', 'FRX', 'FRXX', 'FRXXX', 'EEXX', 'EEXXX'] + mask_to_skip = preprocessed_df['geo'].isin(ids_to_skip) + if mask_to_skip.any(): + skipped_data = preprocessed_df[mask_to_skip] + logging.info( + f"Skipping redundant IDs: {skipped_data['geo'].unique().tolist()}" + ) + + preprocessed_df = preprocessed_df[~mask_to_skip].copy() + num_clean_columns = len(preprocessed_df.columns) // 2 + 1 # drop unused columns clean_df = preprocessed_df.iloc[:, :num_clean_columns] From 1764148522bb576f7b3ac0837370e6113ea4027f Mon Sep 17 00:00:00 2001 From: Nivedita Singh Date: Thu, 23 Apr 2026 09:37:24 +0000 Subject: [PATCH 2/2] fixed tests --- .../birth_death_migration/import_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py index 6965a93c53..0a7e57a2cb 100644 --- a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py +++ b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py @@ -182,7 +182,9 @@ def clean_data(preprocessed_df, output_path): logging.info('Cleaning process start. ') # Skip redundant IDs - ids_to_skip = ['HUX', 'HUXX', 'HUXXX', 'FRX', 'FRXX', 'FRXXX', 'EEXX', 'EEXXX'] + ids_to_skip = [ + 'HUX', 'HUXX', 'HUXXX', 'FRX', 'FRXX', 'FRXXX', 'EEXX', 'EEXXX' + ] mask_to_skip = preprocessed_df['geo'].isin(ids_to_skip) if mask_to_skip.any(): skipped_data = preprocessed_df[mask_to_skip]