Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Update counters integration to use string split and fix missing import
  • Loading branch information
SandeepTuniki committed Apr 23, 2026
commit b3dea6b58e53d8be2c3e60401cdbbf5dce044c1b
28 changes: 23 additions & 5 deletions tools/import_validation/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,34 @@ def _initialize_data_sources(self, stats_summary: str, lint_report: str,
logging.warning("lint_report file exists but is empty: %s",
lint_report)

if counters_report and os.path.exists(counters_report) and os.path.getsize(
if counters_report:
self._load_counters(counters_report)

def _load_counters(self, counters_report: str):
"""Loads counters from a CSV file and stores them in data_sources."""
if os.path.exists(counters_report) and os.path.getsize(
counters_report) > 0:
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
df = pd.read_csv(counters_report)
def clean_key(x):
if not isinstance(x, str):
return x
if ':' in x:
x = x.split(':', 1)[1]
if '_' in x:
x = x.rsplit('_', 1)[-1]
return x
Comment on lines +191 to +198
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The clean_key logic is too aggressive and will break counter names that contain underscores. For example, a counter named records_count (following the recommended naming convention) would be incorrectly truncated to count. Additionally, rsplit('_', 1) only works if the counter name itself has no underscores. A more robust approach would be to use a regular expression to strip the specific stage prefix pattern (e.g., digit:string_).

                def clean_key(x):
                    if not isinstance(x, str):
                        return x
                    import re
                    # Strip stage prefix like "2:prepare_output_"
                    return re.sub(r'^\d+:[^:]+_', '', x)
References
  1. When naming a variable for a count of items, prefer the pattern plural_noun_count (e.g., records_count) over singular_noun_counts (e.g., record_counts).


df['key'] = df['key'].apply(clean_key)
# Aggregate by summing if there are duplicates
df = df.groupby('key')['value'].sum().reset_index()
self.data_sources['counters'] = dict(
zip(df['key'], df['value']))
except Exception as e:
logging.error(
f"JSON parse error while reading counters report at {counters_report}: {e}"
f"CSV parse error while reading counters report at {counters_report}: {e}"
)
Comment on lines +189 to +208
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

It is recommended to raise an exception if the counters report fails to parse, rather than just logging an error. Continuing with an empty counters dictionary could lead to incorrect validation results (e.g., COUNTER_ZERO_CHECK passing when it should have failed because the data was missing). Raising a ValueError ensures the user is aware that the validation process is incomplete due to corrupted input data, enforcing the correctness of the validation process.

Suggested change
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except Exception as e:
logging.error(
f"JSON parse error while reading counters report at {counters_report}: {e}"
)
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except (json.JSONDecodeError, OSError) as e:
raise ValueError(
f"Failed to parse counters report at {counters_report}: {e}"
) from e
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.

Comment on lines +205 to +208
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Avoid catching broad exceptions. It is better to catch specific errors that pd.read_csv might raise, such as pd.errors.ParserError or pd.errors.EmptyDataError.

Suggested change
except Exception as e:
logging.error(
f"CSV parse error while reading counters report at {counters_report}: {e}"
)
except pd.errors.ParserError as e:
logging.error(
f"CSV parse error while reading counters report at {counters_report}: {e}"
)

elif counters_report and os.path.exists(counters_report):
elif os.path.exists(counters_report):
logging.warning("counters_report file exists but is empty: %s",
counters_report)

Expand Down
56 changes: 52 additions & 4 deletions tools/import_validation/runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
"""Tests for the ValidationRunner."""

import csv
import unittest
from unittest.mock import patch, MagicMock
import pandas as pd
Expand Down Expand Up @@ -362,7 +363,7 @@ def setUp(self):
self.report_path = os.path.join(self.test_dir.name, 'report.json')
self.differ_path = os.path.join(self.test_dir.name, 'differ.csv')
self.output_path = os.path.join(self.test_dir.name, 'output.csv')
self.counters_path = os.path.join(self.test_dir.name, 'counters.json')
self.counters_path = os.path.join(self.test_dir.name, 'counters.csv')

def tearDown(self):
self.test_dir.cleanup()
Expand All @@ -385,9 +386,11 @@ def test_runner_loads_counters_and_calls_validator(self, MockValidator):
}]
}, f)

counters_data = {'invalid-lat-lng': 0}
with open(self.counters_path, 'w') as f:
json.dump(counters_data, f)
# Create sample CSV data for counters
with open(self.counters_path, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['key', 'value'])
writer.writerow(['invalid-lat-lng', 0])

# 3. Run the runner
runner = ValidationRunner(
Expand All @@ -404,3 +407,48 @@ def test_runner_loads_counters_and_calls_validator(self, MockValidator):
call_args, _ = mock_validator_instance.validate_counter_zero.call_args
self.assertEqual(call_args[0]['invalid-lat-lng'], 0)

@patch('tools.import_validation.runner.Validator')
def test_runner_strips_counter_prefixes(self, MockValidator):
    """Counter keys with stage prefixes are cleaned and duplicates summed."""
    # Stub the validator so we can inspect the counters dict handed to it.
    validator = MockValidator.return_value
    validator.validate_counter_max_threshold.return_value = ValidationResult(
        ValidationStatus.PASSED, 'COUNTER_MAX_THRESHOLD')

    # Config with a single max-threshold rule on the *cleaned* counter name.
    config = {
        'rules': [{
            'rule_id': 'check_dropped_points',
            'validator': 'COUNTER_MAX_THRESHOLD',
            'params': {
                'counter_name': 'dropped-points',
                'threshold': 10
            }
        }]
    }
    with open(self.config_path, 'w') as f:
        json.dump(config, f)

    # Two rows whose keys differ only in stage prefix; values must be summed.
    rows = [
        ['key', 'value'],
        ['2:prepare_output_dropped-points', 5],
        ['3:write_statvar_mcf_dropped-points', 3],
    ]
    with open(self.counters_path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

    # Drive the runner end to end with the fixture paths from setUp.
    runner = ValidationRunner(
        validation_config_path=self.config_path,
        stats_summary=self.stats_path,
        differ_output=self.differ_path,
        lint_report=self.report_path,
        validation_output=self.output_path,
        counters_report=self.counters_path)
    runner.run_validations()

    # The validator must see one aggregated entry: 5 + 3 = 8.
    validator.validate_counter_max_threshold.assert_called_once()
    (counters,), _ = validator.validate_counter_max_threshold.call_args
    self.assertEqual(counters['dropped-points'], 8)


Loading