Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Update counters integration to use string split and fix missing import
  • Loading branch information
SandeepTuniki committed Apr 23, 2026
commit b3dea6b58e53d8be2c3e60401cdbbf5dce044c1b
28 changes: 23 additions & 5 deletions tools/import_validation/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,34 @@ def _initialize_data_sources(self, stats_summary: str, lint_report: str,
logging.warning("lint_report file exists but is empty: %s",
lint_report)

if counters_report and os.path.exists(counters_report) and os.path.getsize(
if counters_report:
self._load_counters(counters_report)

def _load_counters(self, counters_report: str):
"""Loads counters from a CSV file and stores them in data_sources."""
if os.path.exists(counters_report) and os.path.getsize(
counters_report) > 0:
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
df = pd.read_csv(counters_report)
def clean_key(x):
if not isinstance(x, str):
return x
if ':' in x:
x = x.split(':', 1)[1]
if '_' in x:
x = x.rsplit('_', 1)[-1]
return x
Comment on lines +191 to +198
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The clean_key logic is too aggressive and will break counter names that contain underscores. For example, a counter named records_count (following the recommended naming convention) would be incorrectly truncated to count. Additionally, rsplit('_', 1) only works if the counter name itself has no underscores. A more robust approach would be to use a regular expression to strip the specific stage prefix pattern (e.g., digit:string_).

                def clean_key(x):
                    if not isinstance(x, str):
                        return x
                    import re
                    # Strip stage prefix like "2:prepare_output_"
                    return re.sub(r'^\d+:[^:]+_', '', x)
References
  1. When naming a variable for a count of items, prefer the pattern plural_noun_count (e.g., records_count) over singular_noun_counts (e.g., record_counts).


df['key'] = df['key'].apply(clean_key)
# Aggregate by summing if there are duplicates
df = df.groupby('key')['value'].sum().reset_index()
self.data_sources['counters'] = dict(
zip(df['key'], df['value']))
except Exception as e:
logging.error(
f"JSON parse error while reading counters report at {counters_report}: {e}"
f"CSV parse error while reading counters report at {counters_report}: {e}"
)
Comment on lines +189 to +208
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

It is recommended to raise an exception if the counters report fails to parse, rather than just logging an error. Continuing with an empty counters dictionary could lead to incorrect validation results (e.g., COUNTER_ZERO_CHECK passing when it should have failed because the data was missing). Raising a ValueError ensures the user is aware that the validation process is incomplete due to corrupted input data, enforcing the correctness of the validation process.

Suggested change
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except Exception as e:
logging.error(
f"JSON parse error while reading counters report at {counters_report}: {e}"
)
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except (json.JSONDecodeError, OSError) as e:
raise ValueError(
f"Failed to parse counters report at {counters_report}: {e}"
) from e
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.

Comment on lines +205 to +208
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Avoid catching broad exceptions. It is better to catch specific errors that pd.read_csv might raise, such as pd.errors.ParserError or pd.errors.EmptyDataError.

Suggested change
except Exception as e:
logging.error(
f"CSV parse error while reading counters report at {counters_report}: {e}"
)
except pd.errors.ParserError as e:
logging.error(
f"CSV parse error while reading counters report at {counters_report}: {e}"
)

elif counters_report and os.path.exists(counters_report):
elif os.path.exists(counters_report):
logging.warning("counters_report file exists but is empty: %s",
counters_report)

Expand Down
56 changes: 52 additions & 4 deletions tools/import_validation/runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
"""Tests for the ValidationRunner."""

import csv
import unittest
from unittest.mock import patch, MagicMock
import pandas as pd
Expand Down Expand Up @@ -362,7 +363,7 @@ def setUp(self):
self.report_path = os.path.join(self.test_dir.name, 'report.json')
self.differ_path = os.path.join(self.test_dir.name, 'differ.csv')
self.output_path = os.path.join(self.test_dir.name, 'output.csv')
self.counters_path = os.path.join(self.test_dir.name, 'counters.json')
self.counters_path = os.path.join(self.test_dir.name, 'counters.csv')

def tearDown(self):
self.test_dir.cleanup()
Expand All @@ -385,9 +386,11 @@ def test_runner_loads_counters_and_calls_validator(self, MockValidator):
}]
}, f)

counters_data = {'invalid-lat-lng': 0}
with open(self.counters_path, 'w') as f:
json.dump(counters_data, f)
# Create sample CSV data for counters
with open(self.counters_path, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['key', 'value'])
writer.writerow(['invalid-lat-lng', 0])

# 3. Run the runner
runner = ValidationRunner(
Expand All @@ -404,3 +407,48 @@ def test_runner_loads_counters_and_calls_validator(self, MockValidator):
call_args, _ = mock_validator_instance.validate_counter_zero.call_args
self.assertEqual(call_args[0]['invalid-lat-lng'], 0)

@patch('tools.import_validation.runner.Validator')
def test_runner_strips_counter_prefixes(self, MockValidator):
    """Counter keys with stage prefixes are cleaned and duplicates summed."""
    # Stub the validator so we can inspect the counters dict handed to it.
    validator = MockValidator.return_value
    validator.validate_counter_max_threshold.return_value = ValidationResult(
        ValidationStatus.PASSED, 'COUNTER_MAX_THRESHOLD')

    # Config with a single max-threshold rule on the *cleaned* counter name.
    config = {
        'rules': [{
            'rule_id': 'check_dropped_points',
            'validator': 'COUNTER_MAX_THRESHOLD',
            'params': {
                'counter_name': 'dropped-points',
                'threshold': 10
            }
        }]
    }
    with open(self.config_path, 'w') as f:
        json.dump(config, f)

    # Two rows whose keys differ only in stage prefix; values must be summed.
    rows = [
        ['key', 'value'],
        ['2:prepare_output_dropped-points', 5],
        ['3:write_statvar_mcf_dropped-points', 3],
    ]
    with open(self.counters_path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

    # Drive the runner end to end with the fixture paths from setUp.
    runner = ValidationRunner(
        validation_config_path=self.config_path,
        stats_summary=self.stats_path,
        differ_output=self.differ_path,
        lint_report=self.report_path,
        validation_output=self.output_path,
        counters_report=self.counters_path)
    runner.run_validations()

    # The validator must see one aggregated entry: 5 + 3 = 8.
    validator.validate_counter_max_threshold.assert_called_once()
    (counters,), _ = validator.validate_counter_max_threshold.call_args
    self.assertEqual(counters['dropped-points'], 8)


Loading