Skip to content

Commit 51e4bc9

Browse files
Merge pull request #11 from AmandaBirmingham/readable_validation_errors_20260209
make validation errors more readable
2 parents 093e78e + 407fe79 commit 51e4bc9

File tree

5 files changed

+244
-10
lines changed

5 files changed

+244
-10
lines changed

metameq/src/metadata_extender.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from metameq.src.metadata_configurator import update_wip_metadata_dict, \
2121
build_full_flat_config_dict
2222
from metameq.src.metadata_validator import validate_metadata_df, \
23-
output_validation_msgs
23+
format_validation_msgs_as_df, output_validation_msgs
2424
import metameq.src.metadata_transformers as transformers
2525

2626

@@ -605,7 +605,7 @@ def _populate_metadata_df(
605605
metadata_df = _reorder_df(metadata_df, INTERNAL_COL_KEYS)
606606

607607
# Turn the validation messages into a DataFrame of validation messages for easier use downstream.
608-
validation_msgs_df = pandas.DataFrame(validation_msgs)
608+
validation_msgs_df = format_validation_msgs_as_df(validation_msgs)
609609

610610
return metadata_df, validation_msgs_df
611611

metameq/src/metadata_merger.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import numpy
12
import pandas
23
from typing import List, Optional, Literal
34
from metameq.src.util import validate_required_columns_exist
@@ -325,8 +326,8 @@ def _check_for_duplicate_field_vals(
325326
error_msgs = []
326327
duplicates_mask = metadata_df.duplicated(subset=col_name)
327328
if duplicates_mask.any():
328-
duplicates = metadata_df.loc[duplicates_mask, col_name].unique()
329-
duplicates.sort()
329+
duplicates = numpy.sort(
330+
metadata_df.loc[duplicates_mask, col_name].unique())
330331

331332
# generate an error message including the duplicate values
332333
error_msgs.append(

metameq/src/metadata_validator.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from dateutil import parser
55
import logging
66
import os
7+
import pandas
78
from pathlib import Path
89
from metameq.src.util import SAMPLE_NAME_KEY, get_extension, cast_field_to_type
910

@@ -146,6 +147,47 @@ def output_validation_msgs(validation_msgs_df, out_dir, out_base, sep="\t",
146147
validation_msgs_df.to_csv(out_fp, sep=sep, index=False)
147148

148149

def format_validation_msgs_as_df(validation_msgs):
    """Flatten validation messages into a human-readable DataFrame.

    Each input dictionary (as returned by ``_generate_validation_msg`` or
    ``validate_metadata_df``) carries a *list* of error strings; here every
    error string becomes its own row so the output is easy to read and
    filter downstream.

    Parameters
    ----------
    validation_msgs : list
        A list of dictionaries, each containing SAMPLE_NAME_KEY,
        "field_name", and "error_message" keys, where "error_message"
        is a list of error strings.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with columns SAMPLE_NAME_KEY, "field_name", and
        "error_message" (a single string per row), sorted by
        SAMPLE_NAME_KEY then "field_name" then "error_message".
    """
    # Output column order doubles as the sort-key order.
    out_cols = [SAMPLE_NAME_KEY, "field_name", "error_message"]

    # One output row per individual error string within each message.
    flat_records = [
        {SAMPLE_NAME_KEY: curr_msg[SAMPLE_NAME_KEY],
         "field_name": curr_msg["field_name"],
         "error_message": curr_err}
        for curr_msg in validation_msgs
        for curr_err in curr_msg["error_message"]
    ]

    # Pinning columns= keeps the expected columns even for empty input.
    unsorted_df = pandas.DataFrame(flat_records, columns=out_cols)

    # ignore_index=True yields the same clean 0..n-1 index as a
    # sort followed by reset_index(drop=True).
    return unsorted_df.sort_values(by=out_cols, ignore_index=True)
189+
190+
149191
def _make_cerberus_schema(sample_type_metadata_dict):
150192
"""Convert a metadata fields dictionary into a cerberus-compatible validation schema.
151193

metameq/tests/test_metadata_extender/test_group_entry_points.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,8 @@ def test_write_extended_metadata_from_df_with_validation_errors(self):
289289
METADATA_FIELDS_KEY: {
290290
"restricted_field": {
291291
TYPE_KEY: "string",
292-
ALLOWED_KEY: ["allowed_value"]
292+
ALLOWED_KEY: ["allowed_value"],
293+
"regex": "^allowed_.*$"
293294
}
294295
},
295296
SAMPLE_TYPE_SPECIFIC_METADATA_KEY: {
@@ -323,15 +324,19 @@ def test_write_extended_metadata_from_df_with_validation_errors(self):
323324
})
324325
assert_frame_equal(expected_result_df, result_df)
325326

326-
# Verify validation errors file contains the error
327+
# Verify validation errors file contains the errors
328+
# (two flattened rows for sample1's restricted_field)
327329
validation_files = glob.glob(
328330
os.path.join(tmpdir, "*_test_output_validation_errors.csv"))
329331
self.assertEqual(1, len(validation_files))
330332
validation_df = pandas.read_csv(validation_files[0], sep=",", dtype=str, keep_default_na=False)
331333
expected_validation_df = pandas.DataFrame({
332-
"sample_name": ["sample1"],
333-
"field_name": ["restricted_field"],
334-
"error_message": ["['unallowed value invalid_value']"]
334+
"sample_name": ["sample1", "sample1"],
335+
"field_name": ["restricted_field", "restricted_field"],
336+
"error_message": [
337+
"unallowed value invalid_value",
338+
"value does not match regex '^allowed_.*$'"
339+
]
335340
})
336341
assert_frame_equal(expected_validation_df, validation_df)
337342

@@ -648,7 +653,7 @@ def test_write_extended_metadata_with_validation_errors(self):
648653
expected_validation_df = pandas.DataFrame({
649654
"sample_name": ["sample1"],
650655
"field_name": ["restricted_field"],
651-
"error_message": ["['unallowed value invalid_value']"]
656+
"error_message": ["unallowed value invalid_value"]
652657
})
653658
assert_frame_equal(expected_validation_df, validation_df)
654659

metameq/tests/test_metadata_validator.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
_make_cerberus_schema,
1212
_remove_leaf_keys_from_dict,
1313
_remove_leaf_keys_from_dict_in_list,
14+
format_validation_msgs_as_df,
1415
MetameqValidator,
1516
output_validation_msgs,
1617
validate_metadata_df
@@ -1133,3 +1134,188 @@ def test_validate_metadata_df_custom_check_with_validation(self):
11331134
"error_message": [["Date cannot be in the future"]]
11341135
})
11351136
pd.testing.assert_frame_equal(expected_df, result_df)
1137+
1138+
class TestFormatValidationMsgsAsDf(TestCase):
    """Tests for format_validation_msgs_as_df function."""

    @staticmethod
    def _make_input_msg(sample, field, errors):
        # Build one raw validation-message dict in the input format
        # (errors is the *list* of error strings for this field).
        return {"sample_name": sample,
                "field_name": field,
                "error_message": errors}

    @staticmethod
    def _make_expected_df(row_tuples):
        # Build the expected flattened frame from
        # (sample_name, field_name, error_message) tuples, one per row.
        return pd.DataFrame(
            [{"sample_name": s, "field_name": f, "error_message": e}
             for s, f, e in row_tuples])

    def test_format_validation_msgs_as_df_empty_list(self):
        """Test that empty input returns an empty DataFrame with correct columns."""
        observed = format_validation_msgs_as_df([])

        self.assertIsInstance(observed, pd.DataFrame)
        self.assertEqual(
            ["sample_name", "field_name", "error_message"],
            list(observed.columns))
        self.assertEqual(0, len(observed))

    def test_format_validation_msgs_as_df_single_error(self):
        """Test formatting a single validation message with one error."""
        input_msgs = [
            self._make_input_msg(
                "sample1", "age", ["must be of integer type"])]

        observed = format_validation_msgs_as_df(input_msgs)

        expected = self._make_expected_df(
            [("sample1", "age", "must be of integer type")])
        pd.testing.assert_frame_equal(expected, observed)

    def test_format_validation_msgs_as_df_multiple_errors_same_field(self):
        """Test that multiple errors for one field are flattened to separate rows."""
        input_msgs = [
            self._make_input_msg("sample1", "date_field", [
                "Must be a valid date",
                "value does not match regex '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'"
            ])]

        observed = format_validation_msgs_as_df(input_msgs)

        expected = self._make_expected_df([
            ("sample1", "date_field", "Must be a valid date"),
            ("sample1", "date_field",
             "value does not match regex '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'")
        ])
        pd.testing.assert_frame_equal(expected, observed)

    def test_format_validation_msgs_as_df_multiple_fields_same_sample(self):
        """Test multiple fields with errors for the same sample."""
        input_msgs = [
            self._make_input_msg(
                "sample1", "age", ["must be of integer type"]),
            self._make_input_msg(
                "sample1", "count", ["must be of integer type"])
        ]

        observed = format_validation_msgs_as_df(input_msgs)

        expected = self._make_expected_df([
            ("sample1", "age", "must be of integer type"),
            ("sample1", "count", "must be of integer type")
        ])
        pd.testing.assert_frame_equal(expected, observed)

    def test_format_validation_msgs_as_df_multiple_samples(self):
        """Test errors across multiple samples."""
        input_msgs = [
            self._make_input_msg(
                "sample1", "age", ["must be of integer type"]),
            self._make_input_msg(
                "sample2", "age", ["must be of integer type"])
        ]

        observed = format_validation_msgs_as_df(input_msgs)

        expected = self._make_expected_df([
            ("sample1", "age", "must be of integer type"),
            ("sample2", "age", "must be of integer type")
        ])
        pd.testing.assert_frame_equal(expected, observed)

    def test_format_validation_msgs_as_df_sorted_by_sample_then_field(self):
        """Test that output is sorted by sample_name then field_name."""
        # Deliberately out of order on input; output must be sorted.
        input_msgs = [
            self._make_input_msg(
                "sample_z", "beta_field", ["error z-beta"]),
            self._make_input_msg(
                "sample_a", "gamma_field", ["error a-gamma"]),
            self._make_input_msg(
                "sample_a", "alpha_field", ["error a-alpha"]),
            self._make_input_msg(
                "sample_z", "alpha_field", ["error z-alpha"])
        ]

        observed = format_validation_msgs_as_df(input_msgs)

        expected = self._make_expected_df([
            ("sample_a", "alpha_field", "error a-alpha"),
            ("sample_a", "gamma_field", "error a-gamma"),
            ("sample_z", "alpha_field", "error z-alpha"),
            ("sample_z", "beta_field", "error z-beta")
        ])
        pd.testing.assert_frame_equal(expected, observed)

    def test_format_validation_msgs_as_df_flattening_and_sorting_combined(self):
        """Test that flattening and sorting work correctly together."""
        # Multi-error lists plus shuffled input exercise both behaviors
        # at once; note errors within one list are also sorted.
        input_msgs = [
            self._make_input_msg(
                "sample_b", "field_x", ["error 2", "error 1"]),
            self._make_input_msg(
                "sample_a", "field_y", ["error 5", "error 3"]),
            self._make_input_msg(
                "sample_a", "field_x", ["error 4"])
        ]

        observed = format_validation_msgs_as_df(input_msgs)

        expected = self._make_expected_df([
            ("sample_a", "field_x", "error 4"),
            ("sample_a", "field_y", "error 3"),
            ("sample_a", "field_y", "error 5"),
            ("sample_b", "field_x", "error 1"),
            ("sample_b", "field_x", "error 2")
        ])
        pd.testing.assert_frame_equal(expected, observed)

0 commit comments

Comments
 (0)