|
11 | 11 | _make_cerberus_schema, |
12 | 12 | _remove_leaf_keys_from_dict, |
13 | 13 | _remove_leaf_keys_from_dict_in_list, |
| 14 | + format_validation_msgs_as_df, |
14 | 15 | MetameqValidator, |
15 | 16 | output_validation_msgs, |
16 | 17 | validate_metadata_df |
@@ -1133,3 +1134,188 @@ def test_validate_metadata_df_custom_check_with_validation(self): |
1133 | 1134 | "error_message": [["Date cannot be in the future"]] |
1134 | 1135 | }) |
1135 | 1136 | pd.testing.assert_frame_equal(expected_df, result_df) |
| 1137 | + |
| 1138 | + |
class TestFormatValidationMsgsAsDf(TestCase):
    """Exercise format_validation_msgs_as_df on a range of message lists.

    Each input message is a dict with "sample_name", "field_name" and a
    list-valued "error_message". The expectations encoded below are that
    the result is a DataFrame with one row per individual error string,
    ordered as shown in each test's expected frame.
    """

    @staticmethod
    def _msg(sample_name, field_name, *errors):
        """Return one validation-message dict holding the given errors."""
        return {
            "sample_name": sample_name,
            "field_name": field_name,
            "error_message": list(errors),
        }

    @staticmethod
    def _expected_frame(*rows):
        """Return the expected DataFrame for (sample, field, error) rows.

        The rows are transposed into per-column lists so the frame is
        constructed from a dict of lists, column order sample_name,
        field_name, error_message.
        """
        sample_names, field_names, error_messages = zip(*rows)
        return pd.DataFrame({
            "sample_name": list(sample_names),
            "field_name": list(field_names),
            "error_message": list(error_messages),
        })

    def test_format_validation_msgs_as_df_empty_list(self):
        """An empty message list gives a zero-row frame with all columns."""
        obs_df = format_validation_msgs_as_df([])

        self.assertIsInstance(obs_df, pd.DataFrame)
        expected_columns = ["sample_name", "field_name", "error_message"]
        self.assertEqual(expected_columns, list(obs_df.columns))
        self.assertEqual(0, len(obs_df))

    def test_format_validation_msgs_as_df_single_error(self):
        """One message carrying one error gives a single-row frame."""
        int_error = "must be of integer type"
        msgs = [self._msg("sample1", "age", int_error)]

        obs_df = format_validation_msgs_as_df(msgs)

        exp_df = self._expected_frame(("sample1", "age", int_error))
        pd.testing.assert_frame_equal(exp_df, obs_df)

    def test_format_validation_msgs_as_df_multiple_errors_same_field(self):
        """Several errors on one field are split into one row apiece."""
        date_error = "Must be a valid date"
        regex_error = \
            "value does not match regex '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'"
        msgs = [
            self._msg("sample1", "date_field", date_error, regex_error),
        ]

        obs_df = format_validation_msgs_as_df(msgs)

        exp_df = self._expected_frame(
            ("sample1", "date_field", date_error),
            ("sample1", "date_field", regex_error),
        )
        pd.testing.assert_frame_equal(exp_df, obs_df)

    def test_format_validation_msgs_as_df_multiple_fields_same_sample(self):
        """Errors on two different fields of one sample give two rows."""
        int_error = "must be of integer type"
        msgs = [
            self._msg("sample1", "age", int_error),
            self._msg("sample1", "count", int_error),
        ]

        obs_df = format_validation_msgs_as_df(msgs)

        exp_df = self._expected_frame(
            ("sample1", "age", int_error),
            ("sample1", "count", int_error),
        )
        pd.testing.assert_frame_equal(exp_df, obs_df)

    def test_format_validation_msgs_as_df_multiple_samples(self):
        """The same field failing in two samples gives one row per sample."""
        int_error = "must be of integer type"
        msgs = [
            self._msg("sample1", "age", int_error),
            self._msg("sample2", "age", int_error),
        ]

        obs_df = format_validation_msgs_as_df(msgs)

        exp_df = self._expected_frame(
            ("sample1", "age", int_error),
            ("sample2", "age", int_error),
        )
        pd.testing.assert_frame_equal(exp_df, obs_df)

    def test_format_validation_msgs_as_df_sorted_by_sample_then_field(self):
        """Rows come back ordered by sample_name, then by field_name."""
        # Deliberately supplied out of order on both keys.
        msgs = [
            self._msg("sample_z", "beta_field", "error z-beta"),
            self._msg("sample_a", "gamma_field", "error a-gamma"),
            self._msg("sample_a", "alpha_field", "error a-alpha"),
            self._msg("sample_z", "alpha_field", "error z-alpha"),
        ]

        obs_df = format_validation_msgs_as_df(msgs)

        exp_df = self._expected_frame(
            ("sample_a", "alpha_field", "error a-alpha"),
            ("sample_a", "gamma_field", "error a-gamma"),
            ("sample_z", "alpha_field", "error z-alpha"),
            ("sample_z", "beta_field", "error z-beta"),
        )
        pd.testing.assert_frame_equal(exp_df, obs_df)

    def test_format_validation_msgs_as_df_flattening_and_sorting_combined(
            self):
        """Flattening and ordering are applied together in one result."""
        # Error lists are also out of order; the expected frame below
        # lists them in ascending order within each sample/field pair.
        msgs = [
            self._msg("sample_b", "field_x", "error 2", "error 1"),
            self._msg("sample_a", "field_y", "error 5", "error 3"),
            self._msg("sample_a", "field_x", "error 4"),
        ]

        obs_df = format_validation_msgs_as_df(msgs)

        exp_df = self._expected_frame(
            ("sample_a", "field_x", "error 4"),
            ("sample_a", "field_y", "error 3"),
            ("sample_a", "field_y", "error 5"),
            ("sample_b", "field_x", "error 1"),
            ("sample_b", "field_x", "error 2"),
        )
        pd.testing.assert_frame_equal(exp_df, obs_df)
0 commit comments