From 5cd5516158bd022515434da4c2983219763f8765 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 18 Mar 2026 11:54:34 +0100 Subject: [PATCH 1/7] Simplify status clause --- src/routers/openml/datasets.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 164efd7d..0db65e76 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -115,16 +115,10 @@ async def list_datasets( # noqa: PLR0913 """, ) - if status == DatasetStatusFilter.ALL: - statuses = [ - DatasetStatusFilter.ACTIVE, - DatasetStatusFilter.DEACTIVATED, - DatasetStatusFilter.IN_PREPARATION, - ] - else: - statuses = [status] + clauses = [] + if status != DatasetStatusFilter.ALL: + clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = '{status}' ") - where_status = ",".join(f"'{status}'" for status in statuses) if user is None: visible_to_user = "`visibility`='public'" elif UserGroup.ADMIN in await user.get_groups(): @@ -182,7 +176,7 @@ def quality_clause(quality: str, range_: str | None) -> str: WHERE {visible_to_user} {where_name} {where_version} {where_uploader} {where_data_id} {matching_tag} {number_instances_filter} {number_features_filter} {number_classes_filter} {number_missing_values_filter} - AND IFNULL(cs.`status`, 'in_preparation') IN ({where_status}) + {"".join(clauses)} LIMIT {pagination.limit} OFFSET {pagination.offset} """, # noqa: S608 # I am not sure how to do this correctly without an error from Bandit here. From 78d15fb5bc327434bcffb5c2626831fd2028370b Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 18 Mar 2026 12:14:39 +0100 Subject: [PATCH 2/7] Follow consistent pattern when constructing where clauses --- src/routers/openml/datasets.py | 65 ++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 0db65e76..6e7939e9 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -116,36 +116,45 @@ async def list_datasets( # noqa: PLR0913 ) clauses = [] + parameters = {} if status != DatasetStatusFilter.ALL: - clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = '{status}' ") + clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = :status") + parameters["status"] = status if user is None: - visible_to_user = "`visibility`='public'" - elif UserGroup.ADMIN in await user.get_groups(): - visible_to_user = "TRUE" - else: - visible_to_user = f"(`visibility`='public' OR `uploader`={user.user_id})" + clauses.append("AND `visibility`='public'") + elif UserGroup.ADMIN not in await user.get_groups(): + clauses.append("AND (`visibility`='public' OR `uploader`=:user_id)") + parameters["user_id"] = user.user_id + + if uploader: + clauses.append("AND `uploader`=:uploader") + parameters["uploader"] = uploader + + if data_name: + clauses.append("AND `name`=:data_name") + parameters["data_name"] = data_name - where_name = "" if data_name is None else "AND `name`=:data_name" - where_version = "" if data_version is None else "AND `version`=:data_version" - where_uploader = "" if uploader is None else "AND `uploader`=:uploader" - data_id_str = ",".join(str(did) for did in data_id) if data_id else "" - where_data_id = "" if not data_id else f"AND d.`did` IN ({data_id_str})" + if data_version: + clauses.append("AND `version`=:data_version") + parameters["data_version"] = data_version + + if data_id: + clauses.append("AND d.`did` IN :data_ids") + parameters["data_ids"] = data_id # requires some benchmarking on whether e.g., IN () is more efficient. - matching_tag = ( - text( + if tag: + clauses.append( + """ + AND d.`did` IN ( + SELECT `id` + FROM dataset_tag as dt + WHERE dt.`tag`=:tag + ) """ - AND d.`did` IN ( - SELECT `id` - FROM dataset_tag as dt - WHERE dt.`tag`=:tag - ) - """, ) - if tag - else "" - ) + parameters["tag"] = tag def quality_clause(quality: str, range_: str | None) -> str: if not range_: @@ -173,10 +182,9 @@ def quality_clause(quality: str, range_: str | None) -> str: IFNULL(cs.`status`, 'in_preparation') FROM dataset AS d LEFT JOIN ({current_status}) AS cs ON d.`did`=cs.`did` - WHERE {visible_to_user} {where_name} {where_version} {where_uploader} - {where_data_id} {matching_tag} {number_instances_filter} {number_features_filter} + WHERE 1=1 {number_instances_filter} {number_features_filter} {number_classes_filter} {number_missing_values_filter} - {"".join(clauses)} + {" ".join(clauses)} LIMIT {pagination.limit} OFFSET {pagination.offset} """, # noqa: S608 # I am not sure how to do this correctly without an error from Bandit here. @@ -187,12 +195,7 @@ def quality_clause(quality: str, range_: str | None) -> str: columns = ["did", "name", "version", "format", "file_id", "status"] result = await expdb_db.execute( matching_filter, - parameters={ - "tag": tag, - "data_name": data_name, - "data_version": data_version, - "uploader": uploader, - }, + parameters=parameters, ) rows = result.all() datasets: dict[int, dict[str, Any]] = { From 60ad465a30743554cc9536c34eebc92bebe08020 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 18 Mar 2026 13:58:53 +0100 Subject: [PATCH 3/7] Extract quality subquery creation function --- src/routers/openml/datasets.py | 49 +++++++++++++++++----------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 6e7939e9..02691952 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -73,6 +73,23 @@ class DatasetStatusFilter(StrEnum): ALL = "all" +def _quality_clause(quality: str, range_: str | None) -> str: + if not range_: + return "" + if not (match := re.match(integer_range_regex, range_)): + msg = f"`range_` not a valid range: {range_}" + raise ValueError(msg) + start, end = match.groups() + value = f"`value` BETWEEN {start} AND {end[2:]}" if end else f"`value`={start}" + return f""" AND + d.`did` IN ( + SELECT `data` + FROM data_quality + WHERE `quality`='{quality}' AND {value} + ) + """ # noqa: S608 - `quality` is not user provided, value is filtered with regex + + @router.post(path="/list", description="Provided for convenience, same as `GET` endpoint.") @router.get(path="/list") async def list_datasets( # noqa: PLR0913 @@ -116,9 +133,9 @@ async def list_datasets( # noqa: PLR0913 ) clauses = [] - parameters = {} + parameters: dict[str, Any] = {} if status != DatasetStatusFilter.ALL: - clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = :status") + clauses.append("AND IFNULL(cs.`status`, 'in_preparation') = :status") parameters["status"] = status if user is None: @@ -152,30 +169,15 @@ async def list_datasets( # noqa: PLR0913 FROM dataset_tag as dt WHERE dt.`tag`=:tag ) - """ + """, ) parameters["tag"] = tag - def quality_clause(quality: str, range_: str | None) -> str: - if not range_: - return "" - if not (match := re.match(integer_range_regex, range_)): - msg = f"`range_` not a valid range: {range_}" - raise ValueError(msg) - start, end = match.groups() - value = f"`value` BETWEEN {start} AND {end[2:]}" if end else f"`value`={start}" - return f""" AND - d.`did` IN ( - SELECT `data` - FROM data_quality - WHERE `quality`='{quality}' AND {value} - ) - """ # noqa: S608 - `quality` is not user provided, value is filtered with regex - - number_instances_filter = quality_clause("NumberOfInstances", number_instances) - number_classes_filter = quality_clause("NumberOfClasses", number_classes) - number_features_filter = quality_clause("NumberOfFeatures", number_features) - number_missing_values_filter = quality_clause("NumberOfMissingValues", number_missing_values) + number_instances_filter = _quality_clause("NumberOfInstances", number_instances) + number_classes_filter = _quality_clause("NumberOfClasses", number_classes) + number_features_filter = _quality_clause("NumberOfFeatures", number_features) + number_missing_values_filter = _quality_clause("NumberOfMissingValues", number_missing_values) + columns = ["did", "name", "version", "format", "file_id", "status"] matching_filter = text( f""" SELECT d.`did`,d.`name`,d.`version`,d.`format`,d.`file_id`, @@ -192,7 +194,6 @@ def quality_clause(quality: str, range_: str | None) -> str: # of given options, so no injection is possible (I think). The `current_status` # subquery also has no user input. So I think this should be safe. ) - columns = ["did", "name", "version", "format", "file_id", "status"] result = await expdb_db.execute( matching_filter, parameters=parameters, From 5dda0263ec055339048db4a28da001ef1d1e5a52 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 18 Mar 2026 15:32:10 +0100 Subject: [PATCH 4/7] Use sqlaclhemy parametrization for pagination --- src/routers/openml/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 02691952..31cbbc6c 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -133,7 +133,7 @@ async def list_datasets( # noqa: PLR0913 ) clauses = [] - parameters: dict[str, Any] = {} + parameters: dict[str, Any] = {"offset": pagination.offset, "limit": pagination.limit} if status != DatasetStatusFilter.ALL: clauses.append("AND IFNULL(cs.`status`, 'in_preparation') = :status") parameters["status"] = status @@ -187,7 +187,7 @@ async def list_datasets( # noqa: PLR0913 WHERE 1=1 {number_instances_filter} {number_features_filter} {number_classes_filter} {number_missing_values_filter} {" ".join(clauses)} - LIMIT {pagination.limit} OFFSET {pagination.offset} + LIMIT :limit OFFSET :offset """, # noqa: S608 # I am not sure how to do this correctly without an error from Bandit here. # However, the `status` input is already checked by FastAPI to be from a set From 1205f73b487383014c62d7893cb2117833b724c3 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 23 Mar 2026 11:10:31 +0100 Subject: [PATCH 5/7] rename variable for clarity --- src/routers/openml/datasets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 31cbbc6c..d8d79b1f 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -120,7 +120,7 @@ async def list_datasets( # noqa: PLR0913 expdb_db: Annotated[AsyncConnection, Depends(expdb_connection)] = None, ) -> list[dict[str, Any]]: assert expdb_db is not None # noqa: S101 - current_status = text( + status_subquery = text( """ SELECT ds1.`did`, ds1.`status` FROM dataset_status as ds1 @@ -133,7 +133,10 @@ async def list_datasets( # noqa: PLR0913 ) clauses = [] - parameters: dict[str, Any] = {"offset": pagination.offset, "limit": pagination.limit} + parameters: dict[str, Any] = { + "offset": pagination.offset, + "limit": pagination.limit, + } if status != DatasetStatusFilter.ALL: clauses.append("AND IFNULL(cs.`status`, 'in_preparation') = :status") parameters["status"] = status @@ -183,7 +186,7 @@ async def list_datasets( # noqa: PLR0913 SELECT d.`did`,d.`name`,d.`version`,d.`format`,d.`file_id`, IFNULL(cs.`status`, 'in_preparation') FROM dataset AS d - LEFT JOIN ({current_status}) AS cs ON d.`did`=cs.`did` + LEFT JOIN ({status_subquery}) AS cs ON d.`did`=cs.`did` WHERE 1=1 {number_instances_filter} {number_features_filter} {number_classes_filter} {number_missing_values_filter} {" ".join(clauses)} From 6b28375746c591392874fd0300ac6bf63577db79 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 23 Mar 2026 12:24:28 +0100 Subject: [PATCH 6/7] Ignore complexity check for now --- src/routers/openml/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index d8d79b1f..75708e2b 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -92,7 +92,7 @@ def _quality_clause(quality: str, range_: str | None) -> str: @router.post(path="/list", description="Provided for convenience, same as `GET` endpoint.") @router.get(path="/list") -async def list_datasets( # noqa: PLR0913 +async def list_datasets( # noqa: PLR0913, C901 pagination: Annotated[Pagination, Body(default_factory=Pagination)], data_name: Annotated[str | None, CasualString128] = None, tag: Annotated[str | None, SystemString64] = None, @@ -180,6 +180,7 @@ async def list_datasets( # noqa: PLR0913 number_classes_filter = _quality_clause("NumberOfClasses", number_classes) number_features_filter = _quality_clause("NumberOfFeatures", number_features) number_missing_values_filter = _quality_clause("NumberOfMissingValues", number_missing_values) + columns = ["did", "name", "version", "format", "file_id", "status"] matching_filter = text( f""" From 03cb1c83aac17c3c34a263f00f74c5244aa0ab1f Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 23 Mar 2026 12:34:10 +0100 Subject: [PATCH 7/7] Fix issue with data_ids binding --- src/routers/openml/datasets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 75708e2b..1b6bc52f 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -4,7 +4,7 @@ from typing import Annotated, Any, Literal, NamedTuple from fastapi import APIRouter, Body, Depends -from sqlalchemy import text +from sqlalchemy import bindparam, text from sqlalchemy.engine import Row from sqlalchemy.ext.asyncio import AsyncConnection @@ -198,6 +198,9 @@ async def list_datasets( # noqa: PLR0913, C901 # of given options, so no injection is possible (I think). The `current_status` # subquery also has no user input. So I think this should be safe. ) + + if data_id: + matching_filter.bindparams(bindparam("data_ids", expanding=True)) result = await expdb_db.execute( matching_filter, parameters=parameters,