From 5cd5516158bd022515434da4c2983219763f8765 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 18 Mar 2026 11:54:34 +0100
Subject: [PATCH 1/7] Simplify status clause
---
src/routers/openml/datasets.py | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 164efd7d..0db65e76 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -115,16 +115,10 @@ async def list_datasets( # noqa: PLR0913
""",
)
- if status == DatasetStatusFilter.ALL:
- statuses = [
- DatasetStatusFilter.ACTIVE,
- DatasetStatusFilter.DEACTIVATED,
- DatasetStatusFilter.IN_PREPARATION,
- ]
- else:
- statuses = [status]
+ clauses = []
+ if status != DatasetStatusFilter.ALL:
+ clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = '{status}' ")
- where_status = ",".join(f"'{status}'" for status in statuses)
if user is None:
visible_to_user = "`visibility`='public'"
elif UserGroup.ADMIN in await user.get_groups():
@@ -182,7 +176,7 @@ def quality_clause(quality: str, range_: str | None) -> str:
WHERE {visible_to_user} {where_name} {where_version} {where_uploader}
{where_data_id} {matching_tag} {number_instances_filter} {number_features_filter}
{number_classes_filter} {number_missing_values_filter}
- AND IFNULL(cs.`status`, 'in_preparation') IN ({where_status})
+ {"".join(clauses)}
LIMIT {pagination.limit} OFFSET {pagination.offset}
""", # noqa: S608
# I am not sure how to do this correctly without an error from Bandit here.
From 78d15fb5bc327434bcffb5c2626831fd2028370b Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 18 Mar 2026 12:14:39 +0100
Subject: [PATCH 2/7] Follow consistent pattern when constructing where clauses
---
src/routers/openml/datasets.py | 65 ++++++++++++++++++----------------
1 file changed, 34 insertions(+), 31 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 0db65e76..6e7939e9 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -116,36 +116,45 @@ async def list_datasets( # noqa: PLR0913
)
clauses = []
+ parameters = {}
if status != DatasetStatusFilter.ALL:
- clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = '{status}' ")
+ clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = :status")
+ parameters["status"] = status
if user is None:
- visible_to_user = "`visibility`='public'"
- elif UserGroup.ADMIN in await user.get_groups():
- visible_to_user = "TRUE"
- else:
- visible_to_user = f"(`visibility`='public' OR `uploader`={user.user_id})"
+ clauses.append("AND `visibility`='public'")
+ elif UserGroup.ADMIN not in await user.get_groups():
+ clauses.append("AND (`visibility`='public' OR `uploader`=:user_id)")
+ parameters["user_id"] = user.user_id
+
+ if uploader:
+ clauses.append("AND `uploader`=:uploader")
+ parameters["uploader"] = uploader
+
+ if data_name:
+ clauses.append("AND `name`=:data_name")
+ parameters["data_name"] = data_name
- where_name = "" if data_name is None else "AND `name`=:data_name"
- where_version = "" if data_version is None else "AND `version`=:data_version"
- where_uploader = "" if uploader is None else "AND `uploader`=:uploader"
- data_id_str = ",".join(str(did) for did in data_id) if data_id else ""
- where_data_id = "" if not data_id else f"AND d.`did` IN ({data_id_str})"
+ if data_version:
+ clauses.append("AND `version`=:data_version")
+ parameters["data_version"] = data_version
+
+ if data_id:
+ clauses.append("AND d.`did` IN :data_ids")
+ parameters["data_ids"] = data_id
# requires some benchmarking on whether e.g., IN () is more efficient.
- matching_tag = (
- text(
+ if tag:
+ clauses.append(
+ """
+ AND d.`did` IN (
+ SELECT `id`
+ FROM dataset_tag as dt
+ WHERE dt.`tag`=:tag
+ )
"""
- AND d.`did` IN (
- SELECT `id`
- FROM dataset_tag as dt
- WHERE dt.`tag`=:tag
- )
- """,
)
- if tag
- else ""
- )
+ parameters["tag"] = tag
def quality_clause(quality: str, range_: str | None) -> str:
if not range_:
@@ -173,10 +182,9 @@ def quality_clause(quality: str, range_: str | None) -> str:
IFNULL(cs.`status`, 'in_preparation')
FROM dataset AS d
LEFT JOIN ({current_status}) AS cs ON d.`did`=cs.`did`
- WHERE {visible_to_user} {where_name} {where_version} {where_uploader}
- {where_data_id} {matching_tag} {number_instances_filter} {number_features_filter}
+ WHERE 1=1 {number_instances_filter} {number_features_filter}
{number_classes_filter} {number_missing_values_filter}
- {"".join(clauses)}
+ {" ".join(clauses)}
LIMIT {pagination.limit} OFFSET {pagination.offset}
""", # noqa: S608
# I am not sure how to do this correctly without an error from Bandit here.
@@ -187,12 +195,7 @@ def quality_clause(quality: str, range_: str | None) -> str:
columns = ["did", "name", "version", "format", "file_id", "status"]
result = await expdb_db.execute(
matching_filter,
- parameters={
- "tag": tag,
- "data_name": data_name,
- "data_version": data_version,
- "uploader": uploader,
- },
+ parameters=parameters,
)
rows = result.all()
datasets: dict[int, dict[str, Any]] = {
From 60ad465a30743554cc9536c34eebc92bebe08020 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 18 Mar 2026 13:58:53 +0100
Subject: [PATCH 3/7] Extract quality subquery creation function
---
src/routers/openml/datasets.py | 49 +++++++++++++++++-----------------
1 file changed, 25 insertions(+), 24 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 6e7939e9..02691952 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -73,6 +73,23 @@ class DatasetStatusFilter(StrEnum):
ALL = "all"
+def _quality_clause(quality: str, range_: str | None) -> str:
+ if not range_:
+ return ""
+ if not (match := re.match(integer_range_regex, range_)):
+ msg = f"`range_` not a valid range: {range_}"
+ raise ValueError(msg)
+ start, end = match.groups()
+ value = f"`value` BETWEEN {start} AND {end[2:]}" if end else f"`value`={start}"
+ return f""" AND
+ d.`did` IN (
+ SELECT `data`
+ FROM data_quality
+ WHERE `quality`='{quality}' AND {value}
+ )
+ """ # noqa: S608 - `quality` is not user provided, value is filtered with regex
+
+
@router.post(path="/list", description="Provided for convenience, same as `GET` endpoint.")
@router.get(path="/list")
async def list_datasets( # noqa: PLR0913
@@ -116,9 +133,9 @@ async def list_datasets( # noqa: PLR0913
)
clauses = []
- parameters = {}
+ parameters: dict[str, Any] = {}
if status != DatasetStatusFilter.ALL:
- clauses.append(f"AND IFNULL(cs.`status`, 'in_preparation') = :status")
+ clauses.append("AND IFNULL(cs.`status`, 'in_preparation') = :status")
parameters["status"] = status
if user is None:
@@ -152,30 +169,15 @@ async def list_datasets( # noqa: PLR0913
FROM dataset_tag as dt
WHERE dt.`tag`=:tag
)
- """
+ """,
)
parameters["tag"] = tag
- def quality_clause(quality: str, range_: str | None) -> str:
- if not range_:
- return ""
- if not (match := re.match(integer_range_regex, range_)):
- msg = f"`range_` not a valid range: {range_}"
- raise ValueError(msg)
- start, end = match.groups()
- value = f"`value` BETWEEN {start} AND {end[2:]}" if end else f"`value`={start}"
- return f""" AND
- d.`did` IN (
- SELECT `data`
- FROM data_quality
- WHERE `quality`='{quality}' AND {value}
- )
- """ # noqa: S608 - `quality` is not user provided, value is filtered with regex
-
- number_instances_filter = quality_clause("NumberOfInstances", number_instances)
- number_classes_filter = quality_clause("NumberOfClasses", number_classes)
- number_features_filter = quality_clause("NumberOfFeatures", number_features)
- number_missing_values_filter = quality_clause("NumberOfMissingValues", number_missing_values)
+ number_instances_filter = _quality_clause("NumberOfInstances", number_instances)
+ number_classes_filter = _quality_clause("NumberOfClasses", number_classes)
+ number_features_filter = _quality_clause("NumberOfFeatures", number_features)
+ number_missing_values_filter = _quality_clause("NumberOfMissingValues", number_missing_values)
+ columns = ["did", "name", "version", "format", "file_id", "status"]
matching_filter = text(
f"""
SELECT d.`did`,d.`name`,d.`version`,d.`format`,d.`file_id`,
@@ -192,7 +194,6 @@ def quality_clause(quality: str, range_: str | None) -> str:
# of given options, so no injection is possible (I think). The `current_status`
# subquery also has no user input. So I think this should be safe.
)
- columns = ["did", "name", "version", "format", "file_id", "status"]
result = await expdb_db.execute(
matching_filter,
parameters=parameters,
From 5dda0263ec055339048db4a28da001ef1d1e5a52 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 18 Mar 2026 15:32:10 +0100
Subject: [PATCH 4/7] Use sqlaclhemy parametrization for pagination
---
src/routers/openml/datasets.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 02691952..31cbbc6c 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -133,7 +133,7 @@ async def list_datasets( # noqa: PLR0913
)
clauses = []
- parameters: dict[str, Any] = {}
+ parameters: dict[str, Any] = {"offset": pagination.offset, "limit": pagination.limit}
if status != DatasetStatusFilter.ALL:
clauses.append("AND IFNULL(cs.`status`, 'in_preparation') = :status")
parameters["status"] = status
@@ -187,7 +187,7 @@ async def list_datasets( # noqa: PLR0913
WHERE 1=1 {number_instances_filter} {number_features_filter}
{number_classes_filter} {number_missing_values_filter}
{" ".join(clauses)}
- LIMIT {pagination.limit} OFFSET {pagination.offset}
+ LIMIT :limit OFFSET :offset
""", # noqa: S608
# I am not sure how to do this correctly without an error from Bandit here.
# However, the `status` input is already checked by FastAPI to be from a set
From 1205f73b487383014c62d7893cb2117833b724c3 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 23 Mar 2026 11:10:31 +0100
Subject: [PATCH 5/7] rename variable for clarity
---
src/routers/openml/datasets.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 31cbbc6c..d8d79b1f 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -120,7 +120,7 @@ async def list_datasets( # noqa: PLR0913
expdb_db: Annotated[AsyncConnection, Depends(expdb_connection)] = None,
) -> list[dict[str, Any]]:
assert expdb_db is not None # noqa: S101
- current_status = text(
+ status_subquery = text(
"""
SELECT ds1.`did`, ds1.`status`
FROM dataset_status as ds1
@@ -133,7 +133,10 @@ async def list_datasets( # noqa: PLR0913
)
clauses = []
- parameters: dict[str, Any] = {"offset": pagination.offset, "limit": pagination.limit}
+ parameters: dict[str, Any] = {
+ "offset": pagination.offset,
+ "limit": pagination.limit,
+ }
if status != DatasetStatusFilter.ALL:
clauses.append("AND IFNULL(cs.`status`, 'in_preparation') = :status")
parameters["status"] = status
@@ -183,7 +186,7 @@ async def list_datasets( # noqa: PLR0913
SELECT d.`did`,d.`name`,d.`version`,d.`format`,d.`file_id`,
IFNULL(cs.`status`, 'in_preparation')
FROM dataset AS d
- LEFT JOIN ({current_status}) AS cs ON d.`did`=cs.`did`
+ LEFT JOIN ({status_subquery}) AS cs ON d.`did`=cs.`did`
WHERE 1=1 {number_instances_filter} {number_features_filter}
{number_classes_filter} {number_missing_values_filter}
{" ".join(clauses)}
From 6b28375746c591392874fd0300ac6bf63577db79 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 23 Mar 2026 12:24:28 +0100
Subject: [PATCH 6/7] Ignore complexity check for now
---
src/routers/openml/datasets.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index d8d79b1f..75708e2b 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -92,7 +92,7 @@ def _quality_clause(quality: str, range_: str | None) -> str:
@router.post(path="/list", description="Provided for convenience, same as `GET` endpoint.")
@router.get(path="/list")
-async def list_datasets( # noqa: PLR0913
+async def list_datasets( # noqa: PLR0913, C901
pagination: Annotated[Pagination, Body(default_factory=Pagination)],
data_name: Annotated[str | None, CasualString128] = None,
tag: Annotated[str | None, SystemString64] = None,
@@ -180,6 +180,7 @@ async def list_datasets( # noqa: PLR0913
number_classes_filter = _quality_clause("NumberOfClasses", number_classes)
number_features_filter = _quality_clause("NumberOfFeatures", number_features)
number_missing_values_filter = _quality_clause("NumberOfMissingValues", number_missing_values)
+
columns = ["did", "name", "version", "format", "file_id", "status"]
matching_filter = text(
f"""
From 03cb1c83aac17c3c34a263f00f74c5244aa0ab1f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 23 Mar 2026 12:34:10 +0100
Subject: [PATCH 7/7] Fix issue with data_ids binding
---
src/routers/openml/datasets.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 75708e2b..1b6bc52f 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -4,7 +4,7 @@
from typing import Annotated, Any, Literal, NamedTuple
from fastapi import APIRouter, Body, Depends
-from sqlalchemy import text
+from sqlalchemy import bindparam, text
from sqlalchemy.engine import Row
from sqlalchemy.ext.asyncio import AsyncConnection
@@ -198,6 +198,9 @@ async def list_datasets( # noqa: PLR0913, C901
# of given options, so no injection is possible (I think). The `current_status`
# subquery also has no user input. So I think this should be safe.
)
+
+ if data_id:
+ matching_filter.bindparams(bindparam("data_ids", expanding=True))
result = await expdb_db.execute(
matching_filter,
parameters=parameters,