From 0159f474c6bbc15f20d52bc946bd252bd852b196 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 30 Dec 2025 09:11:27 +0500 Subject: [PATCH 01/15] set up folder structure and base code --- openml/_api/__init__.py | 8 +++ openml/_api/config.py | 5 ++ openml/_api/http/__init__.py | 1 + openml/_api/http/client.py | 23 ++++++ openml/_api/http/utils.py | 0 openml/_api/resources/__init__.py | 2 + openml/_api/resources/base.py | 22 ++++++ openml/_api/resources/datasets.py | 13 ++++ openml/_api/resources/tasks.py | 113 ++++++++++++++++++++++++++++++ openml/_api/runtime/core.py | 58 +++++++++++++++ openml/_api/runtime/fallback.py | 5 ++ openml/tasks/functions.py | 8 ++- 12 files changed, 255 insertions(+), 3 deletions(-) create mode 100644 openml/_api/__init__.py create mode 100644 openml/_api/config.py create mode 100644 openml/_api/http/__init__.py create mode 100644 openml/_api/http/client.py create mode 100644 openml/_api/http/utils.py create mode 100644 openml/_api/resources/__init__.py create mode 100644 openml/_api/resources/base.py create mode 100644 openml/_api/resources/datasets.py create mode 100644 openml/_api/resources/tasks.py create mode 100644 openml/_api/runtime/core.py create mode 100644 openml/_api/runtime/fallback.py diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py new file mode 100644 index 000000000..5089f94dd --- /dev/null +++ b/openml/_api/__init__.py @@ -0,0 +1,8 @@ +from openml._api.runtime.core import APIContext + + +def set_api_version(version: str, strict=False): + api_context.set_version(version=version, strict=strict) + + +api_context = APIContext() diff --git a/openml/_api/config.py b/openml/_api/config.py new file mode 100644 index 000000000..bd93c3cad --- /dev/null +++ b/openml/_api/config.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +API_V1_SERVER = "https://www.openml.org/api/v1/xml" +API_V2_SERVER = "http://127.0.0.1:8001" +API_KEY = "..." 
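For orientation, a minimal sketch of how the entry point added in openml/_api/__init__.py is meant to be driven. Only set_api_version and api_context come from this patch; the exact fallback behaviour of strict=False is still an assumption at this stage of the series.

    import openml._api

    # Route subsequent resource calls through the experimental v2 backend.
    # With strict=False a v1 fallback may still be consulted; with
    # strict=True only v2 resources are used.
    openml._api.set_api_version("v2", strict=False)

    # The module-level context exposes whichever backend is currently active.
    backend = openml._api.api_context.backend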
diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py new file mode 100644 index 000000000..fde2a5b0a --- /dev/null +++ b/openml/_api/http/__init__.py @@ -0,0 +1 @@ +from openml._api.http.client import HTTPClient diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py new file mode 100644 index 000000000..81a9213e3 --- /dev/null +++ b/openml/_api/http/client.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import requests + +from openml.__version__ import __version__ + + +class HTTPClient: + def __init__(self, base_url: str): + self.base_url = base_url + self.headers = {"user-agent": f"openml-python/{__version__}"} + + def get(self, path, params=None): + url = f"{self.base_url}/{path}" + return requests.get(url, params=params, headers=self.headers) + + def post(self, path, data=None, files=None): + url = f"{self.base_url}/{path}" + return requests.post(url, data=data, files=files, headers=self.headers) + + def delete(self, path, params=None): + url = f"{self.base_url}/{path}" + return requests.delete(url, params=params, headers=self.headers) diff --git a/openml/_api/http/utils.py b/openml/_api/http/utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py new file mode 100644 index 000000000..078fc5998 --- /dev/null +++ b/openml/_api/resources/__init__.py @@ -0,0 +1,2 @@ +from openml._api.resources.datasets import DatasetsV1, DatasetsV2 +from openml._api.resources.tasks import TasksV1, TasksV2 diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py new file mode 100644 index 000000000..1fae27665 --- /dev/null +++ b/openml/_api/resources/base.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from openml._api.http import HTTPClient + + +class ResourceAPI: + def __init__(self, http: HTTPClient): + self._http = http + + +class DatasetsAPI(ResourceAPI, ABC): + @abstractmethod + def get(self, id: int) -> dict: ... + + +class TasksAPI(ResourceAPI, ABC): + @abstractmethod + def get(self, id: int) -> dict: ... diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py new file mode 100644 index 000000000..cd1bb595a --- /dev/null +++ b/openml/_api/resources/datasets.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from openml._api.resources.base import DatasetsAPI + + +class DatasetsV1(DatasetsAPI): + def get(self, id): + pass + + +class DatasetsV2(DatasetsAPI): + def get(self, id): + pass diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py new file mode 100644 index 000000000..b0e9afbf8 --- /dev/null +++ b/openml/_api/resources/tasks.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import xmltodict + +from openml._api.resources.base import TasksAPI +from openml.tasks.task import ( + OpenMLClassificationTask, + OpenMLClusteringTask, + OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLTask, + TaskType, +) + + +class TasksV1(TasksAPI): + def get(self, id, return_response=False): + path = f"task/{id}" + response = self._http.get(path) + xml_content = response.content + task = self._create_task_from_xml(xml_content) + + if return_response: + return task, response + + return task + + def _create_task_from_xml(self, xml: str) -> OpenMLTask: + """Create a task given a xml string. + + Parameters + ---------- + xml : string + Task xml representation. 
+ + Returns + ------- + OpenMLTask + """ + dic = xmltodict.parse(xml)["oml:task"] + estimation_parameters = {} + inputs = {} + # Due to the unordered structure we obtain, we first have to extract + # the possible keys of oml:input; dic["oml:input"] is a list of + # OrderedDicts + + # Check if there is a list of inputs + if isinstance(dic["oml:input"], list): + for input_ in dic["oml:input"]: + name = input_["@name"] + inputs[name] = input_ + # Single input case + elif isinstance(dic["oml:input"], dict): + name = dic["oml:input"]["@name"] + inputs[name] = dic["oml:input"] + + evaluation_measures = None + if "evaluation_measures" in inputs: + evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][ + "oml:evaluation_measure" + ] + + task_type = TaskType(int(dic["oml:task_type_id"])) + common_kwargs = { + "task_id": dic["oml:task_id"], + "task_type": dic["oml:task_type"], + "task_type_id": task_type, + "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"], + "evaluation_measure": evaluation_measures, + } + # TODO: add OpenMLClusteringTask? + if task_type in ( + TaskType.SUPERVISED_CLASSIFICATION, + TaskType.SUPERVISED_REGRESSION, + TaskType.LEARNING_CURVE, + ): + # Convert some more parameters + for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][ + "oml:parameter" + ]: + name = parameter["@name"] + text = parameter.get("#text", "") + estimation_parameters[name] = text + + common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][ + "oml:estimation_procedure" + ]["oml:type"] + common_kwargs["estimation_procedure_id"] = int( + inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"] + ) + + common_kwargs["estimation_parameters"] = estimation_parameters + common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"][ + "oml:target_feature" + ] + common_kwargs["data_splits_url"] = inputs["estimation_procedure"][ + "oml:estimation_procedure" + ]["oml:data_splits_url"] + + cls = { + TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask, + TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask, + TaskType.CLUSTERING: OpenMLClusteringTask, + TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, + }.get(task_type) + if cls is None: + raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.") + return cls(**common_kwargs) # type: ignore + + +class TasksV2(TasksAPI): + def get(self, id): + pass diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py new file mode 100644 index 000000000..80f35587c --- /dev/null +++ b/openml/_api/runtime/core.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from openml._api.config import ( + API_V1_SERVER, + API_V2_SERVER, +) +from openml._api.http.client import HTTPClient +from openml._api.resources import ( + DatasetsV1, + DatasetsV2, + TasksV1, + TasksV2, +) +from openml._api.runtime.fallback import FallbackProxy + + +class APIBackend: + def __init__(self, *, datasets, tasks): + self.datasets = datasets + self.tasks = tasks + + +def build_backend(version: str, strict: bool) -> APIBackend: + v1_http = HTTPClient(API_V1_SERVER) + v2_http = HTTPClient(API_V2_SERVER) + + v1 = APIBackend( + datasets=DatasetsV1(v1_http), + tasks=TasksV1(v1_http), + ) + + if version == "v1": + return v1 + + v2 = APIBackend( + datasets=DatasetsV2(v2_http), + tasks=TasksV2(v2_http), + ) + + if strict: + return v2 + + return APIBackend( + datasets=FallbackProxy(v2.datasets, v1.datasets), + tasks=FallbackProxy(v2.tasks, v1.tasks), + ) + + +class 
APIContext: + def __init__(self): + self._backend = build_backend("v1", strict=False) + + def set_version(self, version: str, strict: bool = False): + self._backend = build_backend(version, strict) + + @property + def backend(self): + return self._backend diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py new file mode 100644 index 000000000..56e96a966 --- /dev/null +++ b/openml/_api/runtime/fallback.py @@ -0,0 +1,5 @@ +from __future__ import annotations + + +class FallbackProxy: + pass diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index d2bf5e946..91be65965 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -12,6 +12,7 @@ import openml._api_calls import openml.utils +from openml._api import api_context from openml.datasets import get_dataset from openml.exceptions import OpenMLCacheException @@ -442,11 +443,12 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") + task, response = api_context.backend.tasks.get(task_id, return_response=True) with xml_file.open("w", encoding="utf8") as fh: - fh.write(task_xml) - return _create_task_from_xml(task_xml) + fh.write(response.text) + + return task def _create_task_from_xml(xml: str) -> OpenMLTask: From 52ef37999fad8509e5e85b8512e442bd9dc69e04 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 5 Jan 2026 12:48:58 +0500 Subject: [PATCH 02/15] fix pre-commit --- openml/_api/__init__.py | 2 +- openml/_api/http/__init__.py | 2 ++ openml/_api/http/client.py | 32 +++++++++++++++++++++++-------- openml/_api/resources/__init__.py | 2 ++ openml/_api/resources/base.py | 13 +++++++++++-- openml/_api/resources/datasets.py | 15 +++++++++++---- openml/_api/resources/tasks.py | 25 +++++++++++++++++++----- openml/_api/runtime/__init__.py | 0 openml/_api/runtime/core.py | 23 +++++++++++----------- openml/_api/runtime/fallback.py | 9 ++++++++- openml/tasks/functions.py | 12 ++++++++---- 11 files changed, 99 insertions(+), 36 deletions(-) create mode 100644 openml/_api/runtime/__init__.py diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py index 5089f94dd..881f40671 100644 --- a/openml/_api/__init__.py +++ b/openml/_api/__init__.py @@ -1,7 +1,7 @@ from openml._api.runtime.core import APIContext -def set_api_version(version: str, strict=False): +def set_api_version(version: str, *, strict: bool = False) -> None: api_context.set_version(version=version, strict=strict) diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py index fde2a5b0a..8e6d1e4ce 100644 --- a/openml/_api/http/__init__.py +++ b/openml/_api/http/__init__.py @@ -1 +1,3 @@ from openml._api.http.client import HTTPClient + +__all__ = ["HTTPClient"] diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index 81a9213e3..dea5de809 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,23 +1,39 @@ from __future__ import annotations +from typing import Any, Mapping + import requests +from requests import Response from openml.__version__ import __version__ class HTTPClient: - def __init__(self, base_url: str): + def __init__(self, base_url: str) -> None: self.base_url = base_url - self.headers = {"user-agent": f"openml-python/{__version__}"} + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} - def get(self, path, 
params=None): + def get( + self, + path: str, + params: Mapping[str, Any] | None = None, + ) -> Response: url = f"{self.base_url}/{path}" - return requests.get(url, params=params, headers=self.headers) + return requests.get(url, params=params, headers=self.headers, timeout=10) - def post(self, path, data=None, files=None): + def post( + self, + path: str, + data: Mapping[str, Any] | None = None, + files: Any = None, + ) -> Response: url = f"{self.base_url}/{path}" - return requests.post(url, data=data, files=files, headers=self.headers) + return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) - def delete(self, path, params=None): + def delete( + self, + path: str, + params: Mapping[str, Any] | None = None, + ) -> Response: url = f"{self.base_url}/{path}" - return requests.delete(url, params=params, headers=self.headers) + return requests.delete(url, params=params, headers=self.headers, timeout=10) diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py index 078fc5998..b1af3c1a8 100644 --- a/openml/_api/resources/__init__.py +++ b/openml/_api/resources/__init__.py @@ -1,2 +1,4 @@ from openml._api.resources.datasets import DatasetsV1, DatasetsV2 from openml._api.resources.tasks import TasksV1, TasksV2 + +__all__ = ["DatasetsV1", "DatasetsV2", "TasksV1", "TasksV2"] diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 1fae27665..6fbf8977d 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -4,7 +4,11 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from requests import Response + from openml._api.http import HTTPClient + from openml.datasets.dataset import OpenMLDataset + from openml.tasks.task import OpenMLTask class ResourceAPI: @@ -14,9 +18,14 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): @abstractmethod - def get(self, id: int) -> dict: ... + def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... class TasksAPI(ResourceAPI, ABC): @abstractmethod - def get(self, id: int) -> dict: ... + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: ... 
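The abstract bases above are the extension point for further OpenML resources. As a sketch only (none of the names below exist in this series), a hypothetical flows resource would follow the same pattern: declare an interface next to DatasetsAPI and TasksAPI, then implement it per protocol version against the injected HTTPClient.

    from abc import ABC, abstractmethod

    import xmltodict

    from openml._api.resources.base import ResourceAPI


    class FlowsAPI(ResourceAPI, ABC):
        @abstractmethod
        def get(self, flow_id: int) -> dict: ...


    class FlowsV1(FlowsAPI):
        def get(self, flow_id: int) -> dict:
            # self._http is the HTTPClient injected by ResourceAPI.__init__;
            # the v1 server answers in XML, so parse it into a dict here.
            response = self._http.get(f"flow/{flow_id}")
            return xmltodict.parse(response.text)["oml:flow"]

A matching FlowsV2 plus a flows attribute on APIBackend in runtime/core.py would complete the wiring.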
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index cd1bb595a..9ff1ec278 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,13 +1,20 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from openml._api.resources.base import DatasetsAPI +if TYPE_CHECKING: + from responses import Response + + from openml.datasets.dataset import OpenMLDataset + class DatasetsV1(DatasetsAPI): - def get(self, id): - pass + def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + raise NotImplementedError class DatasetsV2(DatasetsAPI): - def get(self, id): - pass + def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + raise NotImplementedError diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py index b0e9afbf8..f494fb9a3 100644 --- a/openml/_api/resources/tasks.py +++ b/openml/_api/resources/tasks.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import xmltodict from openml._api.resources.base import TasksAPI @@ -12,12 +14,20 @@ TaskType, ) +if TYPE_CHECKING: + from requests import Response + class TasksV1(TasksAPI): - def get(self, id, return_response=False): - path = f"task/{id}" + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: + path = f"task/{task_id}" response = self._http.get(path) - xml_content = response.content + xml_content = response.text task = self._create_task_from_xml(xml_content) if return_response: @@ -109,5 +119,10 @@ def _create_task_from_xml(self, xml: str) -> OpenMLTask: class TasksV2(TasksAPI): - def get(self, id): - pass + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: + raise NotImplementedError diff --git a/openml/_api/runtime/__init__.py b/openml/_api/runtime/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py index 80f35587c..aa09a69db 100644 --- a/openml/_api/runtime/core.py +++ b/openml/_api/runtime/core.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from openml._api.config import ( API_V1_SERVER, API_V2_SERVER, @@ -11,16 +13,18 @@ TasksV1, TasksV2, ) -from openml._api.runtime.fallback import FallbackProxy + +if TYPE_CHECKING: + from openml._api.resources.base import DatasetsAPI, TasksAPI class APIBackend: - def __init__(self, *, datasets, tasks): + def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI): self.datasets = datasets self.tasks = tasks -def build_backend(version: str, strict: bool) -> APIBackend: +def build_backend(version: str, *, strict: bool) -> APIBackend: v1_http = HTTPClient(API_V1_SERVER) v2_http = HTTPClient(API_V2_SERVER) @@ -40,19 +44,16 @@ def build_backend(version: str, strict: bool) -> APIBackend: if strict: return v2 - return APIBackend( - datasets=FallbackProxy(v2.datasets, v1.datasets), - tasks=FallbackProxy(v2.tasks, v1.tasks), - ) + return v1 class APIContext: - def __init__(self): + def __init__(self) -> None: self._backend = build_backend("v1", strict=False) - def set_version(self, version: str, strict: bool = False): - self._backend = build_backend(version, strict) + def set_version(self, version: str, *, strict: bool = False) -> None: + self._backend = build_backend(version=version, strict=strict) @property - def backend(self): + def backend(self) -> 
APIBackend: return self._backend diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py index 56e96a966..1bc99d270 100644 --- a/openml/_api/runtime/fallback.py +++ b/openml/_api/runtime/fallback.py @@ -1,5 +1,12 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from openml._api.resources.base import ResourceAPI + class FallbackProxy: - pass + def __init__(self, primary: ResourceAPI, fallback: ResourceAPI): + self._primary = primary + self._fallback = fallback diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index ef67f75bf..a794ad56d 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -445,10 +445,14 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - task, response = api_context.backend.tasks.get(task_id, return_response=True) - - with xml_file.open("w", encoding="utf8") as fh: - fh.write(response.text) + result = api_context.backend.tasks.get(task_id, return_response=True) + + if isinstance(result, tuple): + task, response = result + with xml_file.open("w", encoding="utf8") as fh: + fh.write(response.text) + else: + task = result return task From f7ba710a9a3c457ec7c48ec45fa174c9194eeb98 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Tue, 6 Jan 2026 16:24:35 +0530 Subject: [PATCH 03/15] Merge base migration pr, ruff --- openml/_api/http/client.py | 5 +- openml/_api/resources/base.py | 70 ++++- openml/_api/resources/datasets.py | 440 +++++++++++++++++++++++++++++- 3 files changed, 504 insertions(+), 11 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index dea5de809..b0d3c911f 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -25,10 +25,13 @@ def post( self, path: str, data: Mapping[str, Any] | None = None, + json: dict | None = None, files: Any = None, ) -> Response: url = f"{self.base_url}/{path}" - return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) + return requests.post( + url, data=data, json=json, files=files, headers=self.headers, timeout=10 + ) def delete( self, diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 6fbf8977d..9d480b06a 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + import pandas as pd from requests import Response from openml._api.http import HTTPClient @@ -18,7 +19,74 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): @abstractmethod - def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + def get( + self, dataset_id: int, *, return_response: bool + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + + @abstractmethod + def list( # noqa: PLR0913 + self, + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, + ) -> pd.DataFrame: ... 
+ + def _name_to_id( + self, + dataset_name: str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT001, FBT002 + ) -> int: + """Attempt to find the dataset id of the dataset with the given name. + + If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``, + then return the least recent still active dataset. + + Raises an error if no dataset with the name is found. + Raises an error if a version is specified but it could not be found. + + Parameters + ---------- + dataset_name : str + The name of the dataset for which to find its id. + version : int, optional + Version to retrieve. If not specified, the oldest active version is returned. + error_if_multiple : bool (default=False) + If `False`, if multiple datasets match, return the least recent active dataset. + If `True`, if multiple datasets match, raise an error. + download_qualities : bool, optional (default=True) + If `True`, also download qualities.xml file. If False it skip the qualities.xml. + + Returns + ------- + int + The id of the dataset. + """ + status = None if version is not None else "active" + candidates = self.list( + data_name=dataset_name, + status=status, + data_version=version, + ) + if error_if_multiple and len(candidates) > 1: + msg = f"Multiple active datasets exist with name '{dataset_name}'." + raise ValueError(msg) + + if candidates.empty: + no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" + and_version = f" and version '{version}'." if version is not None else "." + raise RuntimeError(no_dataset_for_name + and_version) + + # Dataset ids are chronological so we can just sort based on ids (instead of version) + return candidates["did"].min() # type: ignore class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 9ff1ec278..f985cd75a 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,20 +1,442 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from openml._api.resources.base import DatasetsAPI +from functools import partial +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: - from responses import Response + from requests import Response - from openml.datasets.dataset import OpenMLDataset +import pandas as pd +import xmltodict + +import openml.utils +from openml._api.resources.base import DatasetsAPI +from openml.datasets.dataset import OpenMLDataset class DatasetsV1(DatasetsAPI): - def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - raise NotImplementedError + def get( + self, dataset_id: int, *, return_response: bool = False + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + path = f"data/{dataset_id}" + response = self._http.get(path) + xml_content = response.text # .text returns str, .content returns bytes + dataset = self._create_dataset_from_xml(xml_content) + + if return_response: + return dataset, response + + return dataset + + def list( # noqa: PLR0913 + self, + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, + ) -> pd.DataFrame: + """Return a dataframe of all dataset which are on OpenML. 
+ + Supports large amount of results. + + Parameters + ---------- + data_id : list, optional + A list of data ids, to specify which datasets should be + listed + offset : int, optional + The number of datasets to skip, starting from the first. + size : int, optional + The maximum number of datasets to show. + status : str, optional + Should be {active, in_preparation, deactivated}. By + default active datasets are returned, but also datasets + from another status can be requested. + tag : str, optional + data_name : str, optional + data_version : int, optional + number_instances : int | str, optional + number_features : int | str, optional + number_classes : int | str, optional + number_missing_values : int | str, optional + + Returns + ------- + datasets: dataframe + Each row maps to a dataset + Each column contains the following information: + - dataset id + - name + - format + - status + If qualities are calculated for the dataset, some of + these are also included as columns. + """ + listing_call = partial( + self._list_datasets, + data_id=data_id, + status=status, + tag=tag, + data_name=data_name, + data_version=data_version, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, + ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) + + def _list_datasets( + self, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. 
+ + Returns + ------- + datasets : dataframe + """ + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + api_call += f"/{operator}/{value}" + if data_id is not None: + api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" + return self.__list_datasets(api_call=api_call) + + def __list_datasets(self, api_call: str) -> pd.DataFrame: + xml_string = self._http.get(api_call).text + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: + """Create a dataset given a xml string. + + Parameters + ---------- + xml : string + Dataset xml representation. 
+ + Returns + ------- + OpenMLDataset + """ + description = xmltodict.parse(xml)["oml:data_set_description"] + + # TODO file path after download, cache_format default = 'pickle' + arff_file = None + features_file = None + parquet_file = None + qualities_file = None + + return OpenMLDataset( + description["oml:name"], + description.get("oml:description"), + data_format=description["oml:format"], + dataset_id=int(description["oml:id"]), + version=int(description["oml:version"]), + creator=description.get("oml:creator"), + contributor=description.get("oml:contributor"), + collection_date=description.get("oml:collection_date"), + upload_date=description.get("oml:upload_date"), + language=description.get("oml:language"), + licence=description.get("oml:licence"), + url=description["oml:url"], + default_target_attribute=description.get("oml:default_target_attribute"), + row_id_attribute=description.get("oml:row_id_attribute"), + ignore_attribute=description.get("oml:ignore_attribute"), + version_label=description.get("oml:version_label"), + citation=description.get("oml:citation"), + tag=description.get("oml:tag"), + visibility=description.get("oml:visibility"), + original_data_url=description.get("oml:original_data_url"), + paper_url=description.get("oml:paper_url"), + update_comment=description.get("oml:update_comment"), + md5_checksum=description.get("oml:md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=description.get("oml:parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) class DatasetsV2(DatasetsAPI): - def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - raise NotImplementedError + def get( + self, dataset_id: int, *, return_response: bool = False + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + path = f"datasets/{dataset_id}" + response = self._http.get(path) + json_content = response.json() + dataset = self._create_dataset_from_json(json_content) + + if return_response: + return dataset, response + + return dataset + + def list( # noqa: PLR0913 + self, + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, + ) -> pd.DataFrame: + """Return a dataframe of all dataset which are on OpenML. + + Supports large amount of results. + + Parameters + ---------- + data_id : list, optional + A list of data ids, to specify which datasets should be + listed + offset : int, optional + The number of datasets to skip, starting from the first. + size : int, optional + The maximum number of datasets to show. + status : str, optional + Should be {active, in_preparation, deactivated}. By + default active datasets are returned, but also datasets + from another status can be requested. 
+ tag : str, optional + data_name : str, optional + data_version : int, optional + number_instances : int | str, optional + number_features : int | str, optional + number_classes : int | str, optional + number_missing_values : int | str, optional + + Returns + ------- + datasets: dataframe + Each row maps to a dataset + Each column contains the following information: + - dataset id + - name + - format + - status + If qualities are calculated for the dataset, some of + these are also included as columns. + """ + listing_call = partial( + self._list_datasets, + data_id=data_id, + status=status, + tag=tag, + data_name=data_name, + data_version=data_version, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, + ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) + + def _list_datasets( + self, + limit: int, + offset: int, + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values, data_id. + + Returns + ------- + datasets : dataframe + """ + json: dict[str, Any] = {"pagination": {}} + + if limit is not None: + json["pagination"]["limit"] = limit + if offset is not None: + json["pagination"]["offset"] = offset + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + json[operator] = value + + return self.__list_datasets(json=json) + + def __list_datasets(self, json: dict) -> pd.DataFrame: + api_call = "datasets/list" + datasets_list = self._http.post(api_call, json=json).json() + + # Minimalistic check if the JSON is useful + assert isinstance(datasets_list, list), type(datasets_list) + + datasets = {} + for dataset_ in datasets_list: + ignore_attribute = ["file_id", "quality"] + dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("quality", []): + try: + dataset[quality["name"]] = int(quality["text"]) + except ValueError: + dataset[quality["name"]] = float(quality["text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: + """Create a dataset given a json. + + Parameters + ---------- + json_content : dict + Dataset dict/json representation. 
+ + Returns + ------- + OpenMLDataset + """ + # TODO file path after download, cache_format default = 'pickle' + arff_file = None + features_file = None + parquet_file = None + qualities_file = None + + return OpenMLDataset( + json_content["name"], + json_content.get("description"), + data_format=json_content["format"], + dataset_id=int(json_content["id"]), + version=int(json_content["version"]), + creator=json_content.get("creator"), + contributor=json_content.get("contributor"), + collection_date=json_content.get("collection_date"), + upload_date=json_content.get("upload_date"), + language=json_content.get("language"), + licence=json_content.get("licence"), + url=json_content["url"], + default_target_attribute=json_content.get("default_target_attribute"), + row_id_attribute=json_content.get("row_id_attribute"), + ignore_attribute=json_content.get("ignore_attribute"), + version_label=json_content.get("version_label"), + citation=json_content.get("citation"), + tag=json_content.get("tag"), + visibility=json_content.get("visibility"), + original_data_url=json_content.get("original_data_url"), + paper_url=json_content.get("paper_url"), + update_comment=json_content.get("update_comment"), + md5_checksum=json_content.get("md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=json_content.get("parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) From 5dfcbce55a027d19cd502ea7bb3d521c2b1bca29 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 7 Jan 2026 22:14:31 +0500 Subject: [PATCH 04/15] refactor --- openml/_api/config.py | 62 +++++++++++++++++++++++++++++++++++-- openml/_api/http/client.py | 18 +++++++---- openml/_api/runtime/core.py | 9 ++---- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/openml/_api/config.py b/openml/_api/config.py index bd93c3cad..1431f66b1 100644 --- a/openml/_api/config.py +++ b/openml/_api/config.py @@ -1,5 +1,61 @@ from __future__ import annotations -API_V1_SERVER = "https://www.openml.org/api/v1/xml" -API_V2_SERVER = "http://127.0.0.1:8001" -API_KEY = "..." 
+from dataclasses import dataclass +from typing import Literal + +DelayMethod = Literal["human", "robot"] + + +@dataclass +class APIConfig: + server: str + base_url: str + key: str + + +@dataclass +class APISettings: + v1: APIConfig + v2: APIConfig + + +@dataclass +class ConnectionConfig: + retries: int = 3 + delay_method: DelayMethod = "human" + delay_time: int = 1 # seconds + + def __post_init__(self) -> None: + if self.delay_method not in ("human", "robot"): + raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}") + + +@dataclass +class CacheConfig: + dir: str = "~/.openml/cache" + ttl: int = 60 * 60 * 24 * 7 # one week + + +@dataclass +class Settings: + api: APISettings + connection: ConnectionConfig + cache: CacheConfig + + +settings = Settings( + api=APISettings( + v1=APIConfig( + server="https://www.openml.org/", + base_url="api/v1/xml/", + key="...", + ), + v2=APIConfig( + server="http://127.0.0.1:8001/", + base_url="", + key="...", + ), + ), + connection=ConnectionConfig(), + cache=CacheConfig(), +) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index dea5de809..74e08c709 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,24 +1,30 @@ from __future__ import annotations -from typing import Any, Mapping +from typing import TYPE_CHECKING, Any, Mapping import requests from requests import Response from openml.__version__ import __version__ +if TYPE_CHECKING: + from openml._api.config import APIConfig + class HTTPClient: - def __init__(self, base_url: str) -> None: - self.base_url = base_url + def __init__(self, config: APIConfig) -> None: + self.config = config self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + def _create_url(self, path: str) -> str: + return self.config.server + self.config.base_url + path + def get( self, path: str, params: Mapping[str, Any] | None = None, ) -> Response: - url = f"{self.base_url}/{path}" + url = self._create_url(path) return requests.get(url, params=params, headers=self.headers, timeout=10) def post( @@ -27,7 +33,7 @@ def post( data: Mapping[str, Any] | None = None, files: Any = None, ) -> Response: - url = f"{self.base_url}/{path}" + url = self._create_url(path) return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) def delete( @@ -35,5 +41,5 @@ def delete( path: str, params: Mapping[str, Any] | None = None, ) -> Response: - url = f"{self.base_url}/{path}" + url = self._create_url(path) return requests.delete(url, params=params, headers=self.headers, timeout=10) diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py index aa09a69db..98b587411 100644 --- a/openml/_api/runtime/core.py +++ b/openml/_api/runtime/core.py @@ -2,10 +2,7 @@ from typing import TYPE_CHECKING -from openml._api.config import ( - API_V1_SERVER, - API_V2_SERVER, -) +from openml._api.config import settings from openml._api.http.client import HTTPClient from openml._api.resources import ( DatasetsV1, @@ -25,8 +22,8 @@ def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI): def build_backend(version: str, *, strict: bool) -> APIBackend: - v1_http = HTTPClient(API_V1_SERVER) - v2_http = HTTPClient(API_V2_SERVER) + v1_http = HTTPClient(config=settings.api.v1) + v2_http = HTTPClient(config=settings.api.v2) v1 = APIBackend( datasets=DatasetsV1(v1_http), From 2acbe9992cf95bfc103ff4fa0c360a58c1842870 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 7 Jan 2026 22:24:03 +0500 Subject: [PATCH 05/15] implement cache_dir --- 
openml/_api/http/client.py | 74 +++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index 74e08c709..49b05c88e 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,36 +1,93 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any +from urllib.parse import urlencode, urljoin, urlparse import requests from requests import Response from openml.__version__ import __version__ +from openml._api.config import settings if TYPE_CHECKING: from openml._api.config import APIConfig -class HTTPClient: +class CacheMixin: + @property + def dir(self) -> str: + return settings.cache.dir + + @property + def ttl(self) -> int: + return settings.cache.ttl + + def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path: + parsed_url = urlparse(url) + netloc_parts = parsed_url.netloc.split(".")[::-1] # reverse domain + path_parts = parsed_url.path.strip("/").split("/") + + # remove api_key and serialize params if any + filtered_params = {k: v for k, v in params.items() if k != "api_key"} + params_part = [urlencode(filtered_params)] if filtered_params else [] + + return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part) + + def _get_cache_response(self, url: str, params: dict[str, Any]) -> Response | None: # noqa: ARG002 + return None + + def _set_cache_response(self, url: str, params: dict[str, Any], response: Response) -> None: # noqa: ARG002 + return None + + +class HTTPClient(CacheMixin): def __init__(self, config: APIConfig) -> None: self.config = config self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} - def _create_url(self, path: str) -> str: - return self.config.server + self.config.base_url + path + @property + def server(self) -> str: + return self.config.server + + @property + def base_url(self) -> str: + return self.config.base_url + + def _create_url(self, path: str) -> Any: + return urljoin(self.server, urljoin(self.base_url, path)) def get( self, path: str, - params: Mapping[str, Any] | None = None, + *, + params: dict[str, Any] | None = None, + use_cache: bool = False, + use_api_key: bool = False, ) -> Response: url = self._create_url(path) - return requests.get(url, params=params, headers=self.headers, timeout=10) + params = dict(params) if params is not None else {} + + if use_api_key: + params["api_key"] = self.config.key + + if use_cache: + response = self._get_cache_response(url, params) + if response: + return response + + response = requests.get(url, params=params, headers=self.headers, timeout=10) + + if use_cache: + self._set_cache_response(url, params, response) + + return response def post( self, path: str, - data: Mapping[str, Any] | None = None, + *, + data: dict[str, Any] | None = None, files: Any = None, ) -> Response: url = self._create_url(path) @@ -39,7 +96,8 @@ def post( def delete( self, path: str, - params: Mapping[str, Any] | None = None, + *, + params: dict[str, Any] | None = None, ) -> Response: url = self._create_url(path) return requests.delete(url, params=params, headers=self.headers, timeout=10) From af99880a9e16a49833c63084c9e9267c112b6b91 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 7 Jan 2026 23:42:17 +0500 Subject: [PATCH 06/15] refactor --- openml/_api/config.py | 1 + openml/_api/http/client.py | 100 +++++++++++++++++++++++++++---------- 2 files changed, 75 insertions(+), 26 
deletions(-) diff --git a/openml/_api/config.py b/openml/_api/config.py index 1431f66b1..848fe8da1 100644 --- a/openml/_api/config.py +++ b/openml/_api/config.py @@ -11,6 +11,7 @@ class APIConfig: server: str base_url: str key: str + timeout: int = 10 # seconds @dataclass diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index 49b05c88e..a90e93933 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -23,7 +23,7 @@ def dir(self) -> str: def ttl(self) -> int: return settings.cache.ttl - def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path: + def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path: parsed_url = urlparse(url) netloc_parts = parsed_url.netloc.split(".")[::-1] # reverse domain path_parts = parsed_url.path.strip("/").split("/") @@ -34,10 +34,10 @@ def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path: return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part) - def _get_cache_response(self, url: str, params: dict[str, Any]) -> Response | None: # noqa: ARG002 - return None + def _get_cache_response(self, cache_dir: Path) -> Response: # noqa: ARG002 + return Response() - def _set_cache_response(self, url: str, params: dict[str, Any], response: Response) -> None: # noqa: ARG002 + def _set_cache_response(self, cache_dir: Path, response: Response) -> None: # noqa: ARG002 return None @@ -54,50 +54,98 @@ def server(self) -> str: def base_url(self) -> str: return self.config.base_url - def _create_url(self, path: str) -> Any: - return urljoin(self.server, urljoin(self.base_url, path)) + @property + def key(self) -> str: + return self.config.key - def get( + @property + def timeout(self) -> int: + return self.config.timeout + + def request( self, + method: str, path: str, *, - params: dict[str, Any] | None = None, use_cache: bool = False, use_api_key: bool = False, + **request_kwargs: Any, ) -> Response: - url = self._create_url(path) - params = dict(params) if params is not None else {} + url = urljoin(self.server, urljoin(self.base_url, path)) + params = request_kwargs.pop("params", {}) + params = params.copy() if use_api_key: - params["api_key"] = self.config.key + params["api_key"] = self.key - if use_cache: - response = self._get_cache_response(url, params) - if response: - return response + headers = request_kwargs.pop("headers", {}) + headers = headers.copy() + headers.update(self.headers) + + timeout = request_kwargs.pop("timeout", self.timeout) + cache_dir = self._get_cache_dir(url, params) - response = requests.get(url, params=params, headers=self.headers, timeout=10) + if use_cache: + try: + return self._get_cache_response(cache_dir) + # TODO: handle ttl expired error + except Exception: + raise + + response = requests.request( + method=method, + url=url, + params=params, + headers=headers, + timeout=timeout, + **request_kwargs, + ) if use_cache: - self._set_cache_response(url, params, response) + self._set_cache_response(cache_dir, response) return response - def post( + def get( self, path: str, *, - data: dict[str, Any] | None = None, - files: Any = None, + use_cache: bool = False, + use_api_key: bool = False, + **request_kwargs: Any, ) -> Response: - url = self._create_url(path) - return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) + # TODO: remove override when cache is implemented + use_cache = False + return self.request( + method="GET", + path=path, + use_cache=use_cache, + use_api_key=use_api_key, + **request_kwargs, + ) + + def 
post( + self, + path: str, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="POST", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) def delete( self, path: str, - *, - params: dict[str, Any] | None = None, + **request_kwargs: Any, ) -> Response: - url = self._create_url(path) - return requests.delete(url, params=params, headers=self.headers, timeout=10) + return self.request( + method="DELETE", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) From 8964517d5fa9b656dc1473adfc09e9a56c524073 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Thu, 8 Jan 2026 09:14:49 +0530 Subject: [PATCH 07/15] edit, fork, delete updated --- openml/_api/resources/base.py | 36 +++- openml/_api/resources/datasets.py | 278 +++++++++++++++++++++++++++++- 2 files changed, 304 insertions(+), 10 deletions(-) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 9d480b06a..5a74239d1 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING +from typing_extensions import Literal if TYPE_CHECKING: import pandas as pd @@ -20,13 +21,18 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): @abstractmethod def get( - self, dataset_id: int, *, return_response: bool + self, + dataset_id: int | str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + *, + return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... @abstractmethod def list( # noqa: PLR0913 self, - data_id: list[int] | None = None, + data_id: list[int] | None = None, # type: ignore offset: int | None = None, size: int | None = None, status: str | None = None, @@ -39,6 +45,32 @@ def list( # noqa: PLR0913 number_missing_values: int | str | None = None, ) -> pd.DataFrame: ... + @abstractmethod + def delete(self, dataset_id: int) -> bool: ... + + @abstractmethod + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: ... + + @abstractmethod + def fork(self, data_id: int) -> int: ... + + @abstractmethod + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... 
+ def _name_to_id( self, dataset_name: str, diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index f985cd75a..5414fba43 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,7 +1,9 @@ from __future__ import annotations +from collections import OrderedDict from functools import partial from typing import TYPE_CHECKING, Any +from typing_extensions import Literal if TYPE_CHECKING: from requests import Response @@ -16,11 +18,23 @@ class DatasetsV1(DatasetsAPI): def get( - self, dataset_id: int, *, return_response: bool = False + self, + dataset_id: int | str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + *, + return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - path = f"data/{dataset_id}" + if isinstance(dataset_id, int): + resolved_id = dataset_id + elif dataset_id.isdigit(): + resolved_id = int(dataset_id) + else: + resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) + + path = f"data/{resolved_id}" response = self._http.get(path) - xml_content = response.text # .text returns str, .content returns bytes + xml_content = response.text dataset = self._create_dataset_from_xml(xml_content) if return_response: @@ -97,6 +111,194 @@ def list( # noqa: PLR0913 return pd.concat(batches) + def delete(self, dataset_id: int) -> bool: + """Delete dataset with id `dataset_id` from the OpenML server. + + This can only be done if you are the owner of the dataset and + no tasks are attached to the dataset. + + Parameters + ---------- + dataset_id : int + OpenML id of the dataset + + Returns + ------- + bool + True if the deletion was successful. False otherwise. + """ + return openml.utils._delete_entity("data", dataset_id) + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + """Edits an OpenMLDataset. + + In addition to providing the dataset id of the dataset to edit (through data_id), + you must specify a value for at least one of the optional function arguments, + i.e. one value for a field to edit. + + This function allows editing of both non-critical and critical fields. + Critical fields are default_target_attribute, ignore_attribute, row_id_attribute. + + - Editing non-critical data fields is allowed for all authenticated users. + - Editing critical fields is allowed only for the owner, provided there are no tasks + associated with this dataset. + + If dataset has tasks or if the user is not the owner, the only way + to edit critical fields is to use fork_dataset followed by edit_dataset. + + Parameters + ---------- + data_id : int + ID of the dataset. + description : str + Description of the dataset. + creator : str + The person who created the dataset. + contributor : str + People who contributed to the current version of the dataset. + collection_date : str + The date the data was originally collected, given by the uploader. + language : str + Language in which the data is represented. + Starts with 1 upper case letter, rest lower case, e.g. 'English'. 
+ default_target_attribute : str + The default target attribute, if it exists. + Can have multiple values, comma separated. + ignore_attribute : str | list + Attributes that should be excluded in modelling, + such as identifiers and indexes. + citation : str + Reference(s) that should be cited when building on this data. + row_id_attribute : str, optional + The attribute that represents the row-id column, if present in the + dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not + specified, the index of the dataframe will be used as the + ``row_id_attribute``. If the name of the index is ``None``, it will + be discarded. + + .. versionadded: 0.8 + Inference of ``row_id_attribute`` from a dataframe. + original_data_url : str, optional + For derived data, the url to the original dataset. + paper_url : str, optional + Link to a paper describing the dataset. + + Returns + ------- + Dataset id + """ + if not isinstance(data_id, int): + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + + # compose data edit parameters as xml + form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE + xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + xml["oml:data_edit_parameters"] = OrderedDict() + xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" + xml["oml:data_edit_parameters"]["oml:description"] = description + xml["oml:data_edit_parameters"]["oml:creator"] = creator + xml["oml:data_edit_parameters"]["oml:contributor"] = contributor + xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date + xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute + xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute + xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute + xml["oml:data_edit_parameters"]["oml:citation"] = citation + xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url + xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url + + # delete None inputs + for k in list(xml["oml:data_edit_parameters"]): + if not xml["oml:data_edit_parameters"][k]: + del xml["oml:data_edit_parameters"][k] + + file_elements = { + "edit_parameters": ("description.xml", xmltodict.unparse(xml)), + } # type: openml._api_calls.FILE_ELEMENTS_TYPE + result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text + result = xmltodict.parse(result_xml) + data_id = result["oml:data_edit"]["oml:id"] + return int(data_id) + + def fork(self, data_id: int) -> int: + """ + Creates a new dataset version, with the authenticated user as the new owner. + The forked dataset can have distinct dataset meta-data, + but the actual data itself is shared with the original version. + + This API is intended for use when a user is unable to edit the critical fields of a dataset + through the edit_dataset API. + (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.) + + Specifically, this happens when the user is: + 1. Not the owner of the dataset. + 2. User is the owner of the dataset, but the dataset has tasks. + + In these two cases the only way to edit critical fields is: + 1. STEP 1: Fork the dataset using fork_dataset API + 2. STEP 2: Call edit_dataset API on the forked version. 
+ + + Parameters + ---------- + data_id : int + id of the dataset to be forked + + Returns + ------- + Dataset id of the forked dataset + + """ + if not isinstance(data_id, int): + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + # compose data fork parameters + form_data = {"data_id": data_id} + result_xml = self._http.post("data/fork", data=form_data).text + result = xmltodict.parse(result_xml) + data_id = result["oml:data_fork"]["oml:id"] + return int(data_id) + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. + Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} + result_xml = self._http.post("data/status/update", data=data).text + result = xmltodict.parse(result_xml) + server_data_id = result["oml:data_status_update"]["oml:id"] + server_status = result["oml:data_status_update"]["oml:status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + def _list_datasets( self, limit: int, @@ -236,9 +438,21 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: class DatasetsV2(DatasetsAPI): def get( - self, dataset_id: int, *, return_response: bool = False + self, + dataset_id: int | str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + *, + return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - path = f"datasets/{dataset_id}" + if isinstance(dataset_id, int): + resolved_id = dataset_id + elif dataset_id.isdigit(): + resolved_id = int(dataset_id) + else: + resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) + + path = f"data/{resolved_id}" response = self._http.get(path) json_content = response.json() dataset = self._create_dataset_from_json(json_content) @@ -317,6 +531,55 @@ def list( # noqa: PLR0913 return pd.concat(batches) + def delete(self, dataset_id: int) -> bool: + raise NotImplementedError() + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + raise NotImplementedError() + + def fork(self, data_id: int) -> int: + raise NotImplementedError() + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. 
+ Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} + result = self._http.post("datasets/status/update", json=data).json() + server_data_id = result["dataset_id"] + server_status = result["status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + def _list_datasets( self, limit: int, @@ -365,7 +628,6 @@ def _list_datasets( def __list_datasets(self, json: dict) -> pd.DataFrame: api_call = "datasets/list" datasets_list = self._http.post(api_call, json=json).json() - # Minimalistic check if the JSON is useful assert isinstance(datasets_list, list), type(datasets_list) @@ -379,9 +641,9 @@ def __list_datasets(self, json: dict) -> pd.DataFrame: # The number of qualities can range from 0 to infinity for quality in dataset_.get("quality", []): try: - dataset[quality["name"]] = int(quality["text"]) + dataset[quality["name"]] = int(quality["value"]) except ValueError: - dataset[quality["name"]] = float(quality["text"]) + dataset[quality["name"]] = float(quality["value"]) datasets[dataset["did"]] = dataset return pd.DataFrame.from_dict(datasets, orient="index").astype( From 1c2fa9996aa0024af93ab1819877836b6ab803f2 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Thu, 8 Jan 2026 15:57:09 +0530 Subject: [PATCH 08/15] Added features, updated list --- openml/_api/resources/base.py | 76 +---- openml/_api/resources/datasets.py | 494 +++++++++++++----------------- 2 files changed, 234 insertions(+), 336 deletions(-) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 5a74239d1..990dda998 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from typing_extensions import Literal if TYPE_CHECKING: @@ -23,26 +23,18 @@ class DatasetsAPI(ResourceAPI, ABC): def get( self, dataset_id: int | str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT002, FBT001 *, return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... @abstractmethod - def list( # noqa: PLR0913 + def list( self, + limit: int, + offset: int, + *, data_id: list[int] | None = None, # type: ignore - offset: int | None = None, - size: int | None = None, - status: str | None = None, - tag: str | None = None, - data_name: str | None = None, - data_version: int | None = None, - number_instances: int | str | None = None, - number_features: int | str | None = None, - number_classes: int | str | None = None, - number_missing_values: int | str | None = None, + **kwargs: Any, ) -> pd.DataFrame: ... @abstractmethod @@ -71,54 +63,14 @@ def fork(self, data_id: int) -> int: ... @abstractmethod def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... 
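Note on the page-oriented `list(limit, offset, ...)` signature introduced above: it is meant to be driven by a batching caller, the way `list_datasets` later in this series feeds `openml.utils._list_all` with a `partial` over the backend's `list`. The following is only an illustrative sketch of that pattern; the helper name `list_all_datasets` and the assumption that an empty page marks the end of the listing are not part of the patch.

import pandas as pd

from openml._api import api_context


def list_all_datasets(page_size: int = 1000, **filters) -> pd.DataFrame:
    # Hypothetical batching driver: fetch one page at a time from whichever
    # backend (v1 XML or v2 JSON) the api_context currently selects.
    datasets = api_context.backend.datasets
    batches, offset = [], 0
    while True:
        batch = datasets.list(limit=page_size, offset=offset, **filters)
        if batch.empty:
            # Assumption for this sketch: an empty frame means no more pages.
            break
        batches.append(batch)
        offset += page_size
    return pd.concat(batches) if batches else pd.DataFrame()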
- def _name_to_id( - self, - dataset_name: str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT001, FBT002 - ) -> int: - """Attempt to find the dataset id of the dataset with the given name. - - If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``, - then return the least recent still active dataset. - - Raises an error if no dataset with the name is found. - Raises an error if a version is specified but it could not be found. - - Parameters - ---------- - dataset_name : str - The name of the dataset for which to find its id. - version : int, optional - Version to retrieve. If not specified, the oldest active version is returned. - error_if_multiple : bool (default=False) - If `False`, if multiple datasets match, return the least recent active dataset. - If `True`, if multiple datasets match, raise an error. - download_qualities : bool, optional (default=True) - If `True`, also download qualities.xml file. If False it skip the qualities.xml. - - Returns - ------- - int - The id of the dataset. - """ - status = None if version is not None else "active" - candidates = self.list( - data_name=dataset_name, - status=status, - data_version=version, - ) - if error_if_multiple and len(candidates) > 1: - msg = f"Multiple active datasets exist with name '{dataset_name}'." - raise ValueError(msg) - - if candidates.empty: - no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" - and_version = f" and version '{version}'." if version is not None else "." - raise RuntimeError(no_dataset_for_name + and_version) - - # Dataset ids are chronological so we can just sort based on ids (instead of version) - return candidates["did"].min() # type: ignore + @abstractmethod + def list_qualities(self) -> list[str]: ... # type: ignore + + @abstractmethod + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + + @abstractmethod + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... 
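A brief usage sketch of the intent behind these new abstract methods (illustrative only; the dataset id and ontology URL below are made up): callers obtain whichever concrete resource the context selected and program against the shared surface, so call sites do not branch on the server version.

from openml._api import api_context

datasets = api_context.backend.datasets        # resolves to DatasetsV1 or DatasetsV2
qualities = datasets.list_qualities()          # same call regardless of backend
datasets.feature_add_ontology(42, 0, "http://example.org/ontology")  # hypothetical id/URL;
# at this point in the series the v2 backend raises NotImplementedError for this call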
class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 5414fba43..845212b20 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections import OrderedDict -from functools import partial from typing import TYPE_CHECKING, Any from typing_extensions import Literal @@ -20,19 +19,10 @@ class DatasetsV1(DatasetsAPI): def get( self, dataset_id: int | str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT002, FBT001 *, return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - if isinstance(dataset_id, int): - resolved_id = dataset_id - elif dataset_id.isdigit(): - resolved_id = int(dataset_id) - else: - resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) - - path = f"data/{resolved_id}" + path = f"data/{dataset_id}" response = self._http.get(path) xml_content = response.text dataset = self._create_dataset_from_xml(xml_content) @@ -42,74 +32,88 @@ def get( return dataset - def list( # noqa: PLR0913 + def list( self, - data_id: list[int] | None = None, - offset: int | None = None, - size: int | None = None, - status: str | None = None, - tag: str | None = None, - data_name: str | None = None, - data_version: int | None = None, - number_instances: int | str | None = None, - number_features: int | str | None = None, - number_classes: int | str | None = None, - number_missing_values: int | str | None = None, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, ) -> pd.DataFrame: - """Return a dataframe of all dataset which are on OpenML. - - Supports large amount of results. + """ + Perform api call to return a list of all datasets. Parameters ---------- - data_id : list, optional - A list of data ids, to specify which datasets should be - listed - offset : int, optional - The number of datasets to skip, starting from the first. - size : int, optional + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int The maximum number of datasets to show. - status : str, optional - Should be {active, in_preparation, deactivated}. By - default active datasets are returned, but also datasets - from another status can be requested. - tag : str, optional - data_name : str, optional - data_version : int, optional - number_instances : int | str, optional - number_features : int | str, optional - number_classes : int | str, optional - number_missing_values : int | str, optional + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. Returns ------- - datasets: dataframe - Each row maps to a dataset - Each column contains the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also included as columns. 
+ datasets : dataframe """ - listing_call = partial( - self._list_datasets, - data_id=data_id, - status=status, - tag=tag, - data_name=data_name, - data_version=data_version, - number_instances=number_instances, - number_features=number_features, - number_classes=number_classes, - number_missing_values=number_missing_values, + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + api_call += f"/{operator}/{value}" + if data_id is not None: + api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" + + xml_string = self._http.get(api_call).text + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], ) - batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - if len(batches) == 0: - return pd.DataFrame() + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset - return pd.concat(batches) + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) def delete(self, dataset_id: int) -> bool: """Delete dataset with id `dataset_id` from the OpenML server. @@ -299,90 +303,27 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) # This should never happen raise ValueError("Data id/status does not collide") - def _list_datasets( - self, - limit: int, - offset: int, - *, - data_id: list[int] | None = None, # type: ignore - **kwargs: Any, - ) -> pd.DataFrame: - """ - Perform api call to return a list of all datasets. - - Parameters - ---------- - The arguments that are lists are separated from the single value - ones which are put into the kwargs. - display_errors is also separated from the kwargs since it has a - default value. + def list_qualities(self) -> list[str]: # type: ignore + """Return list of data qualities available. - limit : int - The maximum number of datasets to show. - offset : int - The number of datasets to skip, starting from the first. - data_id : list, optional - - kwargs : dict, optional - Legal filter operators (keys in the dict): - tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. 
Returns ------- - datasets : dataframe + list """ - api_call = "data/list" - - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - if data_id is not None: - api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - return self.__list_datasets(api_call=api_call) - - def __list_datasets(self, api_call: str) -> pd.DataFrame: + api_call = "data/qualities/list" xml_string = self._http.get(api_call).text - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - + qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) + if "oml:data_qualities_list" not in qualities: + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset + if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): + raise TypeError('Error in return XML, does not contain "oml:quality" as a list') - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) + return qualities["oml:data_qualities_list"]["oml:quality"] def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: """Create a dataset given a xml string. @@ -435,24 +376,74 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: parquet_file=str(parquet_file) if parquet_file is not None else None, ) + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + """ + An ontology describes the concept that are described in a feature. An + ontology is defined by an URL where the information is provided. Adds + an ontology (URL) to a given dataset feature (defined by a dataset id + and index). The dataset has to exists on OpenML and needs to have been + processed by the evaluation engine. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 
256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": data_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/add", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + """ + Removes an existing ontology (URL) from a given dataset feature (defined + by a dataset id and index). The dataset has to exists on OpenML and needs + to have been processed by the evaluation engine. Ontology needs to be + attached to the specific fearure. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": data_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/remove", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + class DatasetsV2(DatasetsAPI): def get( self, dataset_id: int | str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT002, FBT001 *, return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - if isinstance(dataset_id, int): - resolved_id = dataset_id - elif dataset_id.isdigit(): - resolved_id = int(dataset_id) - else: - resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) - - path = f"data/{resolved_id}" + path = f"data/{dataset_id}" response = self._http.get(path) json_content = response.json() dataset = self._create_dataset_from_json(json_content) @@ -462,125 +453,7 @@ def get( return dataset - def list( # noqa: PLR0913 - self, - data_id: list[int] | None = None, - offset: int | None = None, - size: int | None = None, - status: str | None = None, - tag: str | None = None, - data_name: str | None = None, - data_version: int | None = None, - number_instances: int | str | None = None, - number_features: int | str | None = None, - number_classes: int | str | None = None, - number_missing_values: int | str | None = None, - ) -> pd.DataFrame: - """Return a dataframe of all dataset which are on OpenML. - - Supports large amount of results. - - Parameters - ---------- - data_id : list, optional - A list of data ids, to specify which datasets should be - listed - offset : int, optional - The number of datasets to skip, starting from the first. - size : int, optional - The maximum number of datasets to show. - status : str, optional - Should be {active, in_preparation, deactivated}. By - default active datasets are returned, but also datasets - from another status can be requested. - tag : str, optional - data_name : str, optional - data_version : int, optional - number_instances : int | str, optional - number_features : int | str, optional - number_classes : int | str, optional - number_missing_values : int | str, optional - - Returns - ------- - datasets: dataframe - Each row maps to a dataset - Each column contains the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also included as columns. 
- """ - listing_call = partial( - self._list_datasets, - data_id=data_id, - status=status, - tag=tag, - data_name=data_name, - data_version=data_version, - number_instances=number_instances, - number_features=number_features, - number_classes=number_classes, - number_missing_values=number_missing_values, - ) - batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - if len(batches) == 0: - return pd.DataFrame() - - return pd.concat(batches) - - def delete(self, dataset_id: int) -> bool: - raise NotImplementedError() - - def edit( # noqa: PLR0913 - self, - data_id: int, - description: str | None = None, - creator: str | None = None, - contributor: str | None = None, - collection_date: str | None = None, - language: str | None = None, - default_target_attribute: str | None = None, - ignore_attribute: str | list[str] | None = None, # type: ignore - citation: str | None = None, - row_id_attribute: str | None = None, - original_data_url: str | None = None, - paper_url: str | None = None, - ) -> int: - raise NotImplementedError() - - def fork(self, data_id: int) -> int: - raise NotImplementedError() - - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: - """ - Updates the status of a dataset to either 'active' or 'deactivated'. - Please see the OpenML API documentation for a description of the status - and all legal status transitions: - https://docs.openml.org/concepts/data/#dataset-status - - Parameters - ---------- - data_id : int - The data id of the dataset - status : str, - 'active' or 'deactivated' - """ - legal_status = {"active", "deactivated"} - if status not in legal_status: - raise ValueError(f"Illegal status value. Legal values: {legal_status}") - - data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} - result = self._http.post("datasets/status/update", json=data).json() - server_data_id = result["dataset_id"] - server_status = result["status"] - if status != server_status or int(data_id) != int(server_data_id): - # This should never happen - raise ValueError("Data id/status does not collide") - - def _list_datasets( + def list( self, limit: int, offset: int, @@ -623,9 +496,6 @@ def _list_datasets( if value is not None: json[operator] = value - return self.__list_datasets(json=json) - - def __list_datasets(self, json: dict) -> pd.DataFrame: api_call = "datasets/list" datasets_list = self._http.post(api_call, json=json).json() # Minimalistic check if the JSON is useful @@ -654,6 +524,76 @@ def __list_datasets(self, json: dict) -> pd.DataFrame: } ) + def delete(self, dataset_id: int) -> bool: + raise NotImplementedError() + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + raise NotImplementedError() + + def fork(self, data_id: int) -> int: + raise NotImplementedError() + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. 
+ Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} + result = self._http.post("datasets/status/update", json=data).json() + server_data_id = result["dataset_id"] + server_status = result["status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + + def list_qualities(self) -> list[str]: # type: ignore + """Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. + + Returns + ------- + list + """ + api_call = "datasets/qualities/list" + qualities = self._http.get(api_call).json() + # Minimalistic check if the XML is useful + if "data_qualities_list" not in qualities: + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') + + if not isinstance(qualities["data_qualities_list"]["quality"], list): + raise TypeError('Error in return json, does not contain "quality" as a list') + + return qualities["data_qualities_list"]["quality"] + def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: """Create a dataset given a json. @@ -702,3 +642,9 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: parquet_url=json_content.get("parquet_url"), parquet_file=str(parquet_file) if parquet_file is not None else None, ) + + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + raise NotImplementedError() + + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + raise NotImplementedError() From 9bcbcb32c232bb35b34e90ad7739de6c938ee5f3 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Fri, 9 Jan 2026 13:01:34 +0530 Subject: [PATCH 09/15] Refactor functions, except get --- openml/datasets/functions.py | 181 ++++------------------------------- 1 file changed, 21 insertions(+), 160 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index ac5466a44..23cdefdd2 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -5,7 +5,6 @@ import logging import os import warnings -from collections import OrderedDict from functools import partial from pathlib import Path from pyexpat import ExpatError @@ -22,6 +21,7 @@ import openml._api_calls import openml.utils +from openml._api import api_context from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( OpenMLHashException, @@ -65,17 +65,7 @@ def list_qualities() -> list[str]: ------- list """ - api_call = "data/qualities/list" - xml_string = openml._api_calls._perform_api_call(api_call, "get") - qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) - # Minimalistic check if the XML is useful - if "oml:data_qualities_list" not in qualities: - raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') - - if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): - raise TypeError('Error in return XML, does not contain "oml:quality" 
as a list') - - return qualities["oml:data_qualities_list"]["oml:quality"] + return api_context.backend.datasets.list_qualities() def list_datasets( @@ -129,7 +119,7 @@ def list_datasets( these are also included as columns. """ listing_call = partial( - _list_datasets, + api_context.backend.datasets.list, data_id=data_id, status=status, tag=tag, @@ -147,92 +137,6 @@ def list_datasets( return pd.concat(batches) -def _list_datasets( - limit: int, - offset: int, - *, - data_id: list[int] | None = None, - **kwargs: Any, -) -> pd.DataFrame: - """ - Perform api call to return a list of all datasets. - - Parameters - ---------- - The arguments that are lists are separated from the single value - ones which are put into the kwargs. - display_errors is also separated from the kwargs since it has a - default value. - - limit : int - The maximum number of datasets to show. - offset : int - The number of datasets to skip, starting from the first. - data_id : list, optional - - kwargs : dict, optional - Legal filter operators (keys in the dict): - tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. - - Returns - ------- - datasets : dataframe - """ - api_call = "data/list" - - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - if data_id is not None: - api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - return __list_datasets(api_call=api_call) - - -def __list_datasets(api_call: str) -> pd.DataFrame: - xml_string = openml._api_calls._perform_api_call(api_call, "get") - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - - # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) - - def _expand_parameter(parameter: str | list[str] | None) -> list[str]: expanded_parameter = [] if isinstance(parameter, str): @@ -808,14 +712,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} - result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data) - result = xmltodict.parse(result_xml) - server_data_id = result["oml:data_status_update"]["oml:id"] - server_status = result["oml:data_status_update"]["oml:status"] - if status != server_status or int(data_id) != int(server_data_id): - # This should never happen - raise ValueError("Data id/status does not collide") + api_context.backend.datasets.status_update(data_id=data_id, status=status) def edit_dataset( @@ -889,43 +786,20 @@ def edit_dataset( ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - - # compose data edit parameters as xml - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE - xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' - xml["oml:data_edit_parameters"] = OrderedDict() - xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" - xml["oml:data_edit_parameters"]["oml:description"] = description - xml["oml:data_edit_parameters"]["oml:creator"] = creator - xml["oml:data_edit_parameters"]["oml:contributor"] = contributor - xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date - xml["oml:data_edit_parameters"]["oml:language"] = language - xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute - xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute - xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute - xml["oml:data_edit_parameters"]["oml:citation"] = citation - xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url - xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url - - # delete None inputs - for k in list(xml["oml:data_edit_parameters"]): - if not xml["oml:data_edit_parameters"][k]: - del xml["oml:data_edit_parameters"][k] - - file_elements = { - "edit_parameters": ("description.xml", xmltodict.unparse(xml)), - } # type: openml._api_calls.FILE_ELEMENTS_TYPE - result_xml = openml._api_calls._perform_api_call( - "data/edit", - "post", - data=form_data, - file_elements=file_elements, + return api_context.backend.datasets.edit( + data_id, + description, + creator, + contributor, + collection_date, + language, + default_target_attribute, + ignore_attribute, + citation, + row_id_attribute, + original_data_url, + paper_url, ) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_edit"]["oml:id"] - return int(data_id) def fork_dataset(data_id: int) -> int: @@ -957,14 +831,7 @@ def fork_dataset(data_id: int) -> int: Dataset id of the forked dataset """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - # compose data fork parameters - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_fork"]["oml:id"] - return int(data_id) + return api_context.backend.datasets.fork(data_id=data_id) def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: @@ -988,10 +855,7 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: ------- True or throws an OpenML server exception """ - upload_data: dict[str, int | str] = {"data_id": data_id, "index": 
index, "ontology": ontology} - openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data) - # an error will be thrown in case the request was unsuccessful - return True + return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology) def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool: @@ -1014,10 +878,7 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo ------- True or throws an OpenML server exception """ - upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} - openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data) - # an error will be thrown in case the request was unsuccessful - return True + return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology) def _topic_add_dataset(data_id: int, topic: str) -> int: @@ -1460,4 +1321,4 @@ def delete_dataset(dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. """ - return openml.utils._delete_entity("data", dataset_id) + return api_context.backend.datasets.delete(dataset_id) From 96df5e30b46ea80633cb9593ceacf36ff10c8308 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Fri, 9 Jan 2026 15:38:07 +0530 Subject: [PATCH 10/15] Remove circular import using lazy import --- openml/datasets/functions.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 23cdefdd2..6ede42ea9 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -21,7 +21,6 @@ import openml._api_calls import openml.utils -from openml._api import api_context from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( OpenMLHashException, @@ -65,6 +64,8 @@ def list_qualities() -> list[str]: ------- list """ + from openml._api import api_context + return api_context.backend.datasets.list_qualities() @@ -118,6 +119,8 @@ def list_datasets( If qualities are calculated for the dataset, some of these are also included as columns. """ + from openml._api import api_context + listing_call = partial( api_context.backend.datasets.list, data_id=data_id, @@ -708,6 +711,8 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non status : str, 'active' or 'deactivated' """ + from openml._api import api_context + legal_status = {"active", "deactivated"} if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") @@ -786,6 +791,8 @@ def edit_dataset( ------- Dataset id """ + from openml._api import api_context + return api_context.backend.datasets.edit( data_id, description, @@ -831,6 +838,8 @@ def fork_dataset(data_id: int) -> int: Dataset id of the forked dataset """ + from openml._api import api_context + return api_context.backend.datasets.fork(data_id=data_id) @@ -855,6 +864,8 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: ------- True or throws an OpenML server exception """ + from openml._api import api_context + return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology) @@ -878,6 +889,8 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo ------- True or throws an OpenML server exception """ + from openml._api import api_context + return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology) @@ -1321,4 +1334,6 @@ def delete_dataset(dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. """ + from openml._api import api_context + return api_context.backend.datasets.delete(dataset_id) From c955f43c7f2cbb86fdd759b179b6c2dcfcf8b7e5 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Wed, 14 Jan 2026 17:47:23 +0530 Subject: [PATCH 11/15] Modify reviews, feature and qualities --- openml/_api/resources/base.py | 27 +- openml/_api/resources/datasets.py | 437 +++++++++++++++++++++++------- openml/datasets/dataset.py | 77 +----- openml/datasets/functions.py | 6 +- 4 files changed, 376 insertions(+), 171 deletions(-) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 990dda998..3030ce6ff 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +from pathlib import Path from typing import TYPE_CHECKING, Any from typing_extensions import Literal @@ -9,7 +10,7 @@ from requests import Response from openml._api.http import HTTPClient - from openml.datasets.dataset import OpenMLDataset + from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset from openml.tasks.task import OpenMLTask @@ -58,19 +59,35 @@ def edit( # noqa: PLR0913 ) -> int: ... @abstractmethod - def fork(self, data_id: int) -> int: ... + def fork(self, dataset_id: int) -> int: ... @abstractmethod - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: ... @abstractmethod def list_qualities(self) -> list[str]: ... # type: ignore @abstractmethod - def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... @abstractmethod - def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... + + @abstractmethod + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: ... + + @abstractmethod + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: ... + + @abstractmethod + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: ... + + @abstractmethod + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[str, float] | None: ... 
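The split between the `get_*()` and `parse_*_file()` methods added here appears intended to let an already-downloaded features/qualities description be re-parsed offline, with a live API call only when nothing is cached. A minimal sketch of that flow, assuming a hypothetical cache layout: the helper name `load_features`, the directory argument, and the file names are not part of the patch.

from pathlib import Path

from openml._api import api_context


def load_features(dataset_id: int, cache_dir: Path):
    # Return the parsed feature dict from a cached description if present,
    # otherwise let the backend fetch and parse it (XML for v1, JSON for v2).
    datasets = api_context.backend.datasets
    features_file = cache_dir / "features.xml"    # a .json file for the v2 backend
    features_pickle = cache_dir / "features.pkl"
    if features_file.exists():
        # Re-parse the downloaded file and refresh the pickled copy.
        return datasets.parse_features_file(features_file, features_pickle)
    return datasets.get_features(dataset_id)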
class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 845212b20..58883f626 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,18 +1,30 @@ from __future__ import annotations +import json +import logging +import pickle from collections import OrderedDict +from pathlib import Path from typing import TYPE_CHECKING, Any from typing_extensions import Literal +from openml._api.resources.base import DatasetsAPI +from openml.datasets.data_feature import OpenMLDataFeature +from openml.datasets.dataset import OpenMLDataset +from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerError, OpenMLServerException + if TYPE_CHECKING: from requests import Response + import openml + import pandas as pd import xmltodict -import openml.utils -from openml._api.resources.base import DatasetsAPI -from openml.datasets.dataset import OpenMLDataset +logger = logging.getLogger(__name__) + + +NO_ACCESS_GRANTED_ERRCODE = 112 class DatasetsV1(DatasetsAPI): @@ -26,7 +38,6 @@ def get( response = self._http.get(path) xml_content = response.text dataset = self._create_dataset_from_xml(xml_content) - if return_response: return dataset, response @@ -78,42 +89,8 @@ def list( api_call += f"/{operator}/{value}" if data_id is not None: api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - xml_string = self._http.get(api_call).text - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - - # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) + return self._parse_list_xml(xml_string) def delete(self, dataset_id: int) -> bool: """Delete dataset with id `dataset_id` from the OpenML server. @@ -131,11 +108,45 @@ def delete(self, dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. 
""" - return openml.utils._delete_entity("data", dataset_id) + url_suffix = f"data/{dataset_id}" + try: + result_xml = self._http.delete(url_suffix) + result = xmltodict.parse(result_xml) + return "oml:data_delete" in result + except OpenMLServerException as e: + # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php + # Most exceptions are descriptive enough to be raised as their standard + # OpenMLServerException, however there are two cases where we add information: + # - a generic "failed" message, we direct them to the right issue board + # - when the user successfully authenticates with the server, + # but user is not allowed to take the requested action, + # in which case we specify a OpenMLNotAuthorizedError. + by_other_user = [323, 353, 393, 453, 594] + has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] + unknown_reason = [325, 355, 394, 455, 593] + if e.code in by_other_user: + raise OpenMLNotAuthorizedError( + message=("The data can not be deleted because it was not uploaded by you."), + ) from e + if e.code in has_dependent_entities: + raise OpenMLNotAuthorizedError( + message=( + f"The data can not be deleted because " + f"it still has associated entities: {e.message}" + ), + ) from e + if e.code in unknown_reason: + raise OpenMLServerError( + message=( + "The data can not be deleted for unknown reason," + " please open an issue at: https://github.com/openml/openml/issues/new" + ), + ) from e + raise e def edit( # noqa: PLR0913 self, - data_id: int, + dataset_id: int, description: str | None = None, creator: str | None = None, contributor: str | None = None, @@ -150,7 +161,7 @@ def edit( # noqa: PLR0913 ) -> int: """Edits an OpenMLDataset. - In addition to providing the dataset id of the dataset to edit (through data_id), + In addition to providing the dataset id of the dataset to edit (through dataset_id), you must specify a value for at least one of the optional function arguments, i.e. one value for a field to edit. @@ -166,7 +177,7 @@ def edit( # noqa: PLR0913 Parameters ---------- - data_id : int + dataset_id : int ID of the dataset. description : str Description of the dataset. @@ -205,11 +216,11 @@ def edit( # noqa: PLR0913 ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + if not isinstance(dataset_id, int): + raise TypeError(f"`dataset_id` must be of type `int`, not {type(dataset_id)}.") # compose data edit parameters as xml - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE + form_data = {"data_id": dataset_id} # type: openml._api_calls.DATA_TYPE xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' xml["oml:data_edit_parameters"] = OrderedDict() xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" @@ -235,10 +246,10 @@ def edit( # noqa: PLR0913 } # type: openml._api_calls.FILE_ELEMENTS_TYPE result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text result = xmltodict.parse(result_xml) - data_id = result["oml:data_edit"]["oml:id"] - return int(data_id) + dataset_id = result["oml:data_edit"]["oml:id"] + return int(dataset_id) - def fork(self, data_id: int) -> int: + def fork(self, dataset_id: int) -> int: """ Creates a new dataset version, with the authenticated user as the new owner. 
The forked dataset can have distinct dataset meta-data, @@ -259,7 +270,7 @@ def fork(self, data_id: int) -> int: Parameters ---------- - data_id : int + dataset_id : int id of the dataset to be forked Returns @@ -267,16 +278,16 @@ def fork(self, data_id: int) -> int: Dataset id of the forked dataset """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + if not isinstance(dataset_id, int): + raise TypeError(f"`dataset_id` must be of type `int`, not {type(dataset_id)}.") # compose data fork parameters - form_data = {"data_id": data_id} + form_data = {"data_id": dataset_id} result_xml = self._http.post("data/fork", data=form_data).text result = xmltodict.parse(result_xml) - data_id = result["oml:data_fork"]["oml:id"] - return int(data_id) + dataset_id = result["oml:data_fork"]["oml:id"] + return int(dataset_id) - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: """ Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status @@ -285,7 +296,7 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) Parameters ---------- - data_id : int + dataset_id : int The data id of the dataset status : str, 'active' or 'deactivated' @@ -294,12 +305,12 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) if status not in legal_status: raise ValueError(f"Illegal status value. Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} + data: openml._api_calls.DATA_TYPE = {"data_id": dataset_id, "status": status} result_xml = self._http.post("data/status/update", data=data).text result = xmltodict.parse(result_xml) server_data_id = result["oml:data_status_update"]["oml:id"] server_status = result["oml:data_status_update"]["oml:status"] - if status != server_status or int(data_id) != int(server_data_id): + if status != server_status or int(dataset_id) != int(server_data_id): # This should never happen raise ValueError("Data id/status does not collide") @@ -376,7 +387,7 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: parquet_file=str(parquet_file) if parquet_file is not None else None, ) - def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: """ An ontology describes the concept that are described in a feature. An ontology is defined by an URL where the information is provided. 
Adds @@ -386,7 +397,7 @@ def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: Parameters ---------- - data_id : int + dataset_id : int id of the dataset to which the feature belongs index : int index of the feature in dataset (0-based) @@ -398,7 +409,7 @@ def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: True or throws an OpenML server exception """ upload_data: dict[str, int | str] = { - "data_id": data_id, + "data_id": dataset_id, "index": index, "ontology": ontology, } @@ -406,7 +417,7 @@ def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: # an error will be thrown in case the request was unsuccessful return True - def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: """ Removes an existing ontology (URL) from a given dataset feature (defined by a dataset id and index). The dataset has to exists on OpenML and needs @@ -415,7 +426,7 @@ def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bo Parameters ---------- - data_id : int + dataset_id : int id of the dataset to which the feature belongs index : int index of the feature in dataset (0-based) @@ -427,7 +438,7 @@ def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bo True or throws an OpenML server exception """ upload_data: dict[str, int | str] = { - "data_id": data_id, + "data_id": dataset_id, "index": index, "ontology": ontology, } @@ -435,6 +446,133 @@ def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bo # an error will be thrown in case the request was unsuccessful return True + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: + path = f"data/features/{dataset_id}" + xml = self._http.get(path, use_cache=True).text + + return self._parse_features_xml(xml) + + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: + path = f"data/qualities/{dataset_id!s}" + try: + self._http.get(path, use_cache=True).text + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + # quality file stays as None + logger.warning(f"No qualities found for dataset {dataset_id}") + return None + + raise e + + return self._parse_qualities_xml() + + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: + if features_file.suffix != ".xml": + # TODO (Shrivaths) can only parse xml warn/ raise exception + raise NotImplementedError() + + with Path(features_file).open("r", encoding="utf8") as fh: + features_xml = fh.read() + + features = self._parse_features_xml(features_xml) + + with features_pickle_file.open("wb") as fh_binary: + pickle.dump(features, fh_binary) + + return features + + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: + if qualities_file.suffix != ".xml": + # TODO (Shrivaths) can only parse xml warn/ raise exception + raise NotImplementedError() + + with Path(qualities_file).open("r", encoding="utf8") as fh: + qualities_xml = fh.read() + + qualities = self._parse_qualities_xml(qualities_xml) + + with qualities_pickle_file.open("wb") as fh_binary: + pickle.dump(qualities, fh_binary) + + return qualities + + def _parse_features_xml(self, features_xml_string: str) -> dict[int, OpenMLDataFeature]: + xml_dict = xmltodict.parse( + features_xml_string, + 
force_list=("oml:feature", "oml:nominal_value"), + strip_whitespace=False, + ) + features_xml = xml_dict["oml:data_features"] + + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(features_xml["oml:feature"]): + nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["oml:index"]), + xmlfeature["oml:name"], + xmlfeature["oml:data_type"], + xmlfeature.get("oml:nominal_value"), + int(nr_missing), + xmlfeature.get("oml:ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float] | None: + xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) + qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] + qualities_ = {} + for xmlquality in qualities: + name = xmlquality["oml:name"] + if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": + value = float("NaN") + else: + value = float(xmlquality["oml:value"]) + qualities_[name] = value + return qualities_ + + def _parse_list_xml(self, xml_string: str) -> pd.DataFrame: + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + class DatasetsV2(DatasetsAPI): def get( @@ -457,6 +595,8 @@ def list( self, limit: int, offset: int, + *, + dataset_id: list[int] | None = None, # type: ignore **kwargs: Any, ) -> pd.DataFrame: """ @@ -473,12 +613,12 @@ def list( The maximum number of datasets to show. offset : int The number of datasets to skip, starting from the first. - data_id : list, optional + dataset_id: list[int], optional kwargs : dict, optional Legal filter operators (keys in the dict): tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values, data_id. + number_features, number_classes, number_missing_values. 
Returns ------- @@ -490,7 +630,8 @@ def list( json["pagination"]["limit"] = limit if offset is not None: json["pagination"]["offset"] = offset - + if dataset_id is not None: + json["data_id"] = dataset_id if kwargs is not None: for operator, value in kwargs.items(): if value is not None: @@ -501,35 +642,14 @@ def list( # Minimalistic check if the JSON is useful assert isinstance(datasets_list, list), type(datasets_list) - datasets = {} - for dataset_ in datasets_list: - ignore_attribute = ["file_id", "quality"] - dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("quality", []): - try: - dataset[quality["name"]] = int(quality["value"]) - except ValueError: - dataset[quality["name"]] = float(quality["value"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) + return self._parse_list_json(datasets_list) def delete(self, dataset_id: int) -> bool: raise NotImplementedError() def edit( # noqa: PLR0913 self, - data_id: int, + dataset_id: int, description: str | None = None, creator: str | None = None, contributor: str | None = None, @@ -544,10 +664,10 @@ def edit( # noqa: PLR0913 ) -> int: raise NotImplementedError() - def fork(self, data_id: int) -> int: + def fork(self, dataset_id: int) -> int: raise NotImplementedError() - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: """ Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status @@ -556,7 +676,7 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) Parameters ---------- - data_id : int + dataset_id : int The data id of the dataset status : str, 'active' or 'deactivated' @@ -565,11 +685,11 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} + data: openml._api_calls.DATA_TYPE = {"dataset_id": dataset_id, "status": status} result = self._http.post("datasets/status/update", json=data).json() server_data_id = result["dataset_id"] server_status = result["status"] - if status != server_status or int(data_id) != int(server_data_id): + if status != server_status or int(dataset_id) != int(server_data_id): # This should never happen raise ValueError("Data id/status does not collide") @@ -643,8 +763,129 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: parquet_file=str(parquet_file) if parquet_file is not None else None, ) - def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: raise NotImplementedError() - def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: raise NotImplementedError() + + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: + path = f"dataset/features/{dataset_id}" + json = self._http.get(path, use_cache=True).json() + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(json["data_features"]["feature"]): + nr_missing = xmlfeature.get("number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["index"]), + xmlfeature["name"], + xmlfeature["data_type"], + xmlfeature.get("nominal_value"), + int(nr_missing), + xmlfeature.get("ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: + path = f"dataset/qualities/{dataset_id!s}" + try: + qualities_json = self._http.get(path, use_cache=True).json() + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + logger.warning(f"No qualities found for dataset {dataset_id}") + return None + + raise e + + return self._parse_features_json(qualities_json) + + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: + if features_file.suffix != ".json": + # can fallback to v1 if the file is .xml + raise NotImplementedError() + + with Path(features_file).open("r", encoding="utf8") as fh: + features_json = json.load(fh) + + features = self._parse_features_json(features_json) + + with features_pickle_file.open("wb") as fh_binary: + pickle.dump(features, fh_binary) + + return features + + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[str, float] | None: + if qualities_file.suffix != ".json": + # can fallback to v1 if the file is .xml + raise NotImplementedError() + + with Path(qualities_file).open("r", encoding="utf8") as fh: + qualities_json = json.load(fh) + + qualities = self._parse_qualities_json(qualities_json) + + with qualities_pickle_file.open("wb") as fh_binary: + pickle.dump(qualities, fh_binary) + + return qualities + + def _parse_features_json(self: dict) -> dict[int, OpenMLDataFeature]: + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(self["data_features"]["feature"]): + nr_missing = xmlfeature.get("number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["index"]), + 
xmlfeature["name"], + xmlfeature["data_type"], + xmlfeature.get("nominal_value"), + int(nr_missing), + xmlfeature.get("ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def _parse_qualities_json(self: dict) -> dict[str, float] | None: + qualities = self["data_qualities"]["quality"] + qualities_ = {} + for quality in qualities: + name = quality["name"] + if quality.get("value", None) is None or quality["value"] == "null": + value = float("NaN") + else: + value = float(quality["value"]) + qualities_[name] = value + return qualities_ + + def _parse_list_json(self, datasets_list: list) -> pd.DataFrame: # type: ignore + datasets = {} + for dataset_ in datasets_list: + ignore_attribute = ["file_id", "quality"] + dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("quality", []): + try: + dataset[quality["name"]] = int(quality["value"]) + except ValueError: + dataset[quality["name"]] = float(quality["value"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index fa83d2b8a..f8bbf9be5 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd import scipy.sparse -import xmltodict from openml.base import OpenMLBase from openml.config import OPENML_SKIP_PARQUET_ENV_VAR @@ -798,7 +797,7 @@ def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" # Delayed Import to avoid circular imports or having to import all of dataset.functions to # import OpenMLDataset. - from openml.datasets.functions import _get_dataset_features_file + from openml._api import api_context if self.dataset_id is None: raise ValueError( @@ -806,13 +805,12 @@ def _load_features(self) -> None: "metadata.", ) - features_file = _get_dataset_features_file(None, self.dataset_id) - self._features = _read_features(features_file) + self._features = api_context.backend.datasets.get_features(self.dataset_id) def _load_qualities(self) -> None: """Load qualities information from the server and store it in the dataset object.""" # same reason as above for _load_features - from openml.datasets.functions import _get_dataset_qualities_file + from openml._api import api_context if self.dataset_id is None: raise ValueError( @@ -820,12 +818,12 @@ def _load_qualities(self) -> None: "metadata.", ) - qualities_file = _get_dataset_qualities_file(None, self.dataset_id) + qualities = api_context.backend.datasets.get_qualities(self.dataset_id) - if qualities_file is None: + if qualities is None: self._no_qualities_found = True else: - self._qualities = _read_qualities(qualities_file) + self._qualities = qualities def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]: """Reads the datasets arff to determine the class-labels. 
@@ -992,39 +990,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]: return pickle.load(fh_binary) # type: ignore # noqa: S301 except: # noqa: E722 - with Path(features_file).open("r", encoding="utf8") as fh: - features_xml_string = fh.read() + from openml._api import api_context - features = _parse_features_xml(features_xml_string) - - with features_pickle_file.open("wb") as fh_binary: - pickle.dump(features, fh_binary) - - return features - - -def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]: - xml_dict = xmltodict.parse( - features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False - ) - features_xml = xml_dict["oml:data_features"] - - features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(features_xml["oml:feature"]): - nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["oml:index"]), - xmlfeature["oml:name"], - xmlfeature["oml:data_type"], - xmlfeature.get("oml:nominal_value"), - int(nr_missing), - xmlfeature.get("oml:ontology"), - ) - if idx != feature.index: - raise ValueError("Data features not provided in right order") - features[feature.index] = feature - - return features + return api_context.backend.datasets.parse_features_file(features_file, features_pickle_file) # TODO(eddiebergman): Should this really exist? @@ -1046,29 +1014,8 @@ def _read_qualities(qualities_file: str | Path) -> dict[str, float]: with qualities_pickle_file.open("rb") as fh_binary: return pickle.load(fh_binary) # type: ignore # noqa: S301 except: # noqa: E722 - with qualities_file.open(encoding="utf8") as fh: - qualities_xml = fh.read() + from openml._api import api_context - qualities = _parse_qualities_xml(qualities_xml) - with qualities_pickle_file.open("wb") as fh_binary: - pickle.dump(qualities, fh_binary) - - return qualities - - -def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]: - qualities_ = {} - for xmlquality in qualities: - name = xmlquality["oml:name"] - if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": - value = float("NaN") - else: - value = float(xmlquality["oml:value"]) - qualities_[name] = value - return qualities_ - - -def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]: - xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) - qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] - return _check_qualities(qualities) + return api_context.backend.datasets.parse_qualities_file( + qualities_file, qualities_pickle_file + ) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 6ede42ea9..0eb30b3db 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -88,7 +88,7 @@ def list_datasets( Parameters ---------- - data_id : list, optional + dataset_id : list, optional A list of data ids, to specify which datasets should be listed offset : int, optional @@ -717,7 +717,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - api_context.backend.datasets.status_update(data_id=data_id, status=status) + api_context.backend.datasets.status_update(dataset_id=data_id, status=status) def edit_dataset( @@ -840,7 +840,7 @@ def fork_dataset(data_id: int) -> int: """ from openml._api import api_context - return api_context.backend.datasets.fork(data_id=data_id) + return api_context.backend.datasets.fork(dataset_id=data_id) def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: From 4c75e16890a76d8fbc0ddc125a267d23ddaded44 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 15 Jan 2026 14:51:22 +0500 Subject: [PATCH 12/15] undo changes in tasks/functions.py --- openml/tasks/functions.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index a794ad56d..e9b879ae4 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -12,7 +12,6 @@ import openml._api_calls import openml.utils -from openml._api import api_context from openml.datasets import get_dataset from openml.exceptions import OpenMLCacheException @@ -445,16 +444,11 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - result = api_context.backend.tasks.get(task_id, return_response=True) + task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") - if isinstance(result, tuple): - task, response = result - with xml_file.open("w", encoding="utf8") as fh: - fh.write(response.text) - else: - task = result - - return task + with xml_file.open("w", encoding="utf8") as fh: + fh.write(task_xml) + return _create_task_from_xml(task_xml) def _create_task_from_xml(xml: str) -> OpenMLTask: From 3e7c415f01a9f19feabf007bac709f9fc4fe3886 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Sun, 18 Jan 2026 22:03:42 +0530 Subject: [PATCH 13/15] Download methods --- openml/_api/http/client.py | 151 ++++++++++++++++++++++++++++++ openml/_api/resources/base.py | 14 ++- openml/_api/resources/datasets.py | 107 ++++++++++++--------- 3 files changed, 226 insertions(+), 46 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index a90e93933..bab7b20a8 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,11 +1,19 @@ from __future__ import annotations +import contextlib +import shutil +import urllib +import urllib.parse +import zipfile +from collections.abc import Callable from pathlib import Path from typing import TYPE_CHECKING, Any from urllib.parse import urlencode, urljoin, urlparse +import minio import requests from requests import Response +from urllib3 import ProxyManager from openml.__version__ import __version__ from openml._api.config import settings @@ -13,6 +21,9 @@ if TYPE_CHECKING: from openml._api.config import APIConfig +import openml.config +from openml.utils import ProgressBar + class CacheMixin: @property @@ -149,3 +160,143 @@ def delete( use_api_key=True, **request_kwargs, ) + + def download( + self, + url: str, + handler: Callable[[Response, Path, str], Path], + encoding: str = "utf-8", + ) -> Path: + response = self.get(url) + dir_path = self._get_cache_dir(url, {}) + dir_path = dir_path.expanduser() + if handler is not None: + return handler(response, dir_path, encoding) + + return self._text_handler(response, dir_path, encoding, url) + + def _text_handler(self, response: Response, path: Path, 
encoding: str) -> Path: + if path.is_dir(): + path = path / "response.txt" + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding=encoding) as f: + f.write(response.text) + return path + + +class MinIOClient(CacheMixin): + def __init__(self) -> None: + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + + def download_minio_file( + self, + source: str, + destination: str | Path | None = None, + exists_ok: bool = True, # noqa: FBT002 + proxy: str | None = "auto", + ) -> str: + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. + + Parameters + ---------- + source : str + URL to a file in a MinIO bucket. + destination : str | Path + Path to store the file to, if a directory is provided the original filename is used. + exists_ok : bool, optional (default=True) + If False, raise FileExists if a file already exists in ``destination``. + proxy: str, optional (default = "auto") + The proxy server to use. By default it's "auto" which uses ``requests`` to + automatically find the proxy to use. Pass None or the environment variable + ``no_proxy="*"`` to disable proxies. + """ + destination = self._get_cache_dir(source, {}) if destination is None else Path(destination) + parsed_url = urllib.parse.urlparse(source) + + # expect path format: /BUCKET/path/to/file.ext + bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) + if destination.is_dir(): + destination = Path(destination, object_name) + if destination.is_file() and not exists_ok: + raise FileExistsError(f"File already exists in {destination}.") + + destination = destination.expanduser() + destination.parent.mkdir(parents=True, exist_ok=True) + + if proxy == "auto": + resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl()) + proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies) # type: ignore + + proxy_client = ProxyManager(proxy) if proxy else None + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client) + try: + client.fget_object( + bucket_name=bucket, + object_name=object_name, + file_path=str(destination), + progress=ProgressBar() if openml.config.show_progress else None, + request_headers=self.headers, + ) + if destination.is_file() and destination.suffix == ".zip": + with zipfile.ZipFile(destination, "r") as zip_ref: + zip_ref.extractall(destination.parent) + + except minio.error.S3Error as e: + if e.message is not None and e.message.startswith("Object does not exist"): + raise FileNotFoundError(f"Object at '{source}' does not exist.") from e + # e.g. permission error, or a bucket does not exist (which is also interpreted as a + # permission error on minio level). + raise FileNotFoundError("Bucket does not exist or is private.") from e + + return str(destination) + + def download_minio_bucket(self, source: str, destination: str | Path) -> None: + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. + + Does not redownload files which already exist. + + Parameters + ---------- + source : str + URL to a MinIO bucket. + destination : str | Path + Path to a directory to store the bucket content in. 
+ """ + destination = self._get_cache_dir(source, {}) if destination is None else Path(destination) + parsed_url = urllib.parse.urlparse(source) + + # expect path format: /BUCKET/path/to/file.ext + _, bucket, *prefixes, _file = parsed_url.path.split("/") + prefix = "/".join(prefixes) + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False) + + for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): + if file_object.object_name is None: + raise ValueError(f"Object name is None for object {file_object!r}") + if file_object.etag is None: + raise ValueError(f"Object etag is None for object {file_object!r}") + + marker = destination / file_object.etag + if marker.exists(): + continue + + file_destination = destination / file_object.object_name.rsplit("/", 1)[1] + if (file_destination.parent / file_destination.stem).exists(): + # Marker is missing but archive exists means the server archive changed + # force a refresh + shutil.rmtree(file_destination.parent / file_destination.stem) + + with contextlib.suppress(FileExistsError): + self.download_minio_file( + source=source.rsplit("/", 1)[0] + + "/" + + file_object.object_name.rsplit("/", 1)[1], + destination=file_destination, + exists_ok=False, + ) + + if file_destination.is_file() and file_destination.suffix == ".zip": + file_destination.unlink() + marker.touch() diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 3030ce6ff..79f7ddfe8 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -2,8 +2,7 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import TYPE_CHECKING, Any -from typing_extensions import Literal +from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: import pandas as pd @@ -87,7 +86,16 @@ def parse_features_file( @abstractmethod def parse_qualities_file( self, qualities_file: Path, qualities_pickle_file: Path - ) -> dict[str, float] | None: ... + ) -> dict[str, float]: ... + + @abstractmethod + def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path: ... + + @abstractmethod + def download_features_file(self, dataset_id: int) -> Path: ... + + @abstractmethod + def download_qualities_file(self, dataset_id: int) -> Path: ... 
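These download hooks stay abstract on purpose: each versioned resource is expected to map them onto its own endpoint and serialization (XML under "data/..." for v1, JSON under "datasets/..." for v2). A rough caller-side sketch, with a made-up dataset id; the cached-file layout comes from the HTTP client's CacheMixin, not from this interface:

from openml._api import api_context

# Resolves to the version-specific features/qualities endpoint of the active
# backend and returns the path of the file written under the client's cache
# directory (CacheMixin._get_cache_dir).
features_path = api_context.backend.datasets.download_features_file(61)
qualities_path = api_context.backend.datasets.download_qualities_file(61)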
class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 58883f626..91f9fed30 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,12 +1,12 @@ from __future__ import annotations +import builtins import json import logging import pickle from collections import OrderedDict from pathlib import Path -from typing import TYPE_CHECKING, Any -from typing_extensions import Literal +from typing import TYPE_CHECKING, Any, Literal from openml._api.resources.base import DatasetsAPI from openml.datasets.data_feature import OpenMLDataFeature @@ -48,7 +48,7 @@ def list( limit: int, offset: int, *, - data_id: list[int] | None = None, # type: ignore + data_id: builtins.list[int] | None = None, **kwargs: Any, ) -> pd.DataFrame: """ @@ -153,7 +153,7 @@ def edit( # noqa: PLR0913 collection_date: str | None = None, language: str | None = None, default_target_attribute: str | None = None, - ignore_attribute: str | list[str] | None = None, # type: ignore + ignore_attribute: str | builtins.list[str] | None = None, citation: str | None = None, row_id_attribute: str | None = None, original_data_url: str | None = None, @@ -314,7 +314,7 @@ def status_update(self, dataset_id: int, status: Literal["active", "deactivated" # This should never happen raise ValueError("Data id/status does not collide") - def list_qualities(self) -> list[str]: # type: ignore + def list_qualities(self) -> builtins.list[str]: """Return list of data qualities available. The function performs an API call to retrieve the entire list of @@ -455,7 +455,7 @@ def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: def get_qualities(self, dataset_id: int) -> dict[str, float] | None: path = f"data/qualities/{dataset_id!s}" try: - self._http.get(path, use_cache=True).text + xml = self._http.get(path, use_cache=True).text except OpenMLServerException as e: if e.code == 362 and str(e) == "No qualities found - None": # quality file stays as None @@ -464,7 +464,7 @@ def get_qualities(self, dataset_id: int) -> dict[str, float] | None: raise e - return self._parse_qualities_xml() + return self._parse_qualities_xml(xml) def parse_features_file( self, features_file: Path, features_pickle_file: Path @@ -485,7 +485,7 @@ def parse_features_file( def parse_qualities_file( self, qualities_file: Path, qualities_pickle_file: Path - ) -> dict[int, OpenMLDataFeature]: + ) -> dict[str, float]: if qualities_file.suffix != ".xml": # TODO (Shrivaths) can only parse xml warn/ raise exception raise NotImplementedError() @@ -525,7 +525,7 @@ def _parse_features_xml(self, features_xml_string: str) -> dict[int, OpenMLDataF return features - def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float] | None: + def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float]: xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] qualities_ = {} @@ -573,6 +573,24 @@ def _parse_list_xml(self, xml_string: str) -> pd.DataFrame: } ) + def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path: + def __handler(response: Response, path: Path, encoding: str) -> Path: + file_path = path / "response.xml" + file_path.parent.mkdir(parents=True, exist_ok=True) + with file_path.open("w", encoding=encoding) as f: + f.write(response.text) + return file_path + + return self._http.download(url_ext, __handler, encoding) + + def download_features_file(self, dataset_id: 
int) -> Path: + path = f"data/features/{dataset_id}" + return self.download_file(path) + + def download_qualities_file(self, dataset_id: int) -> Path: + path = f"data/qualities/{dataset_id}" + return self.download_file(path) + class DatasetsV2(DatasetsAPI): def get( @@ -596,7 +614,7 @@ def list( limit: int, offset: int, *, - dataset_id: list[int] | None = None, # type: ignore + dataset_id: builtins.list[int] | None = None, **kwargs: Any, ) -> pd.DataFrame: """ @@ -656,7 +674,7 @@ def edit( # noqa: PLR0913 collection_date: str | None = None, language: str | None = None, default_target_attribute: str | None = None, - ignore_attribute: str | list[str] | None = None, # type: ignore + ignore_attribute: str | builtins.list[str] | None = None, citation: str | None = None, row_id_attribute: str | None = None, original_data_url: str | None = None, @@ -693,7 +711,7 @@ def status_update(self, dataset_id: int, status: Literal["active", "deactivated" # This should never happen raise ValueError("Data id/status does not collide") - def list_qualities(self) -> list[str]: # type: ignore + def list_qualities(self) -> builtins.list[str]: """Return list of data qualities available. The function performs an API call to retrieve the entire list of @@ -770,27 +788,13 @@ def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> raise NotImplementedError() def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: - path = f"dataset/features/{dataset_id}" + path = f"datasets/features/{dataset_id}" json = self._http.get(path, use_cache=True).json() - features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(json["data_features"]["feature"]): - nr_missing = xmlfeature.get("number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["index"]), - xmlfeature["name"], - xmlfeature["data_type"], - xmlfeature.get("nominal_value"), - int(nr_missing), - xmlfeature.get("ontology"), - ) - if idx != feature.index: - raise ValueError("Data features not provided in right order") - features[feature.index] = feature - return features + return self._parse_features_json(json) def get_qualities(self, dataset_id: int) -> dict[str, float] | None: - path = f"dataset/qualities/{dataset_id!s}" + path = f"datasets/qualities/{dataset_id!s}" try: qualities_json = self._http.get(path, use_cache=True).json() except OpenMLServerException as e: @@ -800,7 +804,7 @@ def get_qualities(self, dataset_id: int) -> dict[str, float] | None: raise e - return self._parse_features_json(qualities_json) + return self._parse_qualities_json(qualities_json) def parse_features_file( self, features_file: Path, features_pickle_file: Path @@ -821,7 +825,7 @@ def parse_features_file( def parse_qualities_file( self, qualities_file: Path, qualities_pickle_file: Path - ) -> dict[str, float] | None: + ) -> dict[str, float]: if qualities_file.suffix != ".json": # can fallback to v1 if the file is .xml raise NotImplementedError() @@ -836,17 +840,17 @@ def parse_qualities_file( return qualities - def _parse_features_json(self: dict) -> dict[int, OpenMLDataFeature]: + def _parse_features_json(self, features_json: dict) -> dict[int, OpenMLDataFeature]: features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(self["data_features"]["feature"]): - nr_missing = xmlfeature.get("number_of_missing_values", 0) + for idx, jsonfeatures in enumerate(features_json): + nr_missing = jsonfeatures.get("number_of_missing_values", 0) feature = OpenMLDataFeature( - int(xmlfeature["index"]), - 
xmlfeature["name"], - xmlfeature["data_type"], - xmlfeature.get("nominal_value"), + int(jsonfeatures["index"]), + jsonfeatures["name"], + jsonfeatures["data_type"], + jsonfeatures.get("nominal_values"), int(nr_missing), - xmlfeature.get("ontology"), + jsonfeatures.get("ontology"), ) if idx != feature.index: raise ValueError("Data features not provided in right order") @@ -854,10 +858,9 @@ def _parse_features_json(self: dict) -> dict[int, OpenMLDataFeature]: return features - def _parse_qualities_json(self: dict) -> dict[str, float] | None: - qualities = self["data_qualities"]["quality"] + def _parse_qualities_json(self, qualities_json: dict) -> dict[str, float]: qualities_ = {} - for quality in qualities: + for quality in qualities_json: name = quality["name"] if quality.get("value", None) is None or quality["value"] == "null": value = float("NaN") @@ -866,7 +869,7 @@ def _parse_qualities_json(self: dict) -> dict[str, float] | None: qualities_[name] = value return qualities_ - def _parse_list_json(self, datasets_list: list) -> pd.DataFrame: # type: ignore + def _parse_list_json(self, datasets_list: builtins.list) -> pd.DataFrame: datasets = {} for dataset_ in datasets_list: ignore_attribute = ["file_id", "quality"] @@ -889,3 +892,21 @@ def _parse_list_json(self, datasets_list: list) -> pd.DataFrame: # type: ignore "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), } ) + + def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path: + def __handler(response: Response, path: Path, encoding: str) -> Path: + file_path = path / "response.json" + file_path.parent.mkdir(parents=True, exist_ok=True) + with file_path.open("w", encoding=encoding) as f: + json.dump(response.json(), f, indent=4) + return file_path + + return self._http.download(url_ext, __handler, encoding) + + def download_features_file(self, dataset_id: int) -> Path: + path = f"datasets/features/{dataset_id}" + return self.download_file(path) + + def download_qualities_file(self, dataset_id: int) -> Path: + path = f"datasets/qualities/{dataset_id}" + return self.download_file(path) From 8933cd873b79ec5676a30ca9b7ab418710347591 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Mon, 19 Jan 2026 13:47:55 +0530 Subject: [PATCH 14/15] Refactored functions --- openml/_api/http/__init__.py | 4 +- openml/_api/http/client.py | 10 +- openml/_api/resources/base.py | 32 +++- openml/_api/resources/datasets.py | 262 +++++++++++++++++++++++++---- openml/_api/runtime/core.py | 8 +- openml/datasets/functions.py | 264 +++--------------------------- 6 files changed, 288 insertions(+), 292 deletions(-) diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py index 8e6d1e4ce..c92ce14c3 100644 --- a/openml/_api/http/__init__.py +++ b/openml/_api/http/__init__.py @@ -1,3 +1,3 @@ -from openml._api.http.client import HTTPClient +from openml._api.http.client import HTTPClient, MinIOClient -__all__ = ["HTTPClient"] +__all__ = ["HTTPClient", "MinIOClient"] diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index bab7b20a8..ea812e0bb 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -164,7 +164,7 @@ def delete( def download( self, url: str, - handler: Callable[[Response, Path, str], Path], + handler: Callable[[Response, Path, str], Path] | None = None, encoding: str = "utf-8", ) -> Path: response = self.get(url) @@ -173,7 +173,7 @@ def download( if handler is not None: return handler(response, dir_path, encoding) - return self._text_handler(response, dir_path, encoding, 
url) + return self._text_handler(response, dir_path, encoding) def _text_handler(self, response: Response, path: Path, encoding: str) -> Path: if path.is_dir(): @@ -194,7 +194,7 @@ def download_minio_file( destination: str | Path | None = None, exists_ok: bool = True, # noqa: FBT002 proxy: str | None = "auto", - ) -> str: + ) -> Path: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters @@ -249,9 +249,9 @@ def download_minio_file( # permission error on minio level). raise FileNotFoundError("Bucket does not exist or is private.") from e - return str(destination) + return destination - def download_minio_bucket(self, source: str, destination: str | Path) -> None: + def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Does not redownload files which already exist. diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 79f7ddfe8..5ad143db8 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -8,7 +8,7 @@ import pandas as pd from requests import Response - from openml._api.http import HTTPClient + from openml._api.http import HTTPClient, MinIOClient from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset from openml.tasks.task import OpenMLTask @@ -19,13 +19,20 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): + def __init__(self, http: HTTPClient, minio: MinIOClient): + self._minio = minio + super().__init__(http) + @abstractmethod - def get( + def get( # noqa: PLR0913 self, - dataset_id: int | str, - *, - return_response: bool = False, - ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: ... @abstractmethod def list( @@ -97,6 +104,19 @@ def download_features_file(self, dataset_id: int) -> Path: ... @abstractmethod def download_qualities_file(self, dataset_id: int) -> Path: ... + @abstractmethod + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: ... + + @abstractmethod + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: ... 
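The widened get() signature mirrors the flags that openml.datasets.functions.get_dataset previously handled itself, so the caller-side path collapses into a single backend call. A minimal sketch, assuming an already-built api_context and a purely illustrative dataset id:

from openml._api import api_context

dataset = api_context.backend.datasets.get(
    61,                              # hypothetical dataset id
    download_data=True,              # parquet via MinIOClient, ARFF as the fallback
    cache_format="pickle",
    download_qualities=True,
    download_features_meta_data=True,
    download_all_files=False,
)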
+ class TasksAPI(ResourceAPI, ABC): @abstractmethod diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 91f9fed30..1387bfd54 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,17 +1,29 @@ +# ruff: noqa: PLR0913 from __future__ import annotations import builtins import json import logging +import os import pickle from collections import OrderedDict from pathlib import Path from typing import TYPE_CHECKING, Any, Literal +import minio +import urllib3 + from openml._api.resources.base import DatasetsAPI +from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.datasets.data_feature import OpenMLDataFeature from openml.datasets.dataset import OpenMLDataset -from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerError, OpenMLServerException +from openml.exceptions import ( + OpenMLHashException, + OpenMLNotAuthorizedError, + OpenMLPrivateDatasetError, + OpenMLServerError, + OpenMLServerException, +) if TYPE_CHECKING: from requests import Response @@ -30,18 +42,55 @@ class DatasetsV1(DatasetsAPI): def get( self, - dataset_id: int | str, - *, - return_response: bool = False, - ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: path = f"data/{dataset_id}" response = self._http.get(path) xml_content = response.text - dataset = self._create_dataset_from_xml(xml_content) - if return_response: - return dataset, response + description = xmltodict.parse(xml_content)["oml:data_set_description"] + + try: + features_file = None + qualities_file = None + + if download_features_meta_data: + features_file = self.download_features_file(dataset_id) + if download_qualities: + qualities_file = self.download_qualities_file(dataset_id) + + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "oml:parquet_url" in description and not skip_parquet + if download_parquet and (download_data or download_all_files): + try: + parquet_file = self.download_dataset_parquet( + description, + download_all_files=download_all_files, + ) + except urllib3.exceptions.MaxRetryError: + parquet_file = None + + arff_file = None + if parquet_file is None and download_data: + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") + arff_file = self.download_dataset_arff(description) + except OpenMLServerException as e: + # if there was an exception + # check if the user had access to the dataset + if e.code == NO_ACCESS_GRANTED_ERRCODE: + raise OpenMLPrivateDatasetError(e.message) from None + + raise e - return dataset + return self._create_dataset_from_xml( + description, features_file, qualities_file, arff_file, parquet_file, cache_format + ) def list( self, @@ -144,7 +193,7 @@ def delete(self, dataset_id: int) -> bool: ) from e raise e - def edit( # noqa: PLR0913 + def edit( self, dataset_id: int, description: str | None = None, @@ -336,7 +385,15 @@ def list_qualities(self) -> builtins.list[str]: return qualities["oml:data_qualities_list"]["oml:quality"] - def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: + def _create_dataset_from_xml( + self, + description: dict, + features_file: Path | None = None, + 
qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", + ) -> OpenMLDataset: """Create a dataset given a xml string. Parameters @@ -348,14 +405,6 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: ------- OpenMLDataset """ - description = xmltodict.parse(xml)["oml:data_set_description"] - - # TODO file path after download, cache_format default = 'pickle' - arff_file = None - features_file = None - parquet_file = None - qualities_file = None - return OpenMLDataset( description["oml:name"], description.get("oml:description"), @@ -375,6 +424,7 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: version_label=description.get("oml:version_label"), citation=description.get("oml:citation"), tag=description.get("oml:tag"), + cache_format=cache_format, visibility=description.get("oml:visibility"), original_data_url=description.get("oml:original_data_url"), paper_url=description.get("oml:paper_url"), @@ -591,23 +641,110 @@ def download_qualities_file(self, dataset_id: int) -> Path: path = f"data/qualities/{dataset_id}" return self.download_file(path) + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: + if isinstance(description, dict): + url = str(description.get("oml:parquet_url")) + elif isinstance(description, OpenMLDataset): + url = str(description._parquet_url) + assert description.dataset_id is not None + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if download_all_files: + self._minio.download_minio_bucket(source=url) + + try: + output_file_path = self._minio.download_minio_file( + source=url, + ) + except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: + logger.warning(f"Could not download file from {url}: {e}") + return None + return output_file_path + + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: + if isinstance(description, dict): + # TODO md5_checksum_fixture = description.get("oml:md5_checksum") + url = str(description["oml:url"]) + did = int(description.get("oml:id")) # type: ignore + elif isinstance(description, OpenMLDataset): + # TODO md5_checksum_fixture = description.md5_checksum + assert description.url is not None + assert description.dataset_id is not None + + url = description.url + did = int(description.dataset_id) + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + try: + output_file_path = self._http.download(url) + except OpenMLHashException as e: + additional_info = f" Raised when downloading dataset {did}." 
+ e.args = (e.args[0] + additional_info,) + raise e + + return output_file_path + class DatasetsV2(DatasetsAPI): def get( self, - dataset_id: int | str, - *, - return_response: bool = False, - ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - path = f"data/{dataset_id}" + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: + path = f"datasets/{dataset_id}" response = self._http.get(path) json_content = response.json() - dataset = self._create_dataset_from_json(json_content) - if return_response: - return dataset, response + try: + features_file = None + qualities_file = None + + if download_features_meta_data: + features_file = self.download_features_file(dataset_id) + if download_qualities: + qualities_file = self.download_qualities_file(dataset_id) + + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "parquet_url" in json_content and not skip_parquet + if download_parquet and (download_data or download_all_files): + try: + parquet_file = self.download_dataset_parquet( + json_content, + download_all_files=download_all_files, + ) + except urllib3.exceptions.MaxRetryError: + parquet_file = None + + arff_file = None + if parquet_file is None and download_data: + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") + arff_file = self.download_dataset_arff(json_content) + except OpenMLServerException as e: + # if there was an exception + # check if the user had access to the dataset + if e.code == NO_ACCESS_GRANTED_ERRCODE: + raise OpenMLPrivateDatasetError(e.message) from None + + raise e - return dataset + return self._create_dataset_from_json( + json_content, features_file, qualities_file, arff_file, parquet_file, cache_format + ) def list( self, @@ -665,7 +802,7 @@ def list( def delete(self, dataset_id: int) -> bool: raise NotImplementedError() - def edit( # noqa: PLR0913 + def edit( self, dataset_id: int, description: str | None = None, @@ -732,7 +869,15 @@ def list_qualities(self) -> builtins.list[str]: return qualities["data_qualities_list"]["quality"] - def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: + def _create_dataset_from_json( + self, + json_content: dict, + features_file: Path | None = None, + qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", + ) -> OpenMLDataset: """Create a dataset given a json. 
Parameters @@ -744,12 +889,6 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: ------- OpenMLDataset """ - # TODO file path after download, cache_format default = 'pickle' - arff_file = None - features_file = None - parquet_file = None - qualities_file = None - return OpenMLDataset( json_content["name"], json_content.get("description"), @@ -769,6 +908,7 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: version_label=json_content.get("version_label"), citation=json_content.get("citation"), tag=json_content.get("tag"), + cache_format=cache_format, visibility=json_content.get("visibility"), original_data_url=json_content.get("original_data_url"), paper_url=json_content.get("paper_url"), @@ -910,3 +1050,53 @@ def download_features_file(self, dataset_id: int) -> Path: def download_qualities_file(self, dataset_id: int) -> Path: path = f"datasets/qualities/{dataset_id}" return self.download_file(path) + + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: + if isinstance(description, dict): + url = str(description.get("parquet_url")) + elif isinstance(description, OpenMLDataset): + url = str(description._parquet_url) + assert description.dataset_id is not None + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if download_all_files: + self._minio.download_minio_bucket(source=url) + + try: + output_file_path = self._minio.download_minio_file(source=url) + except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: + logger.warning(f"Could not download file from {url}: {e}") + return None + return output_file_path + + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: + if isinstance(description, dict): + url = str(description["url"]) + did = int(description.get("oml:id")) # type: ignore + elif isinstance(description, OpenMLDataset): + assert description.url is not None + assert description.dataset_id is not None + + url = description.url + did = int(description.dataset_id) + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + try: + output_file_path = self._http.download( + url, + ) + except OpenMLHashException as e: + additional_info = f" Raised when downloading dataset {did}." 
+ e.args = (e.args[0] + additional_info,) + raise e + + return output_file_path diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py index 98b587411..47ebb2b5b 100644 --- a/openml/_api/runtime/core.py +++ b/openml/_api/runtime/core.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from openml._api.config import settings -from openml._api.http.client import HTTPClient +from openml._api.http.client import HTTPClient, MinIOClient from openml._api.resources import ( DatasetsV1, DatasetsV2, @@ -24,9 +24,9 @@ def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI): def build_backend(version: str, *, strict: bool) -> APIBackend: v1_http = HTTPClient(config=settings.api.v1) v2_http = HTTPClient(config=settings.api.v2) - + minio = MinIOClient() v1 = APIBackend( - datasets=DatasetsV1(v1_http), + datasets=DatasetsV1(v1_http, minio), tasks=TasksV1(v1_http), ) @@ -34,7 +34,7 @@ def build_backend(version: str, *, strict: bool) -> APIBackend: return v1 v2 = APIBackend( - datasets=DatasetsV2(v2_http), + datasets=DatasetsV2(v2_http, minio), tasks=TasksV2(v2_http), ) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3786e54d6..8b3fbd732 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import os import warnings from functools import partial from pathlib import Path @@ -11,21 +10,15 @@ from typing import TYPE_CHECKING, Any, Literal import arff -import minio.error import numpy as np import pandas as pd -import urllib3 import xmltodict from scipy.sparse import coo_matrix import openml._api_calls import openml.utils -from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( - OpenMLHashException, - OpenMLPrivateDatasetError, OpenMLServerError, - OpenMLServerException, ) from openml.utils import ( _create_cache_directory_for_id, @@ -281,7 +274,7 @@ def get_datasets( @openml.utils.thread_safe_if_oslo_installed -def get_dataset( # noqa: C901, PLR0912 +def get_dataset( dataset_id: int | str, download_data: bool = False, # noqa: FBT002 version: int | None = None, @@ -382,59 +375,15 @@ def get_dataset( # noqa: C901, PLR0912 if did_cache_dir.exists(): _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) - - remove_dataset_cache = True - try: - description = _get_dataset_description(did_cache_dir, dataset_id) - features_file = None - qualities_file = None - - if download_features_meta_data: - features_file = _get_dataset_features_file(did_cache_dir, dataset_id) - if download_qualities: - qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) - - parquet_file = None - skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" - download_parquet = "oml:parquet_url" in description and not skip_parquet - if download_parquet and (download_data or download_all_files): - try: - parquet_file = _get_dataset_parquet( - description, - download_all_files=download_all_files, - ) - except urllib3.exceptions.MaxRetryError: - parquet_file = None - - arff_file = None - if parquet_file is None and download_data: - if download_parquet: - logger.warning("Failed to download parquet, fallback on ARFF.") - arff_file = _get_dataset_arff(description) - - remove_dataset_cache = False - except OpenMLServerException as e: - # if there was an exception - # check if the user had access to the dataset - if e.code == 
NO_ACCESS_GRANTED_ERRCODE: - raise OpenMLPrivateDatasetError(e.message) from None - - raise e - finally: - if remove_dataset_cache: - _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) + from openml._api import api_context - return _create_dataset_from_description( - description, - features_file, - qualities_file, - arff_file, - parquet_file, + return api_context.backend.datasets.get( + dataset_id, + download_data, cache_format, + download_qualities, + download_features_meta_data, + download_all_files, ) @@ -988,7 +937,7 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, def _get_dataset_parquet( description: dict | OpenMLDataset, - cache_directory: Path | None = None, + cache_directory: Path | None = None, # noqa: ARG001 download_all_files: bool = False, # noqa: FBT002 ) -> Path | None: """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. @@ -1018,47 +967,14 @@ def _get_dataset_parquet( output_filename : Path, optional Location of the Parquet file if successfully downloaded, None otherwise. """ - if isinstance(description, dict): - url = str(description.get("oml:parquet_url")) - did = int(description.get("oml:id")) # type: ignore - elif isinstance(description, OpenMLDataset): - url = str(description._parquet_url) - assert description.dataset_id is not None - - did = int(description.dataset_id) - else: - raise TypeError("`description` should be either OpenMLDataset or Dict.") - - if cache_directory is None: - cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - - output_file_path = cache_directory / f"dataset_{did}.pq" - - old_file_path = cache_directory / "dataset.pq" - if old_file_path.is_file(): - old_file_path.rename(output_file_path) - - # The call below skips files already on disk, so avoids downloading the parquet file twice. - # To force the old behavior of always downloading everything, use `force_refresh_cache` - # of `get_dataset` - if download_all_files: - openml._api_calls._download_minio_bucket(source=url, destination=cache_directory) + from openml._api import api_context - if not output_file_path.is_file(): - try: - openml._api_calls._download_minio_file( - source=url, - destination=output_file_path, - ) - except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: - logger.warning(f"Could not download file from {url}: {e}") - return None - return output_file_path + return api_context.backend.datasets.download_dataset_parquet(description, download_all_files) def _get_dataset_arff( description: dict | OpenMLDataset, - cache_directory: Path | None = None, + cache_directory: Path | None = None, # noqa: ARG001 ) -> Path: """Return the path to the local arff file of the dataset. If is not cached, it is downloaded. @@ -1082,47 +998,15 @@ def _get_dataset_arff( output_filename : Path Location of ARFF file. 
""" - if isinstance(description, dict): - md5_checksum_fixture = description.get("oml:md5_checksum") - url = str(description["oml:url"]) - did = int(description.get("oml:id")) # type: ignore - elif isinstance(description, OpenMLDataset): - md5_checksum_fixture = description.md5_checksum - assert description.url is not None - assert description.dataset_id is not None - - url = description.url - did = int(description.dataset_id) - else: - raise TypeError("`description` should be either OpenMLDataset or Dict.") - - save_cache_directory = ( - _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - if cache_directory is None - else Path(cache_directory) - ) - output_file_path = save_cache_directory / "dataset.arff" - - try: - openml._api_calls._download_text_file( - source=url, - output_path=output_file_path, - md5_checksum=md5_checksum_fixture, - ) - except OpenMLHashException as e: - additional_info = f" Raised when downloading dataset {did}." - e.args = (e.args[0] + additional_info,) - raise e - - return output_file_path - + from openml._api import api_context -def _get_features_xml(dataset_id: int) -> str: - url_extension = f"data/features/{dataset_id}" - return openml._api_calls._perform_api_call(url_extension, "get") + return api_context.backend.datasets.download_dataset_arff(description) -def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path: +def _get_dataset_features_file( + did_cache_dir: str | Path | None, # noqa: ARG001 + dataset_id: int, +) -> Path: """API call to load dataset features. Loads from cache or downloads them. Features are feature descriptions for each column. @@ -1143,28 +1027,14 @@ def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int Path Path of the cached dataset feature file """ - did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None - if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - - features_file = did_cache_dir / "features.xml" - - # Dataset features aren't subject to change... - if not features_file.is_file(): - features_xml = _get_features_xml(dataset_id) - with features_file.open("w", encoding="utf8") as fh: - fh.write(features_xml) - - return features_file - + # cache directory not used here anymore + from openml._api import api_context -def _get_qualities_xml(dataset_id: int) -> str: - url_extension = f"data/qualities/{dataset_id!s}" - return openml._api_calls._perform_api_call(url_extension, "get") + return api_context.backend.datasets.download_features_file(dataset_id) def _get_dataset_qualities_file( - did_cache_dir: str | Path | None, + did_cache_dir: str | Path | None, # noqa: ARG001 dataset_id: int, ) -> Path | None: """Get the path for the dataset qualities file, or None if no qualities exist. 
@@ -1187,94 +1057,10 @@ def _get_dataset_qualities_file( str Path of the cached qualities file """ - save_did_cache_dir = ( - _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if did_cache_dir is None - else Path(did_cache_dir) - ) - - # Dataset qualities are subject to change and must be fetched every time - qualities_file = save_did_cache_dir / "qualities.xml" - try: - with qualities_file.open(encoding="utf8") as fh: - qualities_xml = fh.read() - except OSError: - try: - qualities_xml = _get_qualities_xml(dataset_id) - with qualities_file.open("w", encoding="utf8") as fh: - fh.write(qualities_xml) - except OpenMLServerException as e: - if e.code == 362 and str(e) == "No qualities found - None": - # quality file stays as None - logger.warning(f"No qualities found for dataset {dataset_id}") - return None - - raise e - - return qualities_file - - -def _create_dataset_from_description( - description: dict[str, str], - features_file: Path | None = None, - qualities_file: Path | None = None, - arff_file: Path | None = None, - parquet_file: Path | None = None, - cache_format: Literal["pickle", "feather"] = "pickle", -) -> OpenMLDataset: - """Create a dataset object from a description dict. - - Parameters - ---------- - description : dict - Description of a dataset in xml dict. - features_file : str - Path of the dataset features as xml file. - qualities_file : list - Path of the dataset qualities as xml file. - arff_file : string, optional - Path of dataset ARFF file. - parquet_file : string, optional - Path of dataset Parquet file. - cache_format: string, optional - Caching option for datasets (feather/pickle) + # cache directory not used here anymore + from openml._api import api_context - Returns - ------- - dataset : dataset object - Dataset object from dict and ARFF. 
- """ - return OpenMLDataset( - description["oml:name"], - description.get("oml:description"), - data_format=description["oml:format"], # type: ignore - dataset_id=int(description["oml:id"]), - version=int(description["oml:version"]), - creator=description.get("oml:creator"), - contributor=description.get("oml:contributor"), - collection_date=description.get("oml:collection_date"), - upload_date=description.get("oml:upload_date"), - language=description.get("oml:language"), - licence=description.get("oml:licence"), - url=description["oml:url"], - default_target_attribute=description.get("oml:default_target_attribute"), - row_id_attribute=description.get("oml:row_id_attribute"), - ignore_attribute=description.get("oml:ignore_attribute"), - version_label=description.get("oml:version_label"), - citation=description.get("oml:citation"), - tag=description.get("oml:tag"), - visibility=description.get("oml:visibility"), - original_data_url=description.get("oml:original_data_url"), - paper_url=description.get("oml:paper_url"), - update_comment=description.get("oml:update_comment"), - md5_checksum=description.get("oml:md5_checksum"), - data_file=str(arff_file) if arff_file is not None else None, - cache_format=cache_format, - features_file=str(features_file) if features_file is not None else None, - qualities_file=str(qualities_file) if qualities_file is not None else None, - parquet_url=description.get("oml:parquet_url"), - parquet_file=str(parquet_file) if parquet_file is not None else None, - ) + return api_context.backend.datasets.download_features_file(dataset_id) def _get_online_dataset_arff(dataset_id: int) -> str | None: From dfa0ab79063673a65d2903374394c90cfdd27a54 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Tue, 20 Jan 2026 21:11:09 +0530 Subject: [PATCH 15/15] Update todos, topic endpoints --- openml/_api/http/client.py | 2 +- openml/_api/resources/base.py | 6 ++++++ openml/_api/resources/datasets.py | 23 ++++++++++++++++++++- openml/datasets/functions.py | 33 ++++++++++++++++--------------- 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index ea812e0bb..1256b5d84 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -267,7 +267,7 @@ def download_minio_bucket(self, source: str, destination: str | Path | None = No parsed_url = urllib.parse.urlparse(source) # expect path format: /BUCKET/path/to/file.ext - _, bucket, *prefixes, _file = parsed_url.path.split("/") + _, bucket, *prefixes, _ = parsed_url.path.split("/") prefix = "/".join(prefixes) client = minio.Minio(endpoint=parsed_url.netloc, secure=False) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 5ad143db8..703351485 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -117,6 +117,12 @@ def download_dataset_arff( description: dict | OpenMLDataset, ) -> Path: ... + @abstractmethod + def add_topic(self, data_id: int, topic: str) -> int: ... + + @abstractmethod + def delete_topic(self, data_id: int, topic: str) -> int: ... 
+ class TasksAPI(ResourceAPI, ABC): @abstractmethod diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 1387bfd54..a7ff60555 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -28,11 +28,12 @@ if TYPE_CHECKING: from requests import Response - import openml import pandas as pd import xmltodict +import openml + logger = logging.getLogger(__name__) @@ -693,6 +694,20 @@ def download_dataset_arff( return output_file_path + def add_topic(self, data_id: int, topic: str) -> int: + form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE + result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + + def delete_topic(self, data_id: int, topic: str) -> int: + form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE + result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + class DatasetsV2(DatasetsAPI): def get( @@ -1100,3 +1115,9 @@ def download_dataset_arff( raise e return output_file_path + + def add_topic(self, data_id: int, topic: str) -> int: + raise NotImplementedError() + + def delete_topic(self, data_id: int, topic: str) -> int: + raise NotImplementedError() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 8b3fbd732..dd626eb08 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -80,7 +80,7 @@ def list_datasets( Parameters ---------- - dataset_id : list, optional + data_id : list, optional A list of data ids, to specify which datasets should be listed offset : int, optional @@ -842,6 +842,7 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology) +# TODO used only in tests def _topic_add_dataset(data_id: int, topic: str) -> int: """ Adds a topic for a dataset. @@ -858,15 +859,12 @@ def _topic_add_dataset(data_id: int, topic: str) -> int: ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_topic"]["oml:id"] - return int(data_id) + from openml._api import api_context + + return api_context.backend.datasets.add_topic(data_id, topic) +# TODO used only in tests def _topic_delete_dataset(data_id: int, topic: str) -> int: """ Removes a topic from a dataset. 
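With the topic endpoints on the backend, the module-level helpers reduce to thin wrappers around it. A minimal usage sketch; the dataset id and topic string are illustrative, topic management on the live server is typically restricted to administrators, and the V2 backend deliberately raises NotImplementedError for these calls:

from openml._api import api_context

# Both calls post to data/topicadd / data/topicdelete on the V1 API and
# return the dataset id echoed back inside the <oml:data_topic> response.
returned_id = api_context.backend.datasets.add_topic(128, "Computer Systems")
print(returned_id)  # expected to echo 128

api_context.backend.datasets.delete_topic(128, "Computer Systems")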
@@ -883,15 +881,12 @@ def _topic_delete_dataset(data_id: int, topic: str) -> int: ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_topic"]["oml:id"] - return int(data_id) + from openml._api import api_context + return api_context.backend.datasets.delete_topic(data_id, topic) + +# TODO used by tests only def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]: """Get the dataset description as xml dictionary. @@ -935,6 +930,7 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, return description # type: ignore +# TODO remove cache dir def _get_dataset_parquet( description: dict | OpenMLDataset, cache_directory: Path | None = None, # noqa: ARG001 @@ -972,6 +968,7 @@ def _get_dataset_parquet( return api_context.backend.datasets.download_dataset_parquet(description, download_all_files) +# TODO remove cache dir def _get_dataset_arff( description: dict | OpenMLDataset, cache_directory: Path | None = None, # noqa: ARG001 @@ -1003,6 +1000,7 @@ def _get_dataset_arff( return api_context.backend.datasets.download_dataset_arff(description) +# TODO remove cache dir def _get_dataset_features_file( did_cache_dir: str | Path | None, # noqa: ARG001 dataset_id: int, @@ -1033,6 +1031,7 @@ def _get_dataset_features_file( return api_context.backend.datasets.download_features_file(dataset_id) +# TODO remove cache dir def _get_dataset_qualities_file( did_cache_dir: str | Path | None, # noqa: ARG001 dataset_id: int, @@ -1060,9 +1059,10 @@ def _get_dataset_qualities_file( # cache directory not used here anymore from openml._api import api_context - return api_context.backend.datasets.download_features_file(dataset_id) + return api_context.backend.datasets.download_qualities_file(dataset_id) +# TODO used only in tests def _get_online_dataset_arff(dataset_id: int) -> str | None: """Download the ARFF file for a given dataset id from the OpenML website. @@ -1085,6 +1085,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None: ) +# TODO used only in tests def _get_online_dataset_format(dataset_id: int) -> str: """Get the dataset format for a given dataset id from the OpenML website.
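Note that qualities now go through download_qualities_file rather than the features method. A small caller-side sketch; the dataset id is illustrative, and a None return still signals that the dataset has no qualities on the server:

from openml._api import api_context

qualities_path = api_context.backend.datasets.download_qualities_file(61)
if qualities_path is None:
    print("dataset has no qualities")
else:
    print(f"qualities XML cached at {qualities_path}")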