From 0159f474c6bbc15f20d52bc946bd252bd852b196 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 30 Dec 2025 09:11:27 +0500 Subject: [PATCH 01/15] set up folder structure and base code --- openml/_api/__init__.py | 8 +++ openml/_api/config.py | 5 ++ openml/_api/http/__init__.py | 1 + openml/_api/http/client.py | 23 ++++++ openml/_api/http/utils.py | 0 openml/_api/resources/__init__.py | 2 + openml/_api/resources/base.py | 22 ++++++ openml/_api/resources/datasets.py | 13 ++++ openml/_api/resources/tasks.py | 113 ++++++++++++++++++++++++++++++ openml/_api/runtime/core.py | 58 +++++++++++++++ openml/_api/runtime/fallback.py | 5 ++ openml/tasks/functions.py | 8 ++- 12 files changed, 255 insertions(+), 3 deletions(-) create mode 100644 openml/_api/__init__.py create mode 100644 openml/_api/config.py create mode 100644 openml/_api/http/__init__.py create mode 100644 openml/_api/http/client.py create mode 100644 openml/_api/http/utils.py create mode 100644 openml/_api/resources/__init__.py create mode 100644 openml/_api/resources/base.py create mode 100644 openml/_api/resources/datasets.py create mode 100644 openml/_api/resources/tasks.py create mode 100644 openml/_api/runtime/core.py create mode 100644 openml/_api/runtime/fallback.py diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py new file mode 100644 index 000000000..5089f94dd --- /dev/null +++ b/openml/_api/__init__.py @@ -0,0 +1,8 @@ +from openml._api.runtime.core import APIContext + + +def set_api_version(version: str, strict=False): + api_context.set_version(version=version, strict=strict) + + +api_context = APIContext() diff --git a/openml/_api/config.py b/openml/_api/config.py new file mode 100644 index 000000000..bd93c3cad --- /dev/null +++ b/openml/_api/config.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +API_V1_SERVER = "https://www.openml.org/api/v1/xml" +API_V2_SERVER = "http://127.0.0.1:8001" +API_KEY = "..." 
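For orientation, a minimal sketch of how the entry point added in openml/_api/__init__.py is meant to be driven. Only set_api_version and api_context come from this patch; the exact fallback behaviour of strict=False is still an assumption at this stage of the series.

    import openml._api

    # Route subsequent resource calls through the experimental v2 backend.
    # With strict=False a v1 fallback may still be consulted; with
    # strict=True only v2 resources are used.
    openml._api.set_api_version("v2", strict=False)

    # The module-level context exposes whichever backend is currently active.
    backend = openml._api.api_context.backend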
diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py new file mode 100644 index 000000000..fde2a5b0a --- /dev/null +++ b/openml/_api/http/__init__.py @@ -0,0 +1 @@ +from openml._api.http.client import HTTPClient diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py new file mode 100644 index 000000000..81a9213e3 --- /dev/null +++ b/openml/_api/http/client.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import requests + +from openml.__version__ import __version__ + + +class HTTPClient: + def __init__(self, base_url: str): + self.base_url = base_url + self.headers = {"user-agent": f"openml-python/{__version__}"} + + def get(self, path, params=None): + url = f"{self.base_url}/{path}" + return requests.get(url, params=params, headers=self.headers) + + def post(self, path, data=None, files=None): + url = f"{self.base_url}/{path}" + return requests.post(url, data=data, files=files, headers=self.headers) + + def delete(self, path, params=None): + url = f"{self.base_url}/{path}" + return requests.delete(url, params=params, headers=self.headers) diff --git a/openml/_api/http/utils.py b/openml/_api/http/utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py new file mode 100644 index 000000000..078fc5998 --- /dev/null +++ b/openml/_api/resources/__init__.py @@ -0,0 +1,2 @@ +from openml._api.resources.datasets import DatasetsV1, DatasetsV2 +from openml._api.resources.tasks import TasksV1, TasksV2 diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py new file mode 100644 index 000000000..1fae27665 --- /dev/null +++ b/openml/_api/resources/base.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from openml._api.http import HTTPClient + + +class ResourceAPI: + def __init__(self, http: HTTPClient): + self._http = http + + +class DatasetsAPI(ResourceAPI, ABC): + @abstractmethod + def get(self, id: int) -> dict: ... + + +class TasksAPI(ResourceAPI, ABC): + @abstractmethod + def get(self, id: int) -> dict: ... diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py new file mode 100644 index 000000000..cd1bb595a --- /dev/null +++ b/openml/_api/resources/datasets.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from openml._api.resources.base import DatasetsAPI + + +class DatasetsV1(DatasetsAPI): + def get(self, id): + pass + + +class DatasetsV2(DatasetsAPI): + def get(self, id): + pass diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py new file mode 100644 index 000000000..b0e9afbf8 --- /dev/null +++ b/openml/_api/resources/tasks.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import xmltodict + +from openml._api.resources.base import TasksAPI +from openml.tasks.task import ( + OpenMLClassificationTask, + OpenMLClusteringTask, + OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLTask, + TaskType, +) + + +class TasksV1(TasksAPI): + def get(self, id, return_response=False): + path = f"task/{id}" + response = self._http.get(path) + xml_content = response.content + task = self._create_task_from_xml(xml_content) + + if return_response: + return task, response + + return task + + def _create_task_from_xml(self, xml: str) -> OpenMLTask: + """Create a task given a xml string. + + Parameters + ---------- + xml : string + Task xml representation. 
+ + Returns + ------- + OpenMLTask + """ + dic = xmltodict.parse(xml)["oml:task"] + estimation_parameters = {} + inputs = {} + # Due to the unordered structure we obtain, we first have to extract + # the possible keys of oml:input; dic["oml:input"] is a list of + # OrderedDicts + + # Check if there is a list of inputs + if isinstance(dic["oml:input"], list): + for input_ in dic["oml:input"]: + name = input_["@name"] + inputs[name] = input_ + # Single input case + elif isinstance(dic["oml:input"], dict): + name = dic["oml:input"]["@name"] + inputs[name] = dic["oml:input"] + + evaluation_measures = None + if "evaluation_measures" in inputs: + evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][ + "oml:evaluation_measure" + ] + + task_type = TaskType(int(dic["oml:task_type_id"])) + common_kwargs = { + "task_id": dic["oml:task_id"], + "task_type": dic["oml:task_type"], + "task_type_id": task_type, + "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"], + "evaluation_measure": evaluation_measures, + } + # TODO: add OpenMLClusteringTask? + if task_type in ( + TaskType.SUPERVISED_CLASSIFICATION, + TaskType.SUPERVISED_REGRESSION, + TaskType.LEARNING_CURVE, + ): + # Convert some more parameters + for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][ + "oml:parameter" + ]: + name = parameter["@name"] + text = parameter.get("#text", "") + estimation_parameters[name] = text + + common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][ + "oml:estimation_procedure" + ]["oml:type"] + common_kwargs["estimation_procedure_id"] = int( + inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"] + ) + + common_kwargs["estimation_parameters"] = estimation_parameters + common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"][ + "oml:target_feature" + ] + common_kwargs["data_splits_url"] = inputs["estimation_procedure"][ + "oml:estimation_procedure" + ]["oml:data_splits_url"] + + cls = { + TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask, + TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask, + TaskType.CLUSTERING: OpenMLClusteringTask, + TaskType.LEARNING_CURVE: OpenMLLearningCurveTask, + }.get(task_type) + if cls is None: + raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.") + return cls(**common_kwargs) # type: ignore + + +class TasksV2(TasksAPI): + def get(self, id): + pass diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py new file mode 100644 index 000000000..80f35587c --- /dev/null +++ b/openml/_api/runtime/core.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from openml._api.config import ( + API_V1_SERVER, + API_V2_SERVER, +) +from openml._api.http.client import HTTPClient +from openml._api.resources import ( + DatasetsV1, + DatasetsV2, + TasksV1, + TasksV2, +) +from openml._api.runtime.fallback import FallbackProxy + + +class APIBackend: + def __init__(self, *, datasets, tasks): + self.datasets = datasets + self.tasks = tasks + + +def build_backend(version: str, strict: bool) -> APIBackend: + v1_http = HTTPClient(API_V1_SERVER) + v2_http = HTTPClient(API_V2_SERVER) + + v1 = APIBackend( + datasets=DatasetsV1(v1_http), + tasks=TasksV1(v1_http), + ) + + if version == "v1": + return v1 + + v2 = APIBackend( + datasets=DatasetsV2(v2_http), + tasks=TasksV2(v2_http), + ) + + if strict: + return v2 + + return APIBackend( + datasets=FallbackProxy(v2.datasets, v1.datasets), + tasks=FallbackProxy(v2.tasks, v1.tasks), + ) + + +class 
APIContext: + def __init__(self): + self._backend = build_backend("v1", strict=False) + + def set_version(self, version: str, strict: bool = False): + self._backend = build_backend(version, strict) + + @property + def backend(self): + return self._backend diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py new file mode 100644 index 000000000..56e96a966 --- /dev/null +++ b/openml/_api/runtime/fallback.py @@ -0,0 +1,5 @@ +from __future__ import annotations + + +class FallbackProxy: + pass diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index d2bf5e946..91be65965 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -12,6 +12,7 @@ import openml._api_calls import openml.utils +from openml._api import api_context from openml.datasets import get_dataset from openml.exceptions import OpenMLCacheException @@ -442,11 +443,12 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") + task, response = api_context.backend.tasks.get(task_id, return_response=True) with xml_file.open("w", encoding="utf8") as fh: - fh.write(task_xml) - return _create_task_from_xml(task_xml) + fh.write(response.text) + + return task def _create_task_from_xml(xml: str) -> OpenMLTask: From 52ef37999fad8509e5e85b8512e442bd9dc69e04 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 5 Jan 2026 12:48:58 +0500 Subject: [PATCH 02/15] fix pre-commit --- openml/_api/__init__.py | 2 +- openml/_api/http/__init__.py | 2 ++ openml/_api/http/client.py | 32 +++++++++++++++++++++++-------- openml/_api/resources/__init__.py | 2 ++ openml/_api/resources/base.py | 13 +++++++++++-- openml/_api/resources/datasets.py | 15 +++++++++++---- openml/_api/resources/tasks.py | 25 +++++++++++++++++++----- openml/_api/runtime/__init__.py | 0 openml/_api/runtime/core.py | 23 +++++++++++----------- openml/_api/runtime/fallback.py | 9 ++++++++- openml/tasks/functions.py | 12 ++++++++---- 11 files changed, 99 insertions(+), 36 deletions(-) create mode 100644 openml/_api/runtime/__init__.py diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py index 5089f94dd..881f40671 100644 --- a/openml/_api/__init__.py +++ b/openml/_api/__init__.py @@ -1,7 +1,7 @@ from openml._api.runtime.core import APIContext -def set_api_version(version: str, strict=False): +def set_api_version(version: str, *, strict: bool = False) -> None: api_context.set_version(version=version, strict=strict) diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py index fde2a5b0a..8e6d1e4ce 100644 --- a/openml/_api/http/__init__.py +++ b/openml/_api/http/__init__.py @@ -1 +1,3 @@ from openml._api.http.client import HTTPClient + +__all__ = ["HTTPClient"] diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index 81a9213e3..dea5de809 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,23 +1,39 @@ from __future__ import annotations +from typing import Any, Mapping + import requests +from requests import Response from openml.__version__ import __version__ class HTTPClient: - def __init__(self, base_url: str): + def __init__(self, base_url: str) -> None: self.base_url = base_url - self.headers = {"user-agent": f"openml-python/{__version__}"} + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} - def get(self, path, 
params=None): + def get( + self, + path: str, + params: Mapping[str, Any] | None = None, + ) -> Response: url = f"{self.base_url}/{path}" - return requests.get(url, params=params, headers=self.headers) + return requests.get(url, params=params, headers=self.headers, timeout=10) - def post(self, path, data=None, files=None): + def post( + self, + path: str, + data: Mapping[str, Any] | None = None, + files: Any = None, + ) -> Response: url = f"{self.base_url}/{path}" - return requests.post(url, data=data, files=files, headers=self.headers) + return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) - def delete(self, path, params=None): + def delete( + self, + path: str, + params: Mapping[str, Any] | None = None, + ) -> Response: url = f"{self.base_url}/{path}" - return requests.delete(url, params=params, headers=self.headers) + return requests.delete(url, params=params, headers=self.headers, timeout=10) diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py index 078fc5998..b1af3c1a8 100644 --- a/openml/_api/resources/__init__.py +++ b/openml/_api/resources/__init__.py @@ -1,2 +1,4 @@ from openml._api.resources.datasets import DatasetsV1, DatasetsV2 from openml._api.resources.tasks import TasksV1, TasksV2 + +__all__ = ["DatasetsV1", "DatasetsV2", "TasksV1", "TasksV2"] diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 1fae27665..6fbf8977d 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -4,7 +4,11 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from requests import Response + from openml._api.http import HTTPClient + from openml.datasets.dataset import OpenMLDataset + from openml.tasks.task import OpenMLTask class ResourceAPI: @@ -14,9 +18,14 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): @abstractmethod - def get(self, id: int) -> dict: ... + def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... class TasksAPI(ResourceAPI, ABC): @abstractmethod - def get(self, id: int) -> dict: ... + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: ... 
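The abstract bases above are the extension point for further OpenML resources. As a sketch only (none of the names below exist in this series), a hypothetical flows resource would follow the same pattern: declare an interface next to DatasetsAPI and TasksAPI, then implement it per protocol version against the injected HTTPClient.

    from abc import ABC, abstractmethod

    import xmltodict

    from openml._api.resources.base import ResourceAPI


    class FlowsAPI(ResourceAPI, ABC):
        @abstractmethod
        def get(self, flow_id: int) -> dict: ...


    class FlowsV1(FlowsAPI):
        def get(self, flow_id: int) -> dict:
            # self._http is the HTTPClient injected by ResourceAPI.__init__;
            # the v1 server answers in XML, so parse it into a dict here.
            response = self._http.get(f"flow/{flow_id}")
            return xmltodict.parse(response.text)["oml:flow"]

A matching FlowsV2 plus a flows attribute on APIBackend in runtime/core.py would complete the wiring.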
diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index cd1bb595a..9ff1ec278 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,13 +1,20 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from openml._api.resources.base import DatasetsAPI +if TYPE_CHECKING: + from responses import Response + + from openml.datasets.dataset import OpenMLDataset + class DatasetsV1(DatasetsAPI): - def get(self, id): - pass + def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + raise NotImplementedError class DatasetsV2(DatasetsAPI): - def get(self, id): - pass + def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + raise NotImplementedError diff --git a/openml/_api/resources/tasks.py b/openml/_api/resources/tasks.py index b0e9afbf8..f494fb9a3 100644 --- a/openml/_api/resources/tasks.py +++ b/openml/_api/resources/tasks.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import xmltodict from openml._api.resources.base import TasksAPI @@ -12,12 +14,20 @@ TaskType, ) +if TYPE_CHECKING: + from requests import Response + class TasksV1(TasksAPI): - def get(self, id, return_response=False): - path = f"task/{id}" + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: + path = f"task/{task_id}" response = self._http.get(path) - xml_content = response.content + xml_content = response.text task = self._create_task_from_xml(xml_content) if return_response: @@ -109,5 +119,10 @@ def _create_task_from_xml(self, xml: str) -> OpenMLTask: class TasksV2(TasksAPI): - def get(self, id): - pass + def get( + self, + task_id: int, + *, + return_response: bool = False, + ) -> OpenMLTask | tuple[OpenMLTask, Response]: + raise NotImplementedError diff --git a/openml/_api/runtime/__init__.py b/openml/_api/runtime/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py index 80f35587c..aa09a69db 100644 --- a/openml/_api/runtime/core.py +++ b/openml/_api/runtime/core.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from openml._api.config import ( API_V1_SERVER, API_V2_SERVER, @@ -11,16 +13,18 @@ TasksV1, TasksV2, ) -from openml._api.runtime.fallback import FallbackProxy + +if TYPE_CHECKING: + from openml._api.resources.base import DatasetsAPI, TasksAPI class APIBackend: - def __init__(self, *, datasets, tasks): + def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI): self.datasets = datasets self.tasks = tasks -def build_backend(version: str, strict: bool) -> APIBackend: +def build_backend(version: str, *, strict: bool) -> APIBackend: v1_http = HTTPClient(API_V1_SERVER) v2_http = HTTPClient(API_V2_SERVER) @@ -40,19 +44,16 @@ def build_backend(version: str, strict: bool) -> APIBackend: if strict: return v2 - return APIBackend( - datasets=FallbackProxy(v2.datasets, v1.datasets), - tasks=FallbackProxy(v2.tasks, v1.tasks), - ) + return v1 class APIContext: - def __init__(self): + def __init__(self) -> None: self._backend = build_backend("v1", strict=False) - def set_version(self, version: str, strict: bool = False): - self._backend = build_backend(version, strict) + def set_version(self, version: str, *, strict: bool = False) -> None: + self._backend = build_backend(version=version, strict=strict) @property - def backend(self): + def backend(self) -> 
APIBackend: return self._backend diff --git a/openml/_api/runtime/fallback.py b/openml/_api/runtime/fallback.py index 56e96a966..1bc99d270 100644 --- a/openml/_api/runtime/fallback.py +++ b/openml/_api/runtime/fallback.py @@ -1,5 +1,12 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from openml._api.resources.base import ResourceAPI + class FallbackProxy: - pass + def __init__(self, primary: ResourceAPI, fallback: ResourceAPI): + self._primary = primary + self._fallback = fallback diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index ef67f75bf..a794ad56d 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -445,10 +445,14 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - task, response = api_context.backend.tasks.get(task_id, return_response=True) - - with xml_file.open("w", encoding="utf8") as fh: - fh.write(response.text) + result = api_context.backend.tasks.get(task_id, return_response=True) + + if isinstance(result, tuple): + task, response = result + with xml_file.open("w", encoding="utf8") as fh: + fh.write(response.text) + else: + task = result return task From f7ba710a9a3c457ec7c48ec45fa174c9194eeb98 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Tue, 6 Jan 2026 16:24:35 +0530 Subject: [PATCH 03/15] Merge base migration pr, ruff --- openml/_api/http/client.py | 5 +- openml/_api/resources/base.py | 70 ++++- openml/_api/resources/datasets.py | 440 +++++++++++++++++++++++++++++- 3 files changed, 504 insertions(+), 11 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index dea5de809..b0d3c911f 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -25,10 +25,13 @@ def post( self, path: str, data: Mapping[str, Any] | None = None, + json: dict | None = None, files: Any = None, ) -> Response: url = f"{self.base_url}/{path}" - return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) + return requests.post( + url, data=data, json=json, files=files, headers=self.headers, timeout=10 + ) def delete( self, diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 6fbf8977d..9d480b06a 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + import pandas as pd from requests import Response from openml._api.http import HTTPClient @@ -18,7 +19,74 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): @abstractmethod - def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + def get( + self, dataset_id: int, *, return_response: bool + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + + @abstractmethod + def list( # noqa: PLR0913 + self, + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, + ) -> pd.DataFrame: ... 
+ + def _name_to_id( + self, + dataset_name: str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT001, FBT002 + ) -> int: + """Attempt to find the dataset id of the dataset with the given name. + + If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``, + then return the least recent still active dataset. + + Raises an error if no dataset with the name is found. + Raises an error if a version is specified but it could not be found. + + Parameters + ---------- + dataset_name : str + The name of the dataset for which to find its id. + version : int, optional + Version to retrieve. If not specified, the oldest active version is returned. + error_if_multiple : bool (default=False) + If `False`, if multiple datasets match, return the least recent active dataset. + If `True`, if multiple datasets match, raise an error. + download_qualities : bool, optional (default=True) + If `True`, also download qualities.xml file. If False it skip the qualities.xml. + + Returns + ------- + int + The id of the dataset. + """ + status = None if version is not None else "active" + candidates = self.list( + data_name=dataset_name, + status=status, + data_version=version, + ) + if error_if_multiple and len(candidates) > 1: + msg = f"Multiple active datasets exist with name '{dataset_name}'." + raise ValueError(msg) + + if candidates.empty: + no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" + and_version = f" and version '{version}'." if version is not None else "." + raise RuntimeError(no_dataset_for_name + and_version) + + # Dataset ids are chronological so we can just sort based on ids (instead of version) + return candidates["did"].min() # type: ignore class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 9ff1ec278..f985cd75a 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,20 +1,442 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from openml._api.resources.base import DatasetsAPI +from functools import partial +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: - from responses import Response + from requests import Response - from openml.datasets.dataset import OpenMLDataset +import pandas as pd +import xmltodict + +import openml.utils +from openml._api.resources.base import DatasetsAPI +from openml.datasets.dataset import OpenMLDataset class DatasetsV1(DatasetsAPI): - def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - raise NotImplementedError + def get( + self, dataset_id: int, *, return_response: bool = False + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + path = f"data/{dataset_id}" + response = self._http.get(path) + xml_content = response.text # .text returns str, .content returns bytes + dataset = self._create_dataset_from_xml(xml_content) + + if return_response: + return dataset, response + + return dataset + + def list( # noqa: PLR0913 + self, + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, + ) -> pd.DataFrame: + """Return a dataframe of all dataset which are on OpenML. 
+ + Supports large amount of results. + + Parameters + ---------- + data_id : list, optional + A list of data ids, to specify which datasets should be + listed + offset : int, optional + The number of datasets to skip, starting from the first. + size : int, optional + The maximum number of datasets to show. + status : str, optional + Should be {active, in_preparation, deactivated}. By + default active datasets are returned, but also datasets + from another status can be requested. + tag : str, optional + data_name : str, optional + data_version : int, optional + number_instances : int | str, optional + number_features : int | str, optional + number_classes : int | str, optional + number_missing_values : int | str, optional + + Returns + ------- + datasets: dataframe + Each row maps to a dataset + Each column contains the following information: + - dataset id + - name + - format + - status + If qualities are calculated for the dataset, some of + these are also included as columns. + """ + listing_call = partial( + self._list_datasets, + data_id=data_id, + status=status, + tag=tag, + data_name=data_name, + data_version=data_version, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, + ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) + + def _list_datasets( + self, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. 
+ + Returns + ------- + datasets : dataframe + """ + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + api_call += f"/{operator}/{value}" + if data_id is not None: + api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" + return self.__list_datasets(api_call=api_call) + + def __list_datasets(self, api_call: str) -> pd.DataFrame: + xml_string = self._http.get(api_call).text + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: + """Create a dataset given a xml string. + + Parameters + ---------- + xml : string + Dataset xml representation. 
+ + Returns + ------- + OpenMLDataset + """ + description = xmltodict.parse(xml)["oml:data_set_description"] + + # TODO file path after download, cache_format default = 'pickle' + arff_file = None + features_file = None + parquet_file = None + qualities_file = None + + return OpenMLDataset( + description["oml:name"], + description.get("oml:description"), + data_format=description["oml:format"], + dataset_id=int(description["oml:id"]), + version=int(description["oml:version"]), + creator=description.get("oml:creator"), + contributor=description.get("oml:contributor"), + collection_date=description.get("oml:collection_date"), + upload_date=description.get("oml:upload_date"), + language=description.get("oml:language"), + licence=description.get("oml:licence"), + url=description["oml:url"], + default_target_attribute=description.get("oml:default_target_attribute"), + row_id_attribute=description.get("oml:row_id_attribute"), + ignore_attribute=description.get("oml:ignore_attribute"), + version_label=description.get("oml:version_label"), + citation=description.get("oml:citation"), + tag=description.get("oml:tag"), + visibility=description.get("oml:visibility"), + original_data_url=description.get("oml:original_data_url"), + paper_url=description.get("oml:paper_url"), + update_comment=description.get("oml:update_comment"), + md5_checksum=description.get("oml:md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=description.get("oml:parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) class DatasetsV2(DatasetsAPI): - def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - raise NotImplementedError + def get( + self, dataset_id: int, *, return_response: bool = False + ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + path = f"datasets/{dataset_id}" + response = self._http.get(path) + json_content = response.json() + dataset = self._create_dataset_from_json(json_content) + + if return_response: + return dataset, response + + return dataset + + def list( # noqa: PLR0913 + self, + data_id: list[int] | None = None, + offset: int | None = None, + size: int | None = None, + status: str | None = None, + tag: str | None = None, + data_name: str | None = None, + data_version: int | None = None, + number_instances: int | str | None = None, + number_features: int | str | None = None, + number_classes: int | str | None = None, + number_missing_values: int | str | None = None, + ) -> pd.DataFrame: + """Return a dataframe of all dataset which are on OpenML. + + Supports large amount of results. + + Parameters + ---------- + data_id : list, optional + A list of data ids, to specify which datasets should be + listed + offset : int, optional + The number of datasets to skip, starting from the first. + size : int, optional + The maximum number of datasets to show. + status : str, optional + Should be {active, in_preparation, deactivated}. By + default active datasets are returned, but also datasets + from another status can be requested. 
+ tag : str, optional + data_name : str, optional + data_version : int, optional + number_instances : int | str, optional + number_features : int | str, optional + number_classes : int | str, optional + number_missing_values : int | str, optional + + Returns + ------- + datasets: dataframe + Each row maps to a dataset + Each column contains the following information: + - dataset id + - name + - format + - status + If qualities are calculated for the dataset, some of + these are also included as columns. + """ + listing_call = partial( + self._list_datasets, + data_id=data_id, + status=status, + tag=tag, + data_name=data_name, + data_version=data_version, + number_instances=number_instances, + number_features=number_features, + number_classes=number_classes, + number_missing_values=number_missing_values, + ) + batches = openml.utils._list_all(listing_call, offset=offset, limit=size) + if len(batches) == 0: + return pd.DataFrame() + + return pd.concat(batches) + + def _list_datasets( + self, + limit: int, + offset: int, + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values, data_id. + + Returns + ------- + datasets : dataframe + """ + json: dict[str, Any] = {"pagination": {}} + + if limit is not None: + json["pagination"]["limit"] = limit + if offset is not None: + json["pagination"]["offset"] = offset + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + json[operator] = value + + return self.__list_datasets(json=json) + + def __list_datasets(self, json: dict) -> pd.DataFrame: + api_call = "datasets/list" + datasets_list = self._http.post(api_call, json=json).json() + + # Minimalistic check if the JSON is useful + assert isinstance(datasets_list, list), type(datasets_list) + + datasets = {} + for dataset_ in datasets_list: + ignore_attribute = ["file_id", "quality"] + dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("quality", []): + try: + dataset[quality["name"]] = int(quality["text"]) + except ValueError: + dataset[quality["name"]] = float(quality["text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: + """Create a dataset given a json. + + Parameters + ---------- + json_content : dict + Dataset dict/json representation. 
+ + Returns + ------- + OpenMLDataset + """ + # TODO file path after download, cache_format default = 'pickle' + arff_file = None + features_file = None + parquet_file = None + qualities_file = None + + return OpenMLDataset( + json_content["name"], + json_content.get("description"), + data_format=json_content["format"], + dataset_id=int(json_content["id"]), + version=int(json_content["version"]), + creator=json_content.get("creator"), + contributor=json_content.get("contributor"), + collection_date=json_content.get("collection_date"), + upload_date=json_content.get("upload_date"), + language=json_content.get("language"), + licence=json_content.get("licence"), + url=json_content["url"], + default_target_attribute=json_content.get("default_target_attribute"), + row_id_attribute=json_content.get("row_id_attribute"), + ignore_attribute=json_content.get("ignore_attribute"), + version_label=json_content.get("version_label"), + citation=json_content.get("citation"), + tag=json_content.get("tag"), + visibility=json_content.get("visibility"), + original_data_url=json_content.get("original_data_url"), + paper_url=json_content.get("paper_url"), + update_comment=json_content.get("update_comment"), + md5_checksum=json_content.get("md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=json_content.get("parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) From 5dfcbce55a027d19cd502ea7bb3d521c2b1bca29 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 7 Jan 2026 22:14:31 +0500 Subject: [PATCH 04/15] refactor --- openml/_api/config.py | 62 +++++++++++++++++++++++++++++++++++-- openml/_api/http/client.py | 18 +++++++---- openml/_api/runtime/core.py | 9 ++---- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/openml/_api/config.py b/openml/_api/config.py index bd93c3cad..1431f66b1 100644 --- a/openml/_api/config.py +++ b/openml/_api/config.py @@ -1,5 +1,61 @@ from __future__ import annotations -API_V1_SERVER = "https://www.openml.org/api/v1/xml" -API_V2_SERVER = "http://127.0.0.1:8001" -API_KEY = "..." 
+from dataclasses import dataclass +from typing import Literal + +DelayMethod = Literal["human", "robot"] + + +@dataclass +class APIConfig: + server: str + base_url: str + key: str + + +@dataclass +class APISettings: + v1: APIConfig + v2: APIConfig + + +@dataclass +class ConnectionConfig: + retries: int = 3 + delay_method: DelayMethod = "human" + delay_time: int = 1 # seconds + + def __post_init__(self) -> None: + if self.delay_method not in ("human", "robot"): + raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}") + + +@dataclass +class CacheConfig: + dir: str = "~/.openml/cache" + ttl: int = 60 * 60 * 24 * 7 # one week + + +@dataclass +class Settings: + api: APISettings + connection: ConnectionConfig + cache: CacheConfig + + +settings = Settings( + api=APISettings( + v1=APIConfig( + server="https://www.openml.org/", + base_url="api/v1/xml/", + key="...", + ), + v2=APIConfig( + server="http://127.0.0.1:8001/", + base_url="", + key="...", + ), + ), + connection=ConnectionConfig(), + cache=CacheConfig(), +) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index dea5de809..74e08c709 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,24 +1,30 @@ from __future__ import annotations -from typing import Any, Mapping +from typing import TYPE_CHECKING, Any, Mapping import requests from requests import Response from openml.__version__ import __version__ +if TYPE_CHECKING: + from openml._api.config import APIConfig + class HTTPClient: - def __init__(self, base_url: str) -> None: - self.base_url = base_url + def __init__(self, config: APIConfig) -> None: + self.config = config self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + def _create_url(self, path: str) -> str: + return self.config.server + self.config.base_url + path + def get( self, path: str, params: Mapping[str, Any] | None = None, ) -> Response: - url = f"{self.base_url}/{path}" + url = self._create_url(path) return requests.get(url, params=params, headers=self.headers, timeout=10) def post( @@ -27,7 +33,7 @@ def post( data: Mapping[str, Any] | None = None, files: Any = None, ) -> Response: - url = f"{self.base_url}/{path}" + url = self._create_url(path) return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) def delete( @@ -35,5 +41,5 @@ def delete( path: str, params: Mapping[str, Any] | None = None, ) -> Response: - url = f"{self.base_url}/{path}" + url = self._create_url(path) return requests.delete(url, params=params, headers=self.headers, timeout=10) diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py index aa09a69db..98b587411 100644 --- a/openml/_api/runtime/core.py +++ b/openml/_api/runtime/core.py @@ -2,10 +2,7 @@ from typing import TYPE_CHECKING -from openml._api.config import ( - API_V1_SERVER, - API_V2_SERVER, -) +from openml._api.config import settings from openml._api.http.client import HTTPClient from openml._api.resources import ( DatasetsV1, @@ -25,8 +22,8 @@ def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI): def build_backend(version: str, *, strict: bool) -> APIBackend: - v1_http = HTTPClient(API_V1_SERVER) - v2_http = HTTPClient(API_V2_SERVER) + v1_http = HTTPClient(config=settings.api.v1) + v2_http = HTTPClient(config=settings.api.v2) v1 = APIBackend( datasets=DatasetsV1(v1_http), From 2acbe9992cf95bfc103ff4fa0c360a58c1842870 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 7 Jan 2026 22:24:03 +0500 Subject: [PATCH 05/15] implement cache_dir --- 
openml/_api/http/client.py | 74 +++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index 74e08c709..49b05c88e 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,36 +1,93 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any +from urllib.parse import urlencode, urljoin, urlparse import requests from requests import Response from openml.__version__ import __version__ +from openml._api.config import settings if TYPE_CHECKING: from openml._api.config import APIConfig -class HTTPClient: +class CacheMixin: + @property + def dir(self) -> str: + return settings.cache.dir + + @property + def ttl(self) -> int: + return settings.cache.ttl + + def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path: + parsed_url = urlparse(url) + netloc_parts = parsed_url.netloc.split(".")[::-1] # reverse domain + path_parts = parsed_url.path.strip("/").split("/") + + # remove api_key and serialize params if any + filtered_params = {k: v for k, v in params.items() if k != "api_key"} + params_part = [urlencode(filtered_params)] if filtered_params else [] + + return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part) + + def _get_cache_response(self, url: str, params: dict[str, Any]) -> Response | None: # noqa: ARG002 + return None + + def _set_cache_response(self, url: str, params: dict[str, Any], response: Response) -> None: # noqa: ARG002 + return None + + +class HTTPClient(CacheMixin): def __init__(self, config: APIConfig) -> None: self.config = config self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} - def _create_url(self, path: str) -> str: - return self.config.server + self.config.base_url + path + @property + def server(self) -> str: + return self.config.server + + @property + def base_url(self) -> str: + return self.config.base_url + + def _create_url(self, path: str) -> Any: + return urljoin(self.server, urljoin(self.base_url, path)) def get( self, path: str, - params: Mapping[str, Any] | None = None, + *, + params: dict[str, Any] | None = None, + use_cache: bool = False, + use_api_key: bool = False, ) -> Response: url = self._create_url(path) - return requests.get(url, params=params, headers=self.headers, timeout=10) + params = dict(params) if params is not None else {} + + if use_api_key: + params["api_key"] = self.config.key + + if use_cache: + response = self._get_cache_response(url, params) + if response: + return response + + response = requests.get(url, params=params, headers=self.headers, timeout=10) + + if use_cache: + self._set_cache_response(url, params, response) + + return response def post( self, path: str, - data: Mapping[str, Any] | None = None, + *, + data: dict[str, Any] | None = None, files: Any = None, ) -> Response: url = self._create_url(path) @@ -39,7 +96,8 @@ def post( def delete( self, path: str, - params: Mapping[str, Any] | None = None, + *, + params: dict[str, Any] | None = None, ) -> Response: url = self._create_url(path) return requests.delete(url, params=params, headers=self.headers, timeout=10) From af99880a9e16a49833c63084c9e9267c112b6b91 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 7 Jan 2026 23:42:17 +0500 Subject: [PATCH 06/15] refactor --- openml/_api/config.py | 1 + openml/_api/http/client.py | 100 +++++++++++++++++++++++++++---------- 2 files changed, 75 insertions(+), 26 
deletions(-) diff --git a/openml/_api/config.py b/openml/_api/config.py index 1431f66b1..848fe8da1 100644 --- a/openml/_api/config.py +++ b/openml/_api/config.py @@ -11,6 +11,7 @@ class APIConfig: server: str base_url: str key: str + timeout: int = 10 # seconds @dataclass diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index 49b05c88e..a90e93933 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -23,7 +23,7 @@ def dir(self) -> str: def ttl(self) -> int: return settings.cache.ttl - def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path: + def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path: parsed_url = urlparse(url) netloc_parts = parsed_url.netloc.split(".")[::-1] # reverse domain path_parts = parsed_url.path.strip("/").split("/") @@ -34,10 +34,10 @@ def _get_cache_directory(self, url: str, params: dict[str, Any]) -> Path: return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part) - def _get_cache_response(self, url: str, params: dict[str, Any]) -> Response | None: # noqa: ARG002 - return None + def _get_cache_response(self, cache_dir: Path) -> Response: # noqa: ARG002 + return Response() - def _set_cache_response(self, url: str, params: dict[str, Any], response: Response) -> None: # noqa: ARG002 + def _set_cache_response(self, cache_dir: Path, response: Response) -> None: # noqa: ARG002 return None @@ -54,50 +54,98 @@ def server(self) -> str: def base_url(self) -> str: return self.config.base_url - def _create_url(self, path: str) -> Any: - return urljoin(self.server, urljoin(self.base_url, path)) + @property + def key(self) -> str: + return self.config.key - def get( + @property + def timeout(self) -> int: + return self.config.timeout + + def request( self, + method: str, path: str, *, - params: dict[str, Any] | None = None, use_cache: bool = False, use_api_key: bool = False, + **request_kwargs: Any, ) -> Response: - url = self._create_url(path) - params = dict(params) if params is not None else {} + url = urljoin(self.server, urljoin(self.base_url, path)) + params = request_kwargs.pop("params", {}) + params = params.copy() if use_api_key: - params["api_key"] = self.config.key + params["api_key"] = self.key - if use_cache: - response = self._get_cache_response(url, params) - if response: - return response + headers = request_kwargs.pop("headers", {}) + headers = headers.copy() + headers.update(self.headers) + + timeout = request_kwargs.pop("timeout", self.timeout) + cache_dir = self._get_cache_dir(url, params) - response = requests.get(url, params=params, headers=self.headers, timeout=10) + if use_cache: + try: + return self._get_cache_response(cache_dir) + # TODO: handle ttl expired error + except Exception: + raise + + response = requests.request( + method=method, + url=url, + params=params, + headers=headers, + timeout=timeout, + **request_kwargs, + ) if use_cache: - self._set_cache_response(url, params, response) + self._set_cache_response(cache_dir, response) return response - def post( + def get( self, path: str, *, - data: dict[str, Any] | None = None, - files: Any = None, + use_cache: bool = False, + use_api_key: bool = False, + **request_kwargs: Any, ) -> Response: - url = self._create_url(path) - return requests.post(url, data=data, files=files, headers=self.headers, timeout=10) + # TODO: remove override when cache is implemented + use_cache = False + return self.request( + method="GET", + path=path, + use_cache=use_cache, + use_api_key=use_api_key, + **request_kwargs, + ) + + def 
post( + self, + path: str, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="POST", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) def delete( self, path: str, - *, - params: dict[str, Any] | None = None, + **request_kwargs: Any, ) -> Response: - url = self._create_url(path) - return requests.delete(url, params=params, headers=self.headers, timeout=10) + return self.request( + method="DELETE", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) From 8964517d5fa9b656dc1473adfc09e9a56c524073 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Thu, 8 Jan 2026 09:14:49 +0530 Subject: [PATCH 07/15] edit, fork, delete updated --- openml/_api/resources/base.py | 36 +++- openml/_api/resources/datasets.py | 278 +++++++++++++++++++++++++++++- 2 files changed, 304 insertions(+), 10 deletions(-) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 9d480b06a..5a74239d1 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING +from typing_extensions import Literal if TYPE_CHECKING: import pandas as pd @@ -20,13 +21,18 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): @abstractmethod def get( - self, dataset_id: int, *, return_response: bool + self, + dataset_id: int | str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + *, + return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... @abstractmethod def list( # noqa: PLR0913 self, - data_id: list[int] | None = None, + data_id: list[int] | None = None, # type: ignore offset: int | None = None, size: int | None = None, status: str | None = None, @@ -39,6 +45,32 @@ def list( # noqa: PLR0913 number_missing_values: int | str | None = None, ) -> pd.DataFrame: ... + @abstractmethod + def delete(self, dataset_id: int) -> bool: ... + + @abstractmethod + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: ... + + @abstractmethod + def fork(self, data_id: int) -> int: ... + + @abstractmethod + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... 
+ def _name_to_id( self, dataset_name: str, diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index f985cd75a..5414fba43 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,7 +1,9 @@ from __future__ import annotations +from collections import OrderedDict from functools import partial from typing import TYPE_CHECKING, Any +from typing_extensions import Literal if TYPE_CHECKING: from requests import Response @@ -16,11 +18,23 @@ class DatasetsV1(DatasetsAPI): def get( - self, dataset_id: int, *, return_response: bool = False + self, + dataset_id: int | str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + *, + return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - path = f"data/{dataset_id}" + if isinstance(dataset_id, int): + resolved_id = dataset_id + elif dataset_id.isdigit(): + resolved_id = int(dataset_id) + else: + resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) + + path = f"data/{resolved_id}" response = self._http.get(path) - xml_content = response.text # .text returns str, .content returns bytes + xml_content = response.text dataset = self._create_dataset_from_xml(xml_content) if return_response: @@ -97,6 +111,194 @@ def list( # noqa: PLR0913 return pd.concat(batches) + def delete(self, dataset_id: int) -> bool: + """Delete dataset with id `dataset_id` from the OpenML server. + + This can only be done if you are the owner of the dataset and + no tasks are attached to the dataset. + + Parameters + ---------- + dataset_id : int + OpenML id of the dataset + + Returns + ------- + bool + True if the deletion was successful. False otherwise. + """ + return openml.utils._delete_entity("data", dataset_id) + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + """Edits an OpenMLDataset. + + In addition to providing the dataset id of the dataset to edit (through data_id), + you must specify a value for at least one of the optional function arguments, + i.e. one value for a field to edit. + + This function allows editing of both non-critical and critical fields. + Critical fields are default_target_attribute, ignore_attribute, row_id_attribute. + + - Editing non-critical data fields is allowed for all authenticated users. + - Editing critical fields is allowed only for the owner, provided there are no tasks + associated with this dataset. + + If dataset has tasks or if the user is not the owner, the only way + to edit critical fields is to use fork_dataset followed by edit_dataset. + + Parameters + ---------- + data_id : int + ID of the dataset. + description : str + Description of the dataset. + creator : str + The person who created the dataset. + contributor : str + People who contributed to the current version of the dataset. + collection_date : str + The date the data was originally collected, given by the uploader. + language : str + Language in which the data is represented. + Starts with 1 upper case letter, rest lower case, e.g. 'English'. 
+ default_target_attribute : str + The default target attribute, if it exists. + Can have multiple values, comma separated. + ignore_attribute : str | list + Attributes that should be excluded in modelling, + such as identifiers and indexes. + citation : str + Reference(s) that should be cited when building on this data. + row_id_attribute : str, optional + The attribute that represents the row-id column, if present in the + dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not + specified, the index of the dataframe will be used as the + ``row_id_attribute``. If the name of the index is ``None``, it will + be discarded. + + .. versionadded: 0.8 + Inference of ``row_id_attribute`` from a dataframe. + original_data_url : str, optional + For derived data, the url to the original dataset. + paper_url : str, optional + Link to a paper describing the dataset. + + Returns + ------- + Dataset id + """ + if not isinstance(data_id, int): + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + + # compose data edit parameters as xml + form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE + xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + xml["oml:data_edit_parameters"] = OrderedDict() + xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" + xml["oml:data_edit_parameters"]["oml:description"] = description + xml["oml:data_edit_parameters"]["oml:creator"] = creator + xml["oml:data_edit_parameters"]["oml:contributor"] = contributor + xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date + xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute + xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute + xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute + xml["oml:data_edit_parameters"]["oml:citation"] = citation + xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url + xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url + + # delete None inputs + for k in list(xml["oml:data_edit_parameters"]): + if not xml["oml:data_edit_parameters"][k]: + del xml["oml:data_edit_parameters"][k] + + file_elements = { + "edit_parameters": ("description.xml", xmltodict.unparse(xml)), + } # type: openml._api_calls.FILE_ELEMENTS_TYPE + result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text + result = xmltodict.parse(result_xml) + data_id = result["oml:data_edit"]["oml:id"] + return int(data_id) + + def fork(self, data_id: int) -> int: + """ + Creates a new dataset version, with the authenticated user as the new owner. + The forked dataset can have distinct dataset meta-data, + but the actual data itself is shared with the original version. + + This API is intended for use when a user is unable to edit the critical fields of a dataset + through the edit_dataset API. + (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.) + + Specifically, this happens when the user is: + 1. Not the owner of the dataset. + 2. User is the owner of the dataset, but the dataset has tasks. + + In these two cases the only way to edit critical fields is: + 1. STEP 1: Fork the dataset using fork_dataset API + 2. STEP 2: Call edit_dataset API on the forked version. 
+ + + Parameters + ---------- + data_id : int + id of the dataset to be forked + + Returns + ------- + Dataset id of the forked dataset + + """ + if not isinstance(data_id, int): + raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + # compose data fork parameters + form_data = {"data_id": data_id} + result_xml = self._http.post("data/fork", data=form_data).text + result = xmltodict.parse(result_xml) + data_id = result["oml:data_fork"]["oml:id"] + return int(data_id) + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. + Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} + result_xml = self._http.post("data/status/update", data=data).text + result = xmltodict.parse(result_xml) + server_data_id = result["oml:data_status_update"]["oml:id"] + server_status = result["oml:data_status_update"]["oml:status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + def _list_datasets( self, limit: int, @@ -236,9 +438,21 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: class DatasetsV2(DatasetsAPI): def get( - self, dataset_id: int, *, return_response: bool = False + self, + dataset_id: int | str, + version: int | None = None, + error_if_multiple: bool = False, # noqa: FBT002, FBT001 + *, + return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - path = f"datasets/{dataset_id}" + if isinstance(dataset_id, int): + resolved_id = dataset_id + elif dataset_id.isdigit(): + resolved_id = int(dataset_id) + else: + resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) + + path = f"data/{resolved_id}" response = self._http.get(path) json_content = response.json() dataset = self._create_dataset_from_json(json_content) @@ -317,6 +531,55 @@ def list( # noqa: PLR0913 return pd.concat(batches) + def delete(self, dataset_id: int) -> bool: + raise NotImplementedError() + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + raise NotImplementedError() + + def fork(self, data_id: int) -> int: + raise NotImplementedError() + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. 
+ Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} + result = self._http.post("datasets/status/update", json=data).json() + server_data_id = result["dataset_id"] + server_status = result["status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + def _list_datasets( self, limit: int, @@ -365,7 +628,6 @@ def _list_datasets( def __list_datasets(self, json: dict) -> pd.DataFrame: api_call = "datasets/list" datasets_list = self._http.post(api_call, json=json).json() - # Minimalistic check if the JSON is useful assert isinstance(datasets_list, list), type(datasets_list) @@ -379,9 +641,9 @@ def __list_datasets(self, json: dict) -> pd.DataFrame: # The number of qualities can range from 0 to infinity for quality in dataset_.get("quality", []): try: - dataset[quality["name"]] = int(quality["text"]) + dataset[quality["name"]] = int(quality["value"]) except ValueError: - dataset[quality["name"]] = float(quality["text"]) + dataset[quality["name"]] = float(quality["value"]) datasets[dataset["did"]] = dataset return pd.DataFrame.from_dict(datasets, orient="index").astype( From 1c2fa9996aa0024af93ab1819877836b6ab803f2 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Thu, 8 Jan 2026 15:57:09 +0530 Subject: [PATCH 08/15] Added features, updated list --- openml/_api/resources/base.py | 76 +---- openml/_api/resources/datasets.py | 494 +++++++++++++----------------- 2 files changed, 234 insertions(+), 336 deletions(-) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 5a74239d1..990dda998 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from typing_extensions import Literal if TYPE_CHECKING: @@ -23,26 +23,18 @@ class DatasetsAPI(ResourceAPI, ABC): def get( self, dataset_id: int | str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT002, FBT001 *, return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... @abstractmethod - def list( # noqa: PLR0913 + def list( self, + limit: int, + offset: int, + *, data_id: list[int] | None = None, # type: ignore - offset: int | None = None, - size: int | None = None, - status: str | None = None, - tag: str | None = None, - data_name: str | None = None, - data_version: int | None = None, - number_instances: int | str | None = None, - number_features: int | str | None = None, - number_classes: int | str | None = None, - number_missing_values: int | str | None = None, + **kwargs: Any, ) -> pd.DataFrame: ... @abstractmethod @@ -71,54 +63,14 @@ def fork(self, data_id: int) -> int: ... @abstractmethod def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... 
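Note on the page-oriented `list(limit, offset, ...)` signature introduced above: it is meant to be driven by a batching caller, the way `list_datasets` later in this series feeds `openml.utils._list_all` with a `partial` over the backend's `list`. The following is only an illustrative sketch of that pattern; the helper name `list_all_datasets` and the assumption that an empty page marks the end of the listing are not part of the patch.

import pandas as pd

from openml._api import api_context


def list_all_datasets(page_size: int = 1000, **filters) -> pd.DataFrame:
    # Hypothetical batching driver: fetch one page at a time from whichever
    # backend (v1 XML or v2 JSON) the api_context currently selects.
    datasets = api_context.backend.datasets
    batches, offset = [], 0
    while True:
        batch = datasets.list(limit=page_size, offset=offset, **filters)
        if batch.empty:
            # Assumption for this sketch: an empty frame means no more pages.
            break
        batches.append(batch)
        offset += page_size
    return pd.concat(batches) if batches else pd.DataFrame()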
- def _name_to_id( - self, - dataset_name: str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT001, FBT002 - ) -> int: - """Attempt to find the dataset id of the dataset with the given name. - - If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``, - then return the least recent still active dataset. - - Raises an error if no dataset with the name is found. - Raises an error if a version is specified but it could not be found. - - Parameters - ---------- - dataset_name : str - The name of the dataset for which to find its id. - version : int, optional - Version to retrieve. If not specified, the oldest active version is returned. - error_if_multiple : bool (default=False) - If `False`, if multiple datasets match, return the least recent active dataset. - If `True`, if multiple datasets match, raise an error. - download_qualities : bool, optional (default=True) - If `True`, also download qualities.xml file. If False it skip the qualities.xml. - - Returns - ------- - int - The id of the dataset. - """ - status = None if version is not None else "active" - candidates = self.list( - data_name=dataset_name, - status=status, - data_version=version, - ) - if error_if_multiple and len(candidates) > 1: - msg = f"Multiple active datasets exist with name '{dataset_name}'." - raise ValueError(msg) - - if candidates.empty: - no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" - and_version = f" and version '{version}'." if version is not None else "." - raise RuntimeError(no_dataset_for_name + and_version) - - # Dataset ids are chronological so we can just sort based on ids (instead of version) - return candidates["did"].min() # type: ignore + @abstractmethod + def list_qualities(self) -> list[str]: ... # type: ignore + + @abstractmethod + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + + @abstractmethod + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... 
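A brief usage sketch of the intent behind these new abstract methods (illustrative only; the dataset id and ontology URL below are made up): callers obtain whichever concrete resource the context selected and program against the shared surface, so call sites do not branch on the server version.

from openml._api import api_context

datasets = api_context.backend.datasets        # resolves to DatasetsV1 or DatasetsV2
qualities = datasets.list_qualities()          # same call regardless of backend
datasets.feature_add_ontology(42, 0, "http://example.org/ontology")  # hypothetical id/URL;
# at this point in the series the v2 backend raises NotImplementedError for this call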
class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 5414fba43..845212b20 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections import OrderedDict -from functools import partial from typing import TYPE_CHECKING, Any from typing_extensions import Literal @@ -20,19 +19,10 @@ class DatasetsV1(DatasetsAPI): def get( self, dataset_id: int | str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT002, FBT001 *, return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - if isinstance(dataset_id, int): - resolved_id = dataset_id - elif dataset_id.isdigit(): - resolved_id = int(dataset_id) - else: - resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) - - path = f"data/{resolved_id}" + path = f"data/{dataset_id}" response = self._http.get(path) xml_content = response.text dataset = self._create_dataset_from_xml(xml_content) @@ -42,74 +32,88 @@ def get( return dataset - def list( # noqa: PLR0913 + def list( self, - data_id: list[int] | None = None, - offset: int | None = None, - size: int | None = None, - status: str | None = None, - tag: str | None = None, - data_name: str | None = None, - data_version: int | None = None, - number_instances: int | str | None = None, - number_features: int | str | None = None, - number_classes: int | str | None = None, - number_missing_values: int | str | None = None, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, ) -> pd.DataFrame: - """Return a dataframe of all dataset which are on OpenML. - - Supports large amount of results. + """ + Perform api call to return a list of all datasets. Parameters ---------- - data_id : list, optional - A list of data ids, to specify which datasets should be - listed - offset : int, optional - The number of datasets to skip, starting from the first. - size : int, optional + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int The maximum number of datasets to show. - status : str, optional - Should be {active, in_preparation, deactivated}. By - default active datasets are returned, but also datasets - from another status can be requested. - tag : str, optional - data_name : str, optional - data_version : int, optional - number_instances : int | str, optional - number_features : int | str, optional - number_classes : int | str, optional - number_missing_values : int | str, optional + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. Returns ------- - datasets: dataframe - Each row maps to a dataset - Each column contains the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also included as columns. 
+ datasets : dataframe """ - listing_call = partial( - self._list_datasets, - data_id=data_id, - status=status, - tag=tag, - data_name=data_name, - data_version=data_version, - number_instances=number_instances, - number_features=number_features, - number_classes=number_classes, - number_missing_values=number_missing_values, + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + api_call += f"/{operator}/{value}" + if data_id is not None: + api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" + + xml_string = self._http.get(api_call).text + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], ) - batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - if len(batches) == 0: - return pd.DataFrame() + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset - return pd.concat(batches) + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) def delete(self, dataset_id: int) -> bool: """Delete dataset with id `dataset_id` from the OpenML server. @@ -299,90 +303,27 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) # This should never happen raise ValueError("Data id/status does not collide") - def _list_datasets( - self, - limit: int, - offset: int, - *, - data_id: list[int] | None = None, # type: ignore - **kwargs: Any, - ) -> pd.DataFrame: - """ - Perform api call to return a list of all datasets. - - Parameters - ---------- - The arguments that are lists are separated from the single value - ones which are put into the kwargs. - display_errors is also separated from the kwargs since it has a - default value. + def list_qualities(self) -> list[str]: # type: ignore + """Return list of data qualities available. - limit : int - The maximum number of datasets to show. - offset : int - The number of datasets to skip, starting from the first. - data_id : list, optional - - kwargs : dict, optional - Legal filter operators (keys in the dict): - tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. 
Returns ------- - datasets : dataframe + list """ - api_call = "data/list" - - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - if data_id is not None: - api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - return self.__list_datasets(api_call=api_call) - - def __list_datasets(self, api_call: str) -> pd.DataFrame: + api_call = "data/qualities/list" xml_string = self._http.get(api_call).text - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - + qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) + if "oml:data_qualities_list" not in qualities: + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset + if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): + raise TypeError('Error in return XML, does not contain "oml:quality" as a list') - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) + return qualities["oml:data_qualities_list"]["oml:quality"] def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: """Create a dataset given a xml string. @@ -435,24 +376,74 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: parquet_file=str(parquet_file) if parquet_file is not None else None, ) + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + """ + An ontology describes the concept that are described in a feature. An + ontology is defined by an URL where the information is provided. Adds + an ontology (URL) to a given dataset feature (defined by a dataset id + and index). The dataset has to exists on OpenML and needs to have been + processed by the evaluation engine. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 
256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": data_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/add", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + """ + Removes an existing ontology (URL) from a given dataset feature (defined + by a dataset id and index). The dataset has to exists on OpenML and needs + to have been processed by the evaluation engine. Ontology needs to be + attached to the specific fearure. + + Parameters + ---------- + data_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": data_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/remove", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + class DatasetsV2(DatasetsAPI): def get( self, dataset_id: int | str, - version: int | None = None, - error_if_multiple: bool = False, # noqa: FBT002, FBT001 *, return_response: bool = False, ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - if isinstance(dataset_id, int): - resolved_id = dataset_id - elif dataset_id.isdigit(): - resolved_id = int(dataset_id) - else: - resolved_id = self._name_to_id(dataset_id, version, error_if_multiple) - - path = f"data/{resolved_id}" + path = f"data/{dataset_id}" response = self._http.get(path) json_content = response.json() dataset = self._create_dataset_from_json(json_content) @@ -462,125 +453,7 @@ def get( return dataset - def list( # noqa: PLR0913 - self, - data_id: list[int] | None = None, - offset: int | None = None, - size: int | None = None, - status: str | None = None, - tag: str | None = None, - data_name: str | None = None, - data_version: int | None = None, - number_instances: int | str | None = None, - number_features: int | str | None = None, - number_classes: int | str | None = None, - number_missing_values: int | str | None = None, - ) -> pd.DataFrame: - """Return a dataframe of all dataset which are on OpenML. - - Supports large amount of results. - - Parameters - ---------- - data_id : list, optional - A list of data ids, to specify which datasets should be - listed - offset : int, optional - The number of datasets to skip, starting from the first. - size : int, optional - The maximum number of datasets to show. - status : str, optional - Should be {active, in_preparation, deactivated}. By - default active datasets are returned, but also datasets - from another status can be requested. - tag : str, optional - data_name : str, optional - data_version : int, optional - number_instances : int | str, optional - number_features : int | str, optional - number_classes : int | str, optional - number_missing_values : int | str, optional - - Returns - ------- - datasets: dataframe - Each row maps to a dataset - Each column contains the following information: - - dataset id - - name - - format - - status - If qualities are calculated for the dataset, some of - these are also included as columns. 
- """ - listing_call = partial( - self._list_datasets, - data_id=data_id, - status=status, - tag=tag, - data_name=data_name, - data_version=data_version, - number_instances=number_instances, - number_features=number_features, - number_classes=number_classes, - number_missing_values=number_missing_values, - ) - batches = openml.utils._list_all(listing_call, offset=offset, limit=size) - if len(batches) == 0: - return pd.DataFrame() - - return pd.concat(batches) - - def delete(self, dataset_id: int) -> bool: - raise NotImplementedError() - - def edit( # noqa: PLR0913 - self, - data_id: int, - description: str | None = None, - creator: str | None = None, - contributor: str | None = None, - collection_date: str | None = None, - language: str | None = None, - default_target_attribute: str | None = None, - ignore_attribute: str | list[str] | None = None, # type: ignore - citation: str | None = None, - row_id_attribute: str | None = None, - original_data_url: str | None = None, - paper_url: str | None = None, - ) -> int: - raise NotImplementedError() - - def fork(self, data_id: int) -> int: - raise NotImplementedError() - - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: - """ - Updates the status of a dataset to either 'active' or 'deactivated'. - Please see the OpenML API documentation for a description of the status - and all legal status transitions: - https://docs.openml.org/concepts/data/#dataset-status - - Parameters - ---------- - data_id : int - The data id of the dataset - status : str, - 'active' or 'deactivated' - """ - legal_status = {"active", "deactivated"} - if status not in legal_status: - raise ValueError(f"Illegal status value. Legal values: {legal_status}") - - data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} - result = self._http.post("datasets/status/update", json=data).json() - server_data_id = result["dataset_id"] - server_status = result["status"] - if status != server_status or int(data_id) != int(server_data_id): - # This should never happen - raise ValueError("Data id/status does not collide") - - def _list_datasets( + def list( self, limit: int, offset: int, @@ -623,9 +496,6 @@ def _list_datasets( if value is not None: json[operator] = value - return self.__list_datasets(json=json) - - def __list_datasets(self, json: dict) -> pd.DataFrame: api_call = "datasets/list" datasets_list = self._http.post(api_call, json=json).json() # Minimalistic check if the JSON is useful @@ -654,6 +524,76 @@ def __list_datasets(self, json: dict) -> pd.DataFrame: } ) + def delete(self, dataset_id: int) -> bool: + raise NotImplementedError() + + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + raise NotImplementedError() + + def fork(self, data_id: int) -> int: + raise NotImplementedError() + + def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. 
+ Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + data_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} + result = self._http.post("datasets/status/update", json=data).json() + server_data_id = result["dataset_id"] + server_status = result["status"] + if status != server_status or int(data_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + + def list_qualities(self) -> list[str]: # type: ignore + """Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. + + Returns + ------- + list + """ + api_call = "datasets/qualities/list" + qualities = self._http.get(api_call).json() + # Minimalistic check if the XML is useful + if "data_qualities_list" not in qualities: + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') + + if not isinstance(qualities["data_qualities_list"]["quality"], list): + raise TypeError('Error in return json, does not contain "quality" as a list') + + return qualities["data_qualities_list"]["quality"] + def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: """Create a dataset given a json. @@ -702,3 +642,9 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: parquet_url=json_content.get("parquet_url"), parquet_file=str(parquet_file) if parquet_file is not None else None, ) + + def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + raise NotImplementedError() + + def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + raise NotImplementedError() From 9bcbcb32c232bb35b34e90ad7739de6c938ee5f3 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Fri, 9 Jan 2026 13:01:34 +0530 Subject: [PATCH 09/15] Refactor functions, except get --- openml/datasets/functions.py | 181 ++++------------------------------- 1 file changed, 21 insertions(+), 160 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index ac5466a44..23cdefdd2 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -5,7 +5,6 @@ import logging import os import warnings -from collections import OrderedDict from functools import partial from pathlib import Path from pyexpat import ExpatError @@ -22,6 +21,7 @@ import openml._api_calls import openml.utils +from openml._api import api_context from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( OpenMLHashException, @@ -65,17 +65,7 @@ def list_qualities() -> list[str]: ------- list """ - api_call = "data/qualities/list" - xml_string = openml._api_calls._perform_api_call(api_call, "get") - qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) - # Minimalistic check if the XML is useful - if "oml:data_qualities_list" not in qualities: - raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') - - if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): - raise TypeError('Error in return XML, does not contain "oml:quality" 
as a list') - - return qualities["oml:data_qualities_list"]["oml:quality"] + return api_context.backend.datasets.list_qualities() def list_datasets( @@ -129,7 +119,7 @@ def list_datasets( these are also included as columns. """ listing_call = partial( - _list_datasets, + api_context.backend.datasets.list, data_id=data_id, status=status, tag=tag, @@ -147,92 +137,6 @@ def list_datasets( return pd.concat(batches) -def _list_datasets( - limit: int, - offset: int, - *, - data_id: list[int] | None = None, - **kwargs: Any, -) -> pd.DataFrame: - """ - Perform api call to return a list of all datasets. - - Parameters - ---------- - The arguments that are lists are separated from the single value - ones which are put into the kwargs. - display_errors is also separated from the kwargs since it has a - default value. - - limit : int - The maximum number of datasets to show. - offset : int - The number of datasets to skip, starting from the first. - data_id : list, optional - - kwargs : dict, optional - Legal filter operators (keys in the dict): - tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. - - Returns - ------- - datasets : dataframe - """ - api_call = "data/list" - - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - if data_id is not None: - api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - return __list_datasets(api_call=api_call) - - -def __list_datasets(api_call: str) -> pd.DataFrame: - xml_string = openml._api_calls._perform_api_call(api_call, "get") - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - - # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) - - def _expand_parameter(parameter: str | list[str] | None) -> list[str]: expanded_parameter = [] if isinstance(parameter, str): @@ -808,14 +712,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} - result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data) - result = xmltodict.parse(result_xml) - server_data_id = result["oml:data_status_update"]["oml:id"] - server_status = result["oml:data_status_update"]["oml:status"] - if status != server_status or int(data_id) != int(server_data_id): - # This should never happen - raise ValueError("Data id/status does not collide") + api_context.backend.datasets.status_update(data_id=data_id, status=status) def edit_dataset( @@ -889,43 +786,20 @@ def edit_dataset( ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - - # compose data edit parameters as xml - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE - xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' - xml["oml:data_edit_parameters"] = OrderedDict() - xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" - xml["oml:data_edit_parameters"]["oml:description"] = description - xml["oml:data_edit_parameters"]["oml:creator"] = creator - xml["oml:data_edit_parameters"]["oml:contributor"] = contributor - xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date - xml["oml:data_edit_parameters"]["oml:language"] = language - xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute - xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute - xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute - xml["oml:data_edit_parameters"]["oml:citation"] = citation - xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url - xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url - - # delete None inputs - for k in list(xml["oml:data_edit_parameters"]): - if not xml["oml:data_edit_parameters"][k]: - del xml["oml:data_edit_parameters"][k] - - file_elements = { - "edit_parameters": ("description.xml", xmltodict.unparse(xml)), - } # type: openml._api_calls.FILE_ELEMENTS_TYPE - result_xml = openml._api_calls._perform_api_call( - "data/edit", - "post", - data=form_data, - file_elements=file_elements, + return api_context.backend.datasets.edit( + data_id, + description, + creator, + contributor, + collection_date, + language, + default_target_attribute, + ignore_attribute, + citation, + row_id_attribute, + original_data_url, + paper_url, ) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_edit"]["oml:id"] - return int(data_id) def fork_dataset(data_id: int) -> int: @@ -957,14 +831,7 @@ def fork_dataset(data_id: int) -> int: Dataset id of the forked dataset """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - # compose data fork parameters - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_fork"]["oml:id"] - return int(data_id) + return api_context.backend.datasets.fork(data_id=data_id) def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: @@ -988,10 +855,7 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: ------- True or throws an OpenML server exception """ - upload_data: dict[str, int | str] = {"data_id": data_id, "index": 
index, "ontology": ontology} - openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data) - # an error will be thrown in case the request was unsuccessful - return True + return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology) def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool: @@ -1014,10 +878,7 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo ------- True or throws an OpenML server exception """ - upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} - openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data) - # an error will be thrown in case the request was unsuccessful - return True + return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology) def _topic_add_dataset(data_id: int, topic: str) -> int: @@ -1460,4 +1321,4 @@ def delete_dataset(dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. """ - return openml.utils._delete_entity("data", dataset_id) + return api_context.backend.datasets.delete(dataset_id) From 96df5e30b46ea80633cb9593ceacf36ff10c8308 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Fri, 9 Jan 2026 15:38:07 +0530 Subject: [PATCH 10/15] Remove circular import using lazy import --- openml/datasets/functions.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 23cdefdd2..6ede42ea9 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -21,7 +21,6 @@ import openml._api_calls import openml.utils -from openml._api import api_context from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( OpenMLHashException, @@ -65,6 +64,8 @@ def list_qualities() -> list[str]: ------- list """ + from openml._api import api_context + return api_context.backend.datasets.list_qualities() @@ -118,6 +119,8 @@ def list_datasets( If qualities are calculated for the dataset, some of these are also included as columns. """ + from openml._api import api_context + listing_call = partial( api_context.backend.datasets.list, data_id=data_id, @@ -708,6 +711,8 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non status : str, 'active' or 'deactivated' """ + from openml._api import api_context + legal_status = {"active", "deactivated"} if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") @@ -786,6 +791,8 @@ def edit_dataset( ------- Dataset id """ + from openml._api import api_context + return api_context.backend.datasets.edit( data_id, description, @@ -831,6 +838,8 @@ def fork_dataset(data_id: int) -> int: Dataset id of the forked dataset """ + from openml._api import api_context + return api_context.backend.datasets.fork(data_id=data_id) @@ -855,6 +864,8 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: ------- True or throws an OpenML server exception """ + from openml._api import api_context + return api_context.backend.datasets.feature_add_ontology(data_id, index, ontology) @@ -878,6 +889,8 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo ------- True or throws an OpenML server exception """ + from openml._api import api_context + return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology) @@ -1321,4 +1334,6 @@ def delete_dataset(dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. """ + from openml._api import api_context + return api_context.backend.datasets.delete(dataset_id) From c955f43c7f2cbb86fdd759b179b6c2dcfcf8b7e5 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Wed, 14 Jan 2026 17:47:23 +0530 Subject: [PATCH 11/15] Modify reviews, feature and qualities --- openml/_api/resources/base.py | 27 +- openml/_api/resources/datasets.py | 437 +++++++++++++++++++++++------- openml/datasets/dataset.py | 77 +----- openml/datasets/functions.py | 6 +- 4 files changed, 376 insertions(+), 171 deletions(-) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 990dda998..3030ce6ff 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +from pathlib import Path from typing import TYPE_CHECKING, Any from typing_extensions import Literal @@ -9,7 +10,7 @@ from requests import Response from openml._api.http import HTTPClient - from openml.datasets.dataset import OpenMLDataset + from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset from openml.tasks.task import OpenMLTask @@ -58,19 +59,35 @@ def edit( # noqa: PLR0913 ) -> int: ... @abstractmethod - def fork(self, data_id: int) -> int: ... + def fork(self, dataset_id: int) -> int: ... @abstractmethod - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: ... + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: ... @abstractmethod def list_qualities(self) -> list[str]: ... # type: ignore @abstractmethod - def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... @abstractmethod - def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: ... + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... + + @abstractmethod + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: ... + + @abstractmethod + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: ... + + @abstractmethod + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: ... + + @abstractmethod + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[str, float] | None: ... 
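The split between the `get_*()` and `parse_*_file()` methods added here appears intended to let an already-downloaded features/qualities description be re-parsed offline, with a live API call only when nothing is cached. A minimal sketch of that flow, assuming a hypothetical cache layout: the helper name `load_features`, the directory argument, and the file names are not part of the patch.

from pathlib import Path

from openml._api import api_context


def load_features(dataset_id: int, cache_dir: Path):
    # Return the parsed feature dict from a cached description if present,
    # otherwise let the backend fetch and parse it (XML for v1, JSON for v2).
    datasets = api_context.backend.datasets
    features_file = cache_dir / "features.xml"    # a .json file for the v2 backend
    features_pickle = cache_dir / "features.pkl"
    if features_file.exists():
        # Re-parse the downloaded file and refresh the pickled copy.
        return datasets.parse_features_file(features_file, features_pickle)
    return datasets.get_features(dataset_id)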
class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 845212b20..58883f626 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,18 +1,30 @@ from __future__ import annotations +import json +import logging +import pickle from collections import OrderedDict +from pathlib import Path from typing import TYPE_CHECKING, Any from typing_extensions import Literal +from openml._api.resources.base import DatasetsAPI +from openml.datasets.data_feature import OpenMLDataFeature +from openml.datasets.dataset import OpenMLDataset +from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerError, OpenMLServerException + if TYPE_CHECKING: from requests import Response + import openml + import pandas as pd import xmltodict -import openml.utils -from openml._api.resources.base import DatasetsAPI -from openml.datasets.dataset import OpenMLDataset +logger = logging.getLogger(__name__) + + +NO_ACCESS_GRANTED_ERRCODE = 112 class DatasetsV1(DatasetsAPI): @@ -26,7 +38,6 @@ def get( response = self._http.get(path) xml_content = response.text dataset = self._create_dataset_from_xml(xml_content) - if return_response: return dataset, response @@ -78,42 +89,8 @@ def list( api_call += f"/{operator}/{value}" if data_id is not None: api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - xml_string = self._http.get(api_call).text - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - - # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) + return self._parse_list_xml(xml_string) def delete(self, dataset_id: int) -> bool: """Delete dataset with id `dataset_id` from the OpenML server. @@ -131,11 +108,45 @@ def delete(self, dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. 
""" - return openml.utils._delete_entity("data", dataset_id) + url_suffix = f"data/{dataset_id}" + try: + result_xml = self._http.delete(url_suffix) + result = xmltodict.parse(result_xml) + return "oml:data_delete" in result + except OpenMLServerException as e: + # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php + # Most exceptions are descriptive enough to be raised as their standard + # OpenMLServerException, however there are two cases where we add information: + # - a generic "failed" message, we direct them to the right issue board + # - when the user successfully authenticates with the server, + # but user is not allowed to take the requested action, + # in which case we specify a OpenMLNotAuthorizedError. + by_other_user = [323, 353, 393, 453, 594] + has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] + unknown_reason = [325, 355, 394, 455, 593] + if e.code in by_other_user: + raise OpenMLNotAuthorizedError( + message=("The data can not be deleted because it was not uploaded by you."), + ) from e + if e.code in has_dependent_entities: + raise OpenMLNotAuthorizedError( + message=( + f"The data can not be deleted because " + f"it still has associated entities: {e.message}" + ), + ) from e + if e.code in unknown_reason: + raise OpenMLServerError( + message=( + "The data can not be deleted for unknown reason," + " please open an issue at: https://github.com/openml/openml/issues/new" + ), + ) from e + raise e def edit( # noqa: PLR0913 self, - data_id: int, + dataset_id: int, description: str | None = None, creator: str | None = None, contributor: str | None = None, @@ -150,7 +161,7 @@ def edit( # noqa: PLR0913 ) -> int: """Edits an OpenMLDataset. - In addition to providing the dataset id of the dataset to edit (through data_id), + In addition to providing the dataset id of the dataset to edit (through dataset_id), you must specify a value for at least one of the optional function arguments, i.e. one value for a field to edit. @@ -166,7 +177,7 @@ def edit( # noqa: PLR0913 Parameters ---------- - data_id : int + dataset_id : int ID of the dataset. description : str Description of the dataset. @@ -205,11 +216,11 @@ def edit( # noqa: PLR0913 ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + if not isinstance(dataset_id, int): + raise TypeError(f"`dataset_id` must be of type `int`, not {type(dataset_id)}.") # compose data edit parameters as xml - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE + form_data = {"data_id": dataset_id} # type: openml._api_calls.DATA_TYPE xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' xml["oml:data_edit_parameters"] = OrderedDict() xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" @@ -235,10 +246,10 @@ def edit( # noqa: PLR0913 } # type: openml._api_calls.FILE_ELEMENTS_TYPE result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text result = xmltodict.parse(result_xml) - data_id = result["oml:data_edit"]["oml:id"] - return int(data_id) + dataset_id = result["oml:data_edit"]["oml:id"] + return int(dataset_id) - def fork(self, data_id: int) -> int: + def fork(self, dataset_id: int) -> int: """ Creates a new dataset version, with the authenticated user as the new owner. 
The forked dataset can have distinct dataset meta-data, @@ -259,7 +270,7 @@ def fork(self, data_id: int) -> int: Parameters ---------- - data_id : int + dataset_id : int id of the dataset to be forked Returns @@ -267,16 +278,16 @@ def fork(self, data_id: int) -> int: Dataset id of the forked dataset """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") + if not isinstance(dataset_id, int): + raise TypeError(f"`dataset_id` must be of type `int`, not {type(dataset_id)}.") # compose data fork parameters - form_data = {"data_id": data_id} + form_data = {"data_id": dataset_id} result_xml = self._http.post("data/fork", data=form_data).text result = xmltodict.parse(result_xml) - data_id = result["oml:data_fork"]["oml:id"] - return int(data_id) + dataset_id = result["oml:data_fork"]["oml:id"] + return int(dataset_id) - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: """ Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status @@ -285,7 +296,7 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) Parameters ---------- - data_id : int + dataset_id : int The data id of the dataset status : str, 'active' or 'deactivated' @@ -294,12 +305,12 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) if status not in legal_status: raise ValueError(f"Illegal status value. Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} + data: openml._api_calls.DATA_TYPE = {"data_id": dataset_id, "status": status} result_xml = self._http.post("data/status/update", data=data).text result = xmltodict.parse(result_xml) server_data_id = result["oml:data_status_update"]["oml:id"] server_status = result["oml:data_status_update"]["oml:status"] - if status != server_status or int(data_id) != int(server_data_id): + if status != server_status or int(dataset_id) != int(server_data_id): # This should never happen raise ValueError("Data id/status does not collide") @@ -376,7 +387,7 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: parquet_file=str(parquet_file) if parquet_file is not None else None, ) - def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: """ An ontology describes the concept that are described in a feature. An ontology is defined by an URL where the information is provided. 
Adds @@ -386,7 +397,7 @@ def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: Parameters ---------- - data_id : int + dataset_id : int id of the dataset to which the feature belongs index : int index of the feature in dataset (0-based) @@ -398,7 +409,7 @@ def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: True or throws an OpenML server exception """ upload_data: dict[str, int | str] = { - "data_id": data_id, + "data_id": dataset_id, "index": index, "ontology": ontology, } @@ -406,7 +417,7 @@ def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: # an error will be thrown in case the request was unsuccessful return True - def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: """ Removes an existing ontology (URL) from a given dataset feature (defined by a dataset id and index). The dataset has to exists on OpenML and needs @@ -415,7 +426,7 @@ def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bo Parameters ---------- - data_id : int + dataset_id : int id of the dataset to which the feature belongs index : int index of the feature in dataset (0-based) @@ -427,7 +438,7 @@ def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bo True or throws an OpenML server exception """ upload_data: dict[str, int | str] = { - "data_id": data_id, + "data_id": dataset_id, "index": index, "ontology": ontology, } @@ -435,6 +446,133 @@ def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bo # an error will be thrown in case the request was unsuccessful return True + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: + path = f"data/features/{dataset_id}" + xml = self._http.get(path, use_cache=True).text + + return self._parse_features_xml(xml) + + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: + path = f"data/qualities/{dataset_id!s}" + try: + self._http.get(path, use_cache=True).text + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + # quality file stays as None + logger.warning(f"No qualities found for dataset {dataset_id}") + return None + + raise e + + return self._parse_qualities_xml() + + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: + if features_file.suffix != ".xml": + # TODO (Shrivaths) can only parse xml warn/ raise exception + raise NotImplementedError() + + with Path(features_file).open("r", encoding="utf8") as fh: + features_xml = fh.read() + + features = self._parse_features_xml(features_xml) + + with features_pickle_file.open("wb") as fh_binary: + pickle.dump(features, fh_binary) + + return features + + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: + if qualities_file.suffix != ".xml": + # TODO (Shrivaths) can only parse xml warn/ raise exception + raise NotImplementedError() + + with Path(qualities_file).open("r", encoding="utf8") as fh: + qualities_xml = fh.read() + + qualities = self._parse_qualities_xml(qualities_xml) + + with qualities_pickle_file.open("wb") as fh_binary: + pickle.dump(qualities, fh_binary) + + return qualities + + def _parse_features_xml(self, features_xml_string: str) -> dict[int, OpenMLDataFeature]: + xml_dict = xmltodict.parse( + features_xml_string, + 
force_list=("oml:feature", "oml:nominal_value"), + strip_whitespace=False, + ) + features_xml = xml_dict["oml:data_features"] + + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(features_xml["oml:feature"]): + nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["oml:index"]), + xmlfeature["oml:name"], + xmlfeature["oml:data_type"], + xmlfeature.get("oml:nominal_value"), + int(nr_missing), + xmlfeature.get("oml:ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float] | None: + xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) + qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] + qualities_ = {} + for xmlquality in qualities: + name = xmlquality["oml:name"] + if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": + value = float("NaN") + else: + value = float(xmlquality["oml:value"]) + qualities_[name] = value + return qualities_ + + def _parse_list_xml(self, xml_string: str) -> pd.DataFrame: + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + class DatasetsV2(DatasetsAPI): def get( @@ -457,6 +595,8 @@ def list( self, limit: int, offset: int, + *, + dataset_id: list[int] | None = None, # type: ignore **kwargs: Any, ) -> pd.DataFrame: """ @@ -473,12 +613,12 @@ def list( The maximum number of datasets to show. offset : int The number of datasets to skip, starting from the first. - data_id : list, optional + dataset_id: list[int], optional kwargs : dict, optional Legal filter operators (keys in the dict): tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values, data_id. + number_features, number_classes, number_missing_values. 
Returns ------- @@ -490,7 +630,8 @@ def list( json["pagination"]["limit"] = limit if offset is not None: json["pagination"]["offset"] = offset - + if dataset_id is not None: + json["data_id"] = dataset_id if kwargs is not None: for operator, value in kwargs.items(): if value is not None: @@ -501,35 +642,14 @@ def list( # Minimalistic check if the JSON is useful assert isinstance(datasets_list, list), type(datasets_list) - datasets = {} - for dataset_ in datasets_list: - ignore_attribute = ["file_id", "quality"] - dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("quality", []): - try: - dataset[quality["name"]] = int(quality["value"]) - except ValueError: - dataset[quality["name"]] = float(quality["value"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) + return self._parse_list_json(datasets_list) def delete(self, dataset_id: int) -> bool: raise NotImplementedError() def edit( # noqa: PLR0913 self, - data_id: int, + dataset_id: int, description: str | None = None, creator: str | None = None, contributor: str | None = None, @@ -544,10 +664,10 @@ def edit( # noqa: PLR0913 ) -> int: raise NotImplementedError() - def fork(self, data_id: int) -> int: + def fork(self, dataset_id: int) -> int: raise NotImplementedError() - def status_update(self, data_id: int, status: Literal["active", "deactivated"]) -> None: + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: """ Updates the status of a dataset to either 'active' or 'deactivated'. Please see the OpenML API documentation for a description of the status @@ -556,7 +676,7 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) Parameters ---------- - data_id : int + dataset_id : int The data id of the dataset status : str, 'active' or 'deactivated' @@ -565,11 +685,11 @@ def status_update(self, data_id: int, status: Literal["active", "deactivated"]) if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"dataset_id": data_id, "status": status} + data: openml._api_calls.DATA_TYPE = {"dataset_id": dataset_id, "status": status} result = self._http.post("datasets/status/update", json=data).json() server_data_id = result["dataset_id"] server_status = result["status"] - if status != server_status or int(data_id) != int(server_data_id): + if status != server_status or int(dataset_id) != int(server_data_id): # This should never happen raise ValueError("Data id/status does not collide") @@ -643,8 +763,129 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: parquet_file=str(parquet_file) if parquet_file is not None else None, ) - def feature_add_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: raise NotImplementedError() - def feature_remove_ontology(self, data_id: int, index: int, ontology: str) -> bool: + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: raise NotImplementedError() + + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: + path = f"dataset/features/{dataset_id}" + json = self._http.get(path, use_cache=True).json() + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(json["data_features"]["feature"]): + nr_missing = xmlfeature.get("number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["index"]), + xmlfeature["name"], + xmlfeature["data_type"], + xmlfeature.get("nominal_value"), + int(nr_missing), + xmlfeature.get("ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: + path = f"dataset/qualities/{dataset_id!s}" + try: + qualities_json = self._http.get(path, use_cache=True).json() + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + logger.warning(f"No qualities found for dataset {dataset_id}") + return None + + raise e + + return self._parse_features_json(qualities_json) + + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: + if features_file.suffix != ".json": + # can fallback to v1 if the file is .xml + raise NotImplementedError() + + with Path(features_file).open("r", encoding="utf8") as fh: + features_json = json.load(fh) + + features = self._parse_features_json(features_json) + + with features_pickle_file.open("wb") as fh_binary: + pickle.dump(features, fh_binary) + + return features + + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[str, float] | None: + if qualities_file.suffix != ".json": + # can fallback to v1 if the file is .xml + raise NotImplementedError() + + with Path(qualities_file).open("r", encoding="utf8") as fh: + qualities_json = json.load(fh) + + qualities = self._parse_qualities_json(qualities_json) + + with qualities_pickle_file.open("wb") as fh_binary: + pickle.dump(qualities, fh_binary) + + return qualities + + def _parse_features_json(self: dict) -> dict[int, OpenMLDataFeature]: + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(self["data_features"]["feature"]): + nr_missing = xmlfeature.get("number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["index"]), + 
xmlfeature["name"], + xmlfeature["data_type"], + xmlfeature.get("nominal_value"), + int(nr_missing), + xmlfeature.get("ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def _parse_qualities_json(self: dict) -> dict[str, float] | None: + qualities = self["data_qualities"]["quality"] + qualities_ = {} + for quality in qualities: + name = quality["name"] + if quality.get("value", None) is None or quality["value"] == "null": + value = float("NaN") + else: + value = float(quality["value"]) + qualities_[name] = value + return qualities_ + + def _parse_list_json(self, datasets_list: list) -> pd.DataFrame: # type: ignore + datasets = {} + for dataset_ in datasets_list: + ignore_attribute = ["file_id", "quality"] + dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("quality", []): + try: + dataset[quality["name"]] = int(quality["value"]) + except ValueError: + dataset[quality["name"]] = float(quality["value"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index fa83d2b8a..f8bbf9be5 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd import scipy.sparse -import xmltodict from openml.base import OpenMLBase from openml.config import OPENML_SKIP_PARQUET_ENV_VAR @@ -798,7 +797,7 @@ def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" # Delayed Import to avoid circular imports or having to import all of dataset.functions to # import OpenMLDataset. - from openml.datasets.functions import _get_dataset_features_file + from openml._api import api_context if self.dataset_id is None: raise ValueError( @@ -806,13 +805,12 @@ def _load_features(self) -> None: "metadata.", ) - features_file = _get_dataset_features_file(None, self.dataset_id) - self._features = _read_features(features_file) + self._features = api_context.backend.datasets.get_features(self.dataset_id) def _load_qualities(self) -> None: """Load qualities information from the server and store it in the dataset object.""" # same reason as above for _load_features - from openml.datasets.functions import _get_dataset_qualities_file + from openml._api import api_context if self.dataset_id is None: raise ValueError( @@ -820,12 +818,12 @@ def _load_qualities(self) -> None: "metadata.", ) - qualities_file = _get_dataset_qualities_file(None, self.dataset_id) + qualities = api_context.backend.datasets.get_qualities(self.dataset_id) - if qualities_file is None: + if qualities is None: self._no_qualities_found = True else: - self._qualities = _read_qualities(qualities_file) + self._qualities = qualities def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]: """Reads the datasets arff to determine the class-labels. 
@@ -992,39 +990,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]: return pickle.load(fh_binary) # type: ignore # noqa: S301 except: # noqa: E722 - with Path(features_file).open("r", encoding="utf8") as fh: - features_xml_string = fh.read() + from openml._api import api_context - features = _parse_features_xml(features_xml_string) - - with features_pickle_file.open("wb") as fh_binary: - pickle.dump(features, fh_binary) - - return features - - -def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]: - xml_dict = xmltodict.parse( - features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False - ) - features_xml = xml_dict["oml:data_features"] - - features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(features_xml["oml:feature"]): - nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["oml:index"]), - xmlfeature["oml:name"], - xmlfeature["oml:data_type"], - xmlfeature.get("oml:nominal_value"), - int(nr_missing), - xmlfeature.get("oml:ontology"), - ) - if idx != feature.index: - raise ValueError("Data features not provided in right order") - features[feature.index] = feature - - return features + return api_context.backend.datasets.parse_features_file(features_file, features_pickle_file) # TODO(eddiebergman): Should this really exist? @@ -1046,29 +1014,8 @@ def _read_qualities(qualities_file: str | Path) -> dict[str, float]: with qualities_pickle_file.open("rb") as fh_binary: return pickle.load(fh_binary) # type: ignore # noqa: S301 except: # noqa: E722 - with qualities_file.open(encoding="utf8") as fh: - qualities_xml = fh.read() + from openml._api import api_context - qualities = _parse_qualities_xml(qualities_xml) - with qualities_pickle_file.open("wb") as fh_binary: - pickle.dump(qualities, fh_binary) - - return qualities - - -def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]: - qualities_ = {} - for xmlquality in qualities: - name = xmlquality["oml:name"] - if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": - value = float("NaN") - else: - value = float(xmlquality["oml:value"]) - qualities_[name] = value - return qualities_ - - -def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]: - xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) - qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] - return _check_qualities(qualities) + return api_context.backend.datasets.parse_qualities_file( + qualities_file, qualities_pickle_file + ) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 6ede42ea9..0eb30b3db 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -88,7 +88,7 @@ def list_datasets( Parameters ---------- - data_id : list, optional + dataset_id : list, optional A list of data ids, to specify which datasets should be listed offset : int, optional @@ -717,7 +717,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - api_context.backend.datasets.status_update(data_id=data_id, status=status) + api_context.backend.datasets.status_update(dataset_id=data_id, status=status) def edit_dataset( @@ -840,7 +840,7 @@ def fork_dataset(data_id: int) -> int: """ from openml._api import api_context - return api_context.backend.datasets.fork(data_id=data_id) + return api_context.backend.datasets.fork(dataset_id=data_id) def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: From 4c75e16890a76d8fbc0ddc125a267d23ddaded44 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 15 Jan 2026 14:51:22 +0500 Subject: [PATCH 12/15] undo changes in tasks/functions.py --- openml/tasks/functions.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index a794ad56d..e9b879ae4 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -12,7 +12,6 @@ import openml._api_calls import openml.utils -from openml._api import api_context from openml.datasets import get_dataset from openml.exceptions import OpenMLCacheException @@ -445,16 +444,11 @@ def _get_task_description(task_id: int) -> OpenMLTask: except OpenMLCacheException: _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) xml_file = _cache_dir / "task.xml" - result = api_context.backend.tasks.get(task_id, return_response=True) + task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get") - if isinstance(result, tuple): - task, response = result - with xml_file.open("w", encoding="utf8") as fh: - fh.write(response.text) - else: - task = result - - return task + with xml_file.open("w", encoding="utf8") as fh: + fh.write(task_xml) + return _create_task_from_xml(task_xml) def _create_task_from_xml(xml: str) -> OpenMLTask: From 3e7c415f01a9f19feabf007bac709f9fc4fe3886 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Sun, 18 Jan 2026 22:03:42 +0530 Subject: [PATCH 13/15] Download methods --- openml/_api/http/client.py | 151 ++++++++++++++++++++++++++++++ openml/_api/resources/base.py | 14 ++- openml/_api/resources/datasets.py | 107 ++++++++++++--------- 3 files changed, 226 insertions(+), 46 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index a90e93933..bab7b20a8 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -1,11 +1,19 @@ from __future__ import annotations +import contextlib +import shutil +import urllib +import urllib.parse +import zipfile +from collections.abc import Callable from pathlib import Path from typing import TYPE_CHECKING, Any from urllib.parse import urlencode, urljoin, urlparse +import minio import requests from requests import Response +from urllib3 import ProxyManager from openml.__version__ import __version__ from openml._api.config import settings @@ -13,6 +21,9 @@ if TYPE_CHECKING: from openml._api.config import APIConfig +import openml.config +from openml.utils import ProgressBar + class CacheMixin: @property @@ -149,3 +160,143 @@ def delete( use_api_key=True, **request_kwargs, ) + + def download( + self, + url: str, + handler: Callable[[Response, Path, str], Path], + encoding: str = "utf-8", + ) -> Path: + response = self.get(url) + dir_path = self._get_cache_dir(url, {}) + dir_path = dir_path.expanduser() + if handler is not None: + return handler(response, dir_path, encoding) + + return self._text_handler(response, dir_path, encoding, url) + + def _text_handler(self, response: Response, path: Path, 
encoding: str) -> Path: + if path.is_dir(): + path = path / "response.txt" + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding=encoding) as f: + f.write(response.text) + return path + + +class MinIOClient(CacheMixin): + def __init__(self) -> None: + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + + def download_minio_file( + self, + source: str, + destination: str | Path | None = None, + exists_ok: bool = True, # noqa: FBT002 + proxy: str | None = "auto", + ) -> str: + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. + + Parameters + ---------- + source : str + URL to a file in a MinIO bucket. + destination : str | Path + Path to store the file to, if a directory is provided the original filename is used. + exists_ok : bool, optional (default=True) + If False, raise FileExists if a file already exists in ``destination``. + proxy: str, optional (default = "auto") + The proxy server to use. By default it's "auto" which uses ``requests`` to + automatically find the proxy to use. Pass None or the environment variable + ``no_proxy="*"`` to disable proxies. + """ + destination = self._get_cache_dir(source, {}) if destination is None else Path(destination) + parsed_url = urllib.parse.urlparse(source) + + # expect path format: /BUCKET/path/to/file.ext + bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) + if destination.is_dir(): + destination = Path(destination, object_name) + if destination.is_file() and not exists_ok: + raise FileExistsError(f"File already exists in {destination}.") + + destination = destination.expanduser() + destination.parent.mkdir(parents=True, exist_ok=True) + + if proxy == "auto": + resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl()) + proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies) # type: ignore + + proxy_client = ProxyManager(proxy) if proxy else None + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client) + try: + client.fget_object( + bucket_name=bucket, + object_name=object_name, + file_path=str(destination), + progress=ProgressBar() if openml.config.show_progress else None, + request_headers=self.headers, + ) + if destination.is_file() and destination.suffix == ".zip": + with zipfile.ZipFile(destination, "r") as zip_ref: + zip_ref.extractall(destination.parent) + + except minio.error.S3Error as e: + if e.message is not None and e.message.startswith("Object does not exist"): + raise FileNotFoundError(f"Object at '{source}' does not exist.") from e + # e.g. permission error, or a bucket does not exist (which is also interpreted as a + # permission error on minio level). + raise FileNotFoundError("Bucket does not exist or is private.") from e + + return str(destination) + + def download_minio_bucket(self, source: str, destination: str | Path) -> None: + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. + + Does not redownload files which already exist. + + Parameters + ---------- + source : str + URL to a MinIO bucket. + destination : str | Path + Path to a directory to store the bucket content in. 
+ """ + destination = self._get_cache_dir(source, {}) if destination is None else Path(destination) + parsed_url = urllib.parse.urlparse(source) + + # expect path format: /BUCKET/path/to/file.ext + _, bucket, *prefixes, _file = parsed_url.path.split("/") + prefix = "/".join(prefixes) + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False) + + for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): + if file_object.object_name is None: + raise ValueError(f"Object name is None for object {file_object!r}") + if file_object.etag is None: + raise ValueError(f"Object etag is None for object {file_object!r}") + + marker = destination / file_object.etag + if marker.exists(): + continue + + file_destination = destination / file_object.object_name.rsplit("/", 1)[1] + if (file_destination.parent / file_destination.stem).exists(): + # Marker is missing but archive exists means the server archive changed + # force a refresh + shutil.rmtree(file_destination.parent / file_destination.stem) + + with contextlib.suppress(FileExistsError): + self.download_minio_file( + source=source.rsplit("/", 1)[0] + + "/" + + file_object.object_name.rsplit("/", 1)[1], + destination=file_destination, + exists_ok=False, + ) + + if file_destination.is_file() and file_destination.suffix == ".zip": + file_destination.unlink() + marker.touch() diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 3030ce6ff..79f7ddfe8 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -2,8 +2,7 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import TYPE_CHECKING, Any -from typing_extensions import Literal +from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: import pandas as pd @@ -87,7 +86,16 @@ def parse_features_file( @abstractmethod def parse_qualities_file( self, qualities_file: Path, qualities_pickle_file: Path - ) -> dict[str, float] | None: ... + ) -> dict[str, float]: ... + + @abstractmethod + def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path: ... + + @abstractmethod + def download_features_file(self, dataset_id: int) -> Path: ... + + @abstractmethod + def download_qualities_file(self, dataset_id: int) -> Path: ... 
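These download hooks stay abstract on purpose: each versioned resource is expected to map them onto its own endpoint and serialization (XML under "data/..." for v1, JSON under "datasets/..." for v2). A rough caller-side sketch, with a made-up dataset id; the cached-file layout comes from the HTTP client's CacheMixin, not from this interface:

from openml._api import api_context

# Resolves to the version-specific features/qualities endpoint of the active
# backend and returns the path of the file written under the client's cache
# directory (CacheMixin._get_cache_dir).
features_path = api_context.backend.datasets.download_features_file(61)
qualities_path = api_context.backend.datasets.download_qualities_file(61)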
class TasksAPI(ResourceAPI, ABC): diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 58883f626..91f9fed30 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,12 +1,12 @@ from __future__ import annotations +import builtins import json import logging import pickle from collections import OrderedDict from pathlib import Path -from typing import TYPE_CHECKING, Any -from typing_extensions import Literal +from typing import TYPE_CHECKING, Any, Literal from openml._api.resources.base import DatasetsAPI from openml.datasets.data_feature import OpenMLDataFeature @@ -48,7 +48,7 @@ def list( limit: int, offset: int, *, - data_id: list[int] | None = None, # type: ignore + data_id: builtins.list[int] | None = None, **kwargs: Any, ) -> pd.DataFrame: """ @@ -153,7 +153,7 @@ def edit( # noqa: PLR0913 collection_date: str | None = None, language: str | None = None, default_target_attribute: str | None = None, - ignore_attribute: str | list[str] | None = None, # type: ignore + ignore_attribute: str | builtins.list[str] | None = None, citation: str | None = None, row_id_attribute: str | None = None, original_data_url: str | None = None, @@ -314,7 +314,7 @@ def status_update(self, dataset_id: int, status: Literal["active", "deactivated" # This should never happen raise ValueError("Data id/status does not collide") - def list_qualities(self) -> list[str]: # type: ignore + def list_qualities(self) -> builtins.list[str]: """Return list of data qualities available. The function performs an API call to retrieve the entire list of @@ -455,7 +455,7 @@ def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: def get_qualities(self, dataset_id: int) -> dict[str, float] | None: path = f"data/qualities/{dataset_id!s}" try: - self._http.get(path, use_cache=True).text + xml = self._http.get(path, use_cache=True).text except OpenMLServerException as e: if e.code == 362 and str(e) == "No qualities found - None": # quality file stays as None @@ -464,7 +464,7 @@ def get_qualities(self, dataset_id: int) -> dict[str, float] | None: raise e - return self._parse_qualities_xml() + return self._parse_qualities_xml(xml) def parse_features_file( self, features_file: Path, features_pickle_file: Path @@ -485,7 +485,7 @@ def parse_features_file( def parse_qualities_file( self, qualities_file: Path, qualities_pickle_file: Path - ) -> dict[int, OpenMLDataFeature]: + ) -> dict[str, float]: if qualities_file.suffix != ".xml": # TODO (Shrivaths) can only parse xml warn/ raise exception raise NotImplementedError() @@ -525,7 +525,7 @@ def _parse_features_xml(self, features_xml_string: str) -> dict[int, OpenMLDataF return features - def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float] | None: + def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float]: xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] qualities_ = {} @@ -573,6 +573,24 @@ def _parse_list_xml(self, xml_string: str) -> pd.DataFrame: } ) + def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path: + def __handler(response: Response, path: Path, encoding: str) -> Path: + file_path = path / "response.xml" + file_path.parent.mkdir(parents=True, exist_ok=True) + with file_path.open("w", encoding=encoding) as f: + f.write(response.text) + return file_path + + return self._http.download(url_ext, __handler, encoding) + + def download_features_file(self, dataset_id: 
int) -> Path: + path = f"data/features/{dataset_id}" + return self.download_file(path) + + def download_qualities_file(self, dataset_id: int) -> Path: + path = f"data/qualities/{dataset_id}" + return self.download_file(path) + class DatasetsV2(DatasetsAPI): def get( @@ -596,7 +614,7 @@ def list( limit: int, offset: int, *, - dataset_id: list[int] | None = None, # type: ignore + dataset_id: builtins.list[int] | None = None, **kwargs: Any, ) -> pd.DataFrame: """ @@ -656,7 +674,7 @@ def edit( # noqa: PLR0913 collection_date: str | None = None, language: str | None = None, default_target_attribute: str | None = None, - ignore_attribute: str | list[str] | None = None, # type: ignore + ignore_attribute: str | builtins.list[str] | None = None, citation: str | None = None, row_id_attribute: str | None = None, original_data_url: str | None = None, @@ -693,7 +711,7 @@ def status_update(self, dataset_id: int, status: Literal["active", "deactivated" # This should never happen raise ValueError("Data id/status does not collide") - def list_qualities(self) -> list[str]: # type: ignore + def list_qualities(self) -> builtins.list[str]: """Return list of data qualities available. The function performs an API call to retrieve the entire list of @@ -770,27 +788,13 @@ def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> raise NotImplementedError() def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: - path = f"dataset/features/{dataset_id}" + path = f"datasets/features/{dataset_id}" json = self._http.get(path, use_cache=True).json() - features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(json["data_features"]["feature"]): - nr_missing = xmlfeature.get("number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["index"]), - xmlfeature["name"], - xmlfeature["data_type"], - xmlfeature.get("nominal_value"), - int(nr_missing), - xmlfeature.get("ontology"), - ) - if idx != feature.index: - raise ValueError("Data features not provided in right order") - features[feature.index] = feature - return features + return self._parse_features_json(json) def get_qualities(self, dataset_id: int) -> dict[str, float] | None: - path = f"dataset/qualities/{dataset_id!s}" + path = f"datasets/qualities/{dataset_id!s}" try: qualities_json = self._http.get(path, use_cache=True).json() except OpenMLServerException as e: @@ -800,7 +804,7 @@ def get_qualities(self, dataset_id: int) -> dict[str, float] | None: raise e - return self._parse_features_json(qualities_json) + return self._parse_qualities_json(qualities_json) def parse_features_file( self, features_file: Path, features_pickle_file: Path @@ -821,7 +825,7 @@ def parse_features_file( def parse_qualities_file( self, qualities_file: Path, qualities_pickle_file: Path - ) -> dict[str, float] | None: + ) -> dict[str, float]: if qualities_file.suffix != ".json": # can fallback to v1 if the file is .xml raise NotImplementedError() @@ -836,17 +840,17 @@ def parse_qualities_file( return qualities - def _parse_features_json(self: dict) -> dict[int, OpenMLDataFeature]: + def _parse_features_json(self, features_json: dict) -> dict[int, OpenMLDataFeature]: features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(self["data_features"]["feature"]): - nr_missing = xmlfeature.get("number_of_missing_values", 0) + for idx, jsonfeatures in enumerate(features_json): + nr_missing = jsonfeatures.get("number_of_missing_values", 0) feature = OpenMLDataFeature( - int(xmlfeature["index"]), - 
xmlfeature["name"], - xmlfeature["data_type"], - xmlfeature.get("nominal_value"), + int(jsonfeatures["index"]), + jsonfeatures["name"], + jsonfeatures["data_type"], + jsonfeatures.get("nominal_values"), int(nr_missing), - xmlfeature.get("ontology"), + jsonfeatures.get("ontology"), ) if idx != feature.index: raise ValueError("Data features not provided in right order") @@ -854,10 +858,9 @@ def _parse_features_json(self: dict) -> dict[int, OpenMLDataFeature]: return features - def _parse_qualities_json(self: dict) -> dict[str, float] | None: - qualities = self["data_qualities"]["quality"] + def _parse_qualities_json(self, qualities_json: dict) -> dict[str, float]: qualities_ = {} - for quality in qualities: + for quality in qualities_json: name = quality["name"] if quality.get("value", None) is None or quality["value"] == "null": value = float("NaN") @@ -866,7 +869,7 @@ def _parse_qualities_json(self: dict) -> dict[str, float] | None: qualities_[name] = value return qualities_ - def _parse_list_json(self, datasets_list: list) -> pd.DataFrame: # type: ignore + def _parse_list_json(self, datasets_list: builtins.list) -> pd.DataFrame: datasets = {} for dataset_ in datasets_list: ignore_attribute = ["file_id", "quality"] @@ -889,3 +892,21 @@ def _parse_list_json(self, datasets_list: list) -> pd.DataFrame: # type: ignore "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), } ) + + def download_file(self, url_ext: str, encoding: str = "utf-8") -> Path: + def __handler(response: Response, path: Path, encoding: str) -> Path: + file_path = path / "response.json" + file_path.parent.mkdir(parents=True, exist_ok=True) + with file_path.open("w", encoding=encoding) as f: + json.dump(response.json(), f, indent=4) + return file_path + + return self._http.download(url_ext, __handler, encoding) + + def download_features_file(self, dataset_id: int) -> Path: + path = f"datasets/features/{dataset_id}" + return self.download_file(path) + + def download_qualities_file(self, dataset_id: int) -> Path: + path = f"datasets/qualities/{dataset_id}" + return self.download_file(path) From 8933cd873b79ec5676a30ca9b7ab418710347591 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Mon, 19 Jan 2026 13:47:55 +0530 Subject: [PATCH 14/15] Refactored functions --- openml/_api/http/__init__.py | 4 +- openml/_api/http/client.py | 10 +- openml/_api/resources/base.py | 32 +++- openml/_api/resources/datasets.py | 262 +++++++++++++++++++++++++---- openml/_api/runtime/core.py | 8 +- openml/datasets/functions.py | 264 +++--------------------------- 6 files changed, 288 insertions(+), 292 deletions(-) diff --git a/openml/_api/http/__init__.py b/openml/_api/http/__init__.py index 8e6d1e4ce..c92ce14c3 100644 --- a/openml/_api/http/__init__.py +++ b/openml/_api/http/__init__.py @@ -1,3 +1,3 @@ -from openml._api.http.client import HTTPClient +from openml._api.http.client import HTTPClient, MinIOClient -__all__ = ["HTTPClient"] +__all__ = ["HTTPClient", "MinIOClient"] diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index bab7b20a8..ea812e0bb 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -164,7 +164,7 @@ def delete( def download( self, url: str, - handler: Callable[[Response, Path, str], Path], + handler: Callable[[Response, Path, str], Path] | None = None, encoding: str = "utf-8", ) -> Path: response = self.get(url) @@ -173,7 +173,7 @@ def download( if handler is not None: return handler(response, dir_path, encoding) - return self._text_handler(response, dir_path, encoding, 
url) + return self._text_handler(response, dir_path, encoding) def _text_handler(self, response: Response, path: Path, encoding: str) -> Path: if path.is_dir(): @@ -194,7 +194,7 @@ def download_minio_file( destination: str | Path | None = None, exists_ok: bool = True, # noqa: FBT002 proxy: str | None = "auto", - ) -> str: + ) -> Path: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Parameters @@ -249,9 +249,9 @@ def download_minio_file( # permission error on minio level). raise FileNotFoundError("Bucket does not exist or is private.") from e - return str(destination) + return destination - def download_minio_bucket(self, source: str, destination: str | Path) -> None: + def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None: """Download file ``source`` from a MinIO Bucket and store it at ``destination``. Does not redownload files which already exist. diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 79f7ddfe8..5ad143db8 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -8,7 +8,7 @@ import pandas as pd from requests import Response - from openml._api.http import HTTPClient + from openml._api.http import HTTPClient, MinIOClient from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset from openml.tasks.task import OpenMLTask @@ -19,13 +19,20 @@ def __init__(self, http: HTTPClient): class DatasetsAPI(ResourceAPI, ABC): + def __init__(self, http: HTTPClient, minio: MinIOClient): + self._minio = minio + super().__init__(http) + @abstractmethod - def get( + def get( # noqa: PLR0913 self, - dataset_id: int | str, - *, - return_response: bool = False, - ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ... + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: ... @abstractmethod def list( @@ -97,6 +104,19 @@ def download_features_file(self, dataset_id: int) -> Path: ... @abstractmethod def download_qualities_file(self, dataset_id: int) -> Path: ... + @abstractmethod + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: ... + + @abstractmethod + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: ... 
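The widened get() signature mirrors the flags that openml.datasets.functions.get_dataset previously handled itself, so the caller-side path collapses into a single backend call. A minimal sketch, assuming an already-built api_context and a purely illustrative dataset id:

from openml._api import api_context

dataset = api_context.backend.datasets.get(
    61,                              # hypothetical dataset id
    download_data=True,              # parquet via MinIOClient, ARFF as the fallback
    cache_format="pickle",
    download_qualities=True,
    download_features_meta_data=True,
    download_all_files=False,
)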
+ class TasksAPI(ResourceAPI, ABC): @abstractmethod diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 91f9fed30..1387bfd54 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -1,17 +1,29 @@ +# ruff: noqa: PLR0913 from __future__ import annotations import builtins import json import logging +import os import pickle from collections import OrderedDict from pathlib import Path from typing import TYPE_CHECKING, Any, Literal +import minio +import urllib3 + from openml._api.resources.base import DatasetsAPI +from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.datasets.data_feature import OpenMLDataFeature from openml.datasets.dataset import OpenMLDataset -from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerError, OpenMLServerException +from openml.exceptions import ( + OpenMLHashException, + OpenMLNotAuthorizedError, + OpenMLPrivateDatasetError, + OpenMLServerError, + OpenMLServerException, +) if TYPE_CHECKING: from requests import Response @@ -30,18 +42,55 @@ class DatasetsV1(DatasetsAPI): def get( self, - dataset_id: int | str, - *, - return_response: bool = False, - ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: path = f"data/{dataset_id}" response = self._http.get(path) xml_content = response.text - dataset = self._create_dataset_from_xml(xml_content) - if return_response: - return dataset, response + description = xmltodict.parse(xml_content)["oml:data_set_description"] + + try: + features_file = None + qualities_file = None + + if download_features_meta_data: + features_file = self.download_features_file(dataset_id) + if download_qualities: + qualities_file = self.download_qualities_file(dataset_id) + + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "oml:parquet_url" in description and not skip_parquet + if download_parquet and (download_data or download_all_files): + try: + parquet_file = self.download_dataset_parquet( + description, + download_all_files=download_all_files, + ) + except urllib3.exceptions.MaxRetryError: + parquet_file = None + + arff_file = None + if parquet_file is None and download_data: + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") + arff_file = self.download_dataset_arff(description) + except OpenMLServerException as e: + # if there was an exception + # check if the user had access to the dataset + if e.code == NO_ACCESS_GRANTED_ERRCODE: + raise OpenMLPrivateDatasetError(e.message) from None + + raise e - return dataset + return self._create_dataset_from_xml( + description, features_file, qualities_file, arff_file, parquet_file, cache_format + ) def list( self, @@ -144,7 +193,7 @@ def delete(self, dataset_id: int) -> bool: ) from e raise e - def edit( # noqa: PLR0913 + def edit( self, dataset_id: int, description: str | None = None, @@ -336,7 +385,15 @@ def list_qualities(self) -> builtins.list[str]: return qualities["oml:data_qualities_list"]["oml:quality"] - def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: + def _create_dataset_from_xml( + self, + description: dict, + features_file: Path | None = None, + 
qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", + ) -> OpenMLDataset: """Create a dataset given a xml string. Parameters @@ -348,14 +405,6 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: ------- OpenMLDataset """ - description = xmltodict.parse(xml)["oml:data_set_description"] - - # TODO file path after download, cache_format default = 'pickle' - arff_file = None - features_file = None - parquet_file = None - qualities_file = None - return OpenMLDataset( description["oml:name"], description.get("oml:description"), @@ -375,6 +424,7 @@ def _create_dataset_from_xml(self, xml: str) -> OpenMLDataset: version_label=description.get("oml:version_label"), citation=description.get("oml:citation"), tag=description.get("oml:tag"), + cache_format=cache_format, visibility=description.get("oml:visibility"), original_data_url=description.get("oml:original_data_url"), paper_url=description.get("oml:paper_url"), @@ -591,23 +641,110 @@ def download_qualities_file(self, dataset_id: int) -> Path: path = f"data/qualities/{dataset_id}" return self.download_file(path) + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: + if isinstance(description, dict): + url = str(description.get("oml:parquet_url")) + elif isinstance(description, OpenMLDataset): + url = str(description._parquet_url) + assert description.dataset_id is not None + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if download_all_files: + self._minio.download_minio_bucket(source=url) + + try: + output_file_path = self._minio.download_minio_file( + source=url, + ) + except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: + logger.warning(f"Could not download file from {url}: {e}") + return None + return output_file_path + + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: + if isinstance(description, dict): + # TODO md5_checksum_fixture = description.get("oml:md5_checksum") + url = str(description["oml:url"]) + did = int(description.get("oml:id")) # type: ignore + elif isinstance(description, OpenMLDataset): + # TODO md5_checksum_fixture = description.md5_checksum + assert description.url is not None + assert description.dataset_id is not None + + url = description.url + did = int(description.dataset_id) + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + try: + output_file_path = self._http.download(url) + except OpenMLHashException as e: + additional_info = f" Raised when downloading dataset {did}." 
+ e.args = (e.args[0] + additional_info,) + raise e + + return output_file_path + class DatasetsV2(DatasetsAPI): def get( self, - dataset_id: int | str, - *, - return_response: bool = False, - ) -> OpenMLDataset | tuple[OpenMLDataset, Response]: - path = f"data/{dataset_id}" + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: + path = f"datasets/{dataset_id}" response = self._http.get(path) json_content = response.json() - dataset = self._create_dataset_from_json(json_content) - if return_response: - return dataset, response + try: + features_file = None + qualities_file = None + + if download_features_meta_data: + features_file = self.download_features_file(dataset_id) + if download_qualities: + qualities_file = self.download_qualities_file(dataset_id) + + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "parquet_url" in json_content and not skip_parquet + if download_parquet and (download_data or download_all_files): + try: + parquet_file = self.download_dataset_parquet( + json_content, + download_all_files=download_all_files, + ) + except urllib3.exceptions.MaxRetryError: + parquet_file = None + + arff_file = None + if parquet_file is None and download_data: + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") + arff_file = self.download_dataset_arff(json_content) + except OpenMLServerException as e: + # if there was an exception + # check if the user had access to the dataset + if e.code == NO_ACCESS_GRANTED_ERRCODE: + raise OpenMLPrivateDatasetError(e.message) from None + + raise e - return dataset + return self._create_dataset_from_json( + json_content, features_file, qualities_file, arff_file, parquet_file, cache_format + ) def list( self, @@ -665,7 +802,7 @@ def list( def delete(self, dataset_id: int) -> bool: raise NotImplementedError() - def edit( # noqa: PLR0913 + def edit( self, dataset_id: int, description: str | None = None, @@ -732,7 +869,15 @@ def list_qualities(self) -> builtins.list[str]: return qualities["data_qualities_list"]["quality"] - def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: + def _create_dataset_from_json( + self, + json_content: dict, + features_file: Path | None = None, + qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", + ) -> OpenMLDataset: """Create a dataset given a json. 
Parameters @@ -744,12 +889,6 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: ------- OpenMLDataset """ - # TODO file path after download, cache_format default = 'pickle' - arff_file = None - features_file = None - parquet_file = None - qualities_file = None - return OpenMLDataset( json_content["name"], json_content.get("description"), @@ -769,6 +908,7 @@ def _create_dataset_from_json(self, json_content: dict) -> OpenMLDataset: version_label=json_content.get("version_label"), citation=json_content.get("citation"), tag=json_content.get("tag"), + cache_format=cache_format, visibility=json_content.get("visibility"), original_data_url=json_content.get("original_data_url"), paper_url=json_content.get("paper_url"), @@ -910,3 +1050,53 @@ def download_features_file(self, dataset_id: int) -> Path: def download_qualities_file(self, dataset_id: int) -> Path: path = f"datasets/qualities/{dataset_id}" return self.download_file(path) + + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: + if isinstance(description, dict): + url = str(description.get("parquet_url")) + elif isinstance(description, OpenMLDataset): + url = str(description._parquet_url) + assert description.dataset_id is not None + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if download_all_files: + self._minio.download_minio_bucket(source=url) + + try: + output_file_path = self._minio.download_minio_file(source=url) + except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: + logger.warning(f"Could not download file from {url}: {e}") + return None + return output_file_path + + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: + if isinstance(description, dict): + url = str(description["url"]) + did = int(description.get("oml:id")) # type: ignore + elif isinstance(description, OpenMLDataset): + assert description.url is not None + assert description.dataset_id is not None + + url = description.url + did = int(description.dataset_id) + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + try: + output_file_path = self._http.download( + url, + ) + except OpenMLHashException as e: + additional_info = f" Raised when downloading dataset {did}." 
+ e.args = (e.args[0] + additional_info,) + raise e + + return output_file_path diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py index 98b587411..47ebb2b5b 100644 --- a/openml/_api/runtime/core.py +++ b/openml/_api/runtime/core.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from openml._api.config import settings -from openml._api.http.client import HTTPClient +from openml._api.http.client import HTTPClient, MinIOClient from openml._api.resources import ( DatasetsV1, DatasetsV2, @@ -24,9 +24,9 @@ def __init__(self, *, datasets: DatasetsAPI, tasks: TasksAPI): def build_backend(version: str, *, strict: bool) -> APIBackend: v1_http = HTTPClient(config=settings.api.v1) v2_http = HTTPClient(config=settings.api.v2) - + minio = MinIOClient() v1 = APIBackend( - datasets=DatasetsV1(v1_http), + datasets=DatasetsV1(v1_http, minio), tasks=TasksV1(v1_http), ) @@ -34,7 +34,7 @@ def build_backend(version: str, *, strict: bool) -> APIBackend: return v1 v2 = APIBackend( - datasets=DatasetsV2(v2_http), + datasets=DatasetsV2(v2_http, minio), tasks=TasksV2(v2_http), ) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3786e54d6..8b3fbd732 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import os import warnings from functools import partial from pathlib import Path @@ -11,21 +10,15 @@ from typing import TYPE_CHECKING, Any, Literal import arff -import minio.error import numpy as np import pandas as pd -import urllib3 import xmltodict from scipy.sparse import coo_matrix import openml._api_calls import openml.utils -from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( - OpenMLHashException, - OpenMLPrivateDatasetError, OpenMLServerError, - OpenMLServerException, ) from openml.utils import ( _create_cache_directory_for_id, @@ -281,7 +274,7 @@ def get_datasets( @openml.utils.thread_safe_if_oslo_installed -def get_dataset( # noqa: C901, PLR0912 +def get_dataset( dataset_id: int | str, download_data: bool = False, # noqa: FBT002 version: int | None = None, @@ -382,59 +375,15 @@ def get_dataset( # noqa: C901, PLR0912 if did_cache_dir.exists(): _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) - - remove_dataset_cache = True - try: - description = _get_dataset_description(did_cache_dir, dataset_id) - features_file = None - qualities_file = None - - if download_features_meta_data: - features_file = _get_dataset_features_file(did_cache_dir, dataset_id) - if download_qualities: - qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) - - parquet_file = None - skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" - download_parquet = "oml:parquet_url" in description and not skip_parquet - if download_parquet and (download_data or download_all_files): - try: - parquet_file = _get_dataset_parquet( - description, - download_all_files=download_all_files, - ) - except urllib3.exceptions.MaxRetryError: - parquet_file = None - - arff_file = None - if parquet_file is None and download_data: - if download_parquet: - logger.warning("Failed to download parquet, fallback on ARFF.") - arff_file = _get_dataset_arff(description) - - remove_dataset_cache = False - except OpenMLServerException as e: - # if there was an exception - # check if the user had access to the dataset - if e.code == 
NO_ACCESS_GRANTED_ERRCODE: - raise OpenMLPrivateDatasetError(e.message) from None - - raise e - finally: - if remove_dataset_cache: - _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) + from openml._api import api_context - return _create_dataset_from_description( - description, - features_file, - qualities_file, - arff_file, - parquet_file, + return api_context.backend.datasets.get( + dataset_id, + download_data, cache_format, + download_qualities, + download_features_meta_data, + download_all_files, ) @@ -988,7 +937,7 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, def _get_dataset_parquet( description: dict | OpenMLDataset, - cache_directory: Path | None = None, + cache_directory: Path | None = None, # noqa: ARG001 download_all_files: bool = False, # noqa: FBT002 ) -> Path | None: """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. @@ -1018,47 +967,14 @@ def _get_dataset_parquet( output_filename : Path, optional Location of the Parquet file if successfully downloaded, None otherwise. """ - if isinstance(description, dict): - url = str(description.get("oml:parquet_url")) - did = int(description.get("oml:id")) # type: ignore - elif isinstance(description, OpenMLDataset): - url = str(description._parquet_url) - assert description.dataset_id is not None - - did = int(description.dataset_id) - else: - raise TypeError("`description` should be either OpenMLDataset or Dict.") - - if cache_directory is None: - cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - - output_file_path = cache_directory / f"dataset_{did}.pq" - - old_file_path = cache_directory / "dataset.pq" - if old_file_path.is_file(): - old_file_path.rename(output_file_path) - - # The call below skips files already on disk, so avoids downloading the parquet file twice. - # To force the old behavior of always downloading everything, use `force_refresh_cache` - # of `get_dataset` - if download_all_files: - openml._api_calls._download_minio_bucket(source=url, destination=cache_directory) + from openml._api import api_context - if not output_file_path.is_file(): - try: - openml._api_calls._download_minio_file( - source=url, - destination=output_file_path, - ) - except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: - logger.warning(f"Could not download file from {url}: {e}") - return None - return output_file_path + return api_context.backend.datasets.download_dataset_parquet(description, download_all_files) def _get_dataset_arff( description: dict | OpenMLDataset, - cache_directory: Path | None = None, + cache_directory: Path | None = None, # noqa: ARG001 ) -> Path: """Return the path to the local arff file of the dataset. If is not cached, it is downloaded. @@ -1082,47 +998,15 @@ def _get_dataset_arff( output_filename : Path Location of ARFF file. 
""" - if isinstance(description, dict): - md5_checksum_fixture = description.get("oml:md5_checksum") - url = str(description["oml:url"]) - did = int(description.get("oml:id")) # type: ignore - elif isinstance(description, OpenMLDataset): - md5_checksum_fixture = description.md5_checksum - assert description.url is not None - assert description.dataset_id is not None - - url = description.url - did = int(description.dataset_id) - else: - raise TypeError("`description` should be either OpenMLDataset or Dict.") - - save_cache_directory = ( - _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - if cache_directory is None - else Path(cache_directory) - ) - output_file_path = save_cache_directory / "dataset.arff" - - try: - openml._api_calls._download_text_file( - source=url, - output_path=output_file_path, - md5_checksum=md5_checksum_fixture, - ) - except OpenMLHashException as e: - additional_info = f" Raised when downloading dataset {did}." - e.args = (e.args[0] + additional_info,) - raise e - - return output_file_path - + from openml._api import api_context -def _get_features_xml(dataset_id: int) -> str: - url_extension = f"data/features/{dataset_id}" - return openml._api_calls._perform_api_call(url_extension, "get") + return api_context.backend.datasets.download_dataset_arff(description) -def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path: +def _get_dataset_features_file( + did_cache_dir: str | Path | None, # noqa: ARG001 + dataset_id: int, +) -> Path: """API call to load dataset features. Loads from cache or downloads them. Features are feature descriptions for each column. @@ -1143,28 +1027,14 @@ def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int Path Path of the cached dataset feature file """ - did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None - if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - - features_file = did_cache_dir / "features.xml" - - # Dataset features aren't subject to change... - if not features_file.is_file(): - features_xml = _get_features_xml(dataset_id) - with features_file.open("w", encoding="utf8") as fh: - fh.write(features_xml) - - return features_file - + # cache directory not used here anymore + from openml._api import api_context -def _get_qualities_xml(dataset_id: int) -> str: - url_extension = f"data/qualities/{dataset_id!s}" - return openml._api_calls._perform_api_call(url_extension, "get") + return api_context.backend.datasets.download_features_file(dataset_id) def _get_dataset_qualities_file( - did_cache_dir: str | Path | None, + did_cache_dir: str | Path | None, # noqa: ARG001 dataset_id: int, ) -> Path | None: """Get the path for the dataset qualities file, or None if no qualities exist. 
@@ -1187,94 +1057,10 @@ def _get_dataset_qualities_file( str Path of the cached qualities file """ - save_did_cache_dir = ( - _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if did_cache_dir is None - else Path(did_cache_dir) - ) - - # Dataset qualities are subject to change and must be fetched every time - qualities_file = save_did_cache_dir / "qualities.xml" - try: - with qualities_file.open(encoding="utf8") as fh: - qualities_xml = fh.read() - except OSError: - try: - qualities_xml = _get_qualities_xml(dataset_id) - with qualities_file.open("w", encoding="utf8") as fh: - fh.write(qualities_xml) - except OpenMLServerException as e: - if e.code == 362 and str(e) == "No qualities found - None": - # quality file stays as None - logger.warning(f"No qualities found for dataset {dataset_id}") - return None - - raise e - - return qualities_file - - -def _create_dataset_from_description( - description: dict[str, str], - features_file: Path | None = None, - qualities_file: Path | None = None, - arff_file: Path | None = None, - parquet_file: Path | None = None, - cache_format: Literal["pickle", "feather"] = "pickle", -) -> OpenMLDataset: - """Create a dataset object from a description dict. - - Parameters - ---------- - description : dict - Description of a dataset in xml dict. - features_file : str - Path of the dataset features as xml file. - qualities_file : list - Path of the dataset qualities as xml file. - arff_file : string, optional - Path of dataset ARFF file. - parquet_file : string, optional - Path of dataset Parquet file. - cache_format: string, optional - Caching option for datasets (feather/pickle) + # cache directory not used here anymore + from openml._api import api_context - Returns - ------- - dataset : dataset object - Dataset object from dict and ARFF. 
- """ - return OpenMLDataset( - description["oml:name"], - description.get("oml:description"), - data_format=description["oml:format"], # type: ignore - dataset_id=int(description["oml:id"]), - version=int(description["oml:version"]), - creator=description.get("oml:creator"), - contributor=description.get("oml:contributor"), - collection_date=description.get("oml:collection_date"), - upload_date=description.get("oml:upload_date"), - language=description.get("oml:language"), - licence=description.get("oml:licence"), - url=description["oml:url"], - default_target_attribute=description.get("oml:default_target_attribute"), - row_id_attribute=description.get("oml:row_id_attribute"), - ignore_attribute=description.get("oml:ignore_attribute"), - version_label=description.get("oml:version_label"), - citation=description.get("oml:citation"), - tag=description.get("oml:tag"), - visibility=description.get("oml:visibility"), - original_data_url=description.get("oml:original_data_url"), - paper_url=description.get("oml:paper_url"), - update_comment=description.get("oml:update_comment"), - md5_checksum=description.get("oml:md5_checksum"), - data_file=str(arff_file) if arff_file is not None else None, - cache_format=cache_format, - features_file=str(features_file) if features_file is not None else None, - qualities_file=str(qualities_file) if qualities_file is not None else None, - parquet_url=description.get("oml:parquet_url"), - parquet_file=str(parquet_file) if parquet_file is not None else None, - ) + return api_context.backend.datasets.download_features_file(dataset_id) def _get_online_dataset_arff(dataset_id: int) -> str | None: From dfa0ab79063673a65d2903374394c90cfdd27a54 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Tue, 20 Jan 2026 21:11:09 +0530 Subject: [PATCH 15/15] Update todos, topic endpoints --- openml/_api/http/client.py | 2 +- openml/_api/resources/base.py | 6 ++++++ openml/_api/resources/datasets.py | 23 ++++++++++++++++++++- openml/datasets/functions.py | 33 ++++++++++++++++--------------- 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/openml/_api/http/client.py b/openml/_api/http/client.py index ea812e0bb..1256b5d84 100644 --- a/openml/_api/http/client.py +++ b/openml/_api/http/client.py @@ -267,7 +267,7 @@ def download_minio_bucket(self, source: str, destination: str | Path | None = No parsed_url = urllib.parse.urlparse(source) # expect path format: /BUCKET/path/to/file.ext - _, bucket, *prefixes, _file = parsed_url.path.split("/") + _, bucket, *prefixes, _ = parsed_url.path.split("/") prefix = "/".join(prefixes) client = minio.Minio(endpoint=parsed_url.netloc, secure=False) diff --git a/openml/_api/resources/base.py b/openml/_api/resources/base.py index 5ad143db8..703351485 100644 --- a/openml/_api/resources/base.py +++ b/openml/_api/resources/base.py @@ -117,6 +117,12 @@ def download_dataset_arff( description: dict | OpenMLDataset, ) -> Path: ... + @abstractmethod + def add_topic(self, data_id: int, topic: str) -> int: ... + + @abstractmethod + def delete_topic(self, data_id: int, topic: str) -> int: ... 
+ class TasksAPI(ResourceAPI, ABC): @abstractmethod diff --git a/openml/_api/resources/datasets.py b/openml/_api/resources/datasets.py index 1387bfd54..a7ff60555 100644 --- a/openml/_api/resources/datasets.py +++ b/openml/_api/resources/datasets.py @@ -28,11 +28,12 @@ if TYPE_CHECKING: from requests import Response - import openml import pandas as pd import xmltodict +import openml + logger = logging.getLogger(__name__) @@ -693,6 +694,20 @@ def download_dataset_arff( return output_file_path + def add_topic(self, data_id: int, topic: str) -> int: + form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE + result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + + def delete_topic(self, data_id: int, topic: str) -> int: + form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE + result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + class DatasetsV2(DatasetsAPI): def get( @@ -1100,3 +1115,9 @@ def download_dataset_arff( raise e return output_file_path + + def add_topic(self, data_id: int, topic: str) -> int: + raise NotImplementedError() + + def delete_topic(self, data_id: int, topic: str) -> int: + raise NotImplementedError() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 8b3fbd732..dd626eb08 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -80,7 +80,7 @@ def list_datasets( Parameters ---------- - dataset_id : list, optional + data_id : list, optional A list of data ids, to specify which datasets should be listed offset : int, optional @@ -842,6 +842,7 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo return api_context.backend.datasets.feature_remove_ontology(data_id, index, ontology) +# TODO used only in tests def _topic_add_dataset(data_id: int, topic: str) -> int: """ Adds a topic for a dataset. @@ -858,15 +859,12 @@ def _topic_add_dataset(data_id: int, topic: str) -> int: ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_topic"]["oml:id"] - return int(data_id) + from openml._api import api_context + + return api_context.backend.datasets.add_topic(data_id, topic) +# TODO used only in tests def _topic_delete_dataset(data_id: int, topic: str) -> int: """ Removes a topic from a dataset. 
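With the topic endpoints on the backend, the module-level helpers reduce to thin wrappers around it. A minimal usage sketch; the dataset id and topic string are illustrative, topic management on the live server is typically restricted to administrators, and the V2 backend deliberately raises NotImplementedError for these calls:

from openml._api import api_context

# Both calls post to data/topicadd / data/topicdelete on the V1 API and
# return the dataset id echoed back inside the <oml:data_topic> response.
returned_id = api_context.backend.datasets.add_topic(128, "Computer Systems")
print(returned_id)  # expected to echo 128

api_context.backend.datasets.delete_topic(128, "Computer Systems")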
@@ -883,15 +881,12 @@ def _topic_delete_dataset(data_id: int, topic: str) -> int: ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_topic"]["oml:id"] - return int(data_id) + from openml._api import api_context + return api_context.backend.datasets.delete_topic(data_id, topic) + +# TODO used by tests only def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]: """Get the dataset description as xml dictionary. @@ -935,6 +930,7 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, return description # type: ignore +# TODO remove cache dir def _get_dataset_parquet( description: dict | OpenMLDataset, cache_directory: Path | None = None, # noqa: ARG001 @@ -972,6 +968,7 @@ def _get_dataset_parquet( return api_context.backend.datasets.download_dataset_parquet(description, download_all_files) +# TODO remove cache dir def _get_dataset_arff( description: dict | OpenMLDataset, cache_directory: Path | None = None, # noqa: ARG001 @@ -1003,6 +1000,7 @@ def _get_dataset_arff( return api_context.backend.datasets.download_dataset_arff(description) +# TODO remove cache dir def _get_dataset_features_file( did_cache_dir: str | Path | None, # noqa: ARG001 dataset_id: int, @@ -1033,6 +1031,7 @@ def _get_dataset_features_file( return api_context.backend.datasets.download_features_file(dataset_id) +# TODO remove cache dir def _get_dataset_qualities_file( did_cache_dir: str | Path | None, # noqa: ARG001 dataset_id: int, @@ -1060,9 +1059,10 @@ def _get_dataset_qualities_file( # cache directory not used here anymore from openml._api import api_context - return api_context.backend.datasets.download_features_file(dataset_id) + return api_context.backend.datasets.download_qualities_file(dataset_id) +# TODO used only in tests def _get_online_dataset_arff(dataset_id: int) -> str | None: """Download the ARFF file for a given dataset id from the OpenML website. @@ -1085,6 +1085,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None: ) +# TODO used only in tests def _get_online_dataset_format(dataset_id: int) -> str: """Get the dataset format for a given dataset id from the OpenML website.
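Note that qualities now go through download_qualities_file rather than the features method. A small caller-side sketch; the dataset id is illustrative, and a None return still signals that the dataset has no qualities on the server:

from openml._api import api_context

qualities_path = api_context.backend.datasets.download_qualities_file(61)
if qualities_path is None:
    print("dataset has no qualities")
else:
    print(f"qualities XML cached at {qualities_path}")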