From 45c2af6033ee9245527075f6077e7e1e4e1cbc1e Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 9 Feb 2026 14:17:53 +1100 Subject: [PATCH 01/28] initial boiler plate for persistence model --- packages/bundled_models/persistence/README.md | 45 ++++++ .../bundled_models/persistence/pyproject.toml | 57 ++++++++ .../persistence/src/persistence/__init__.py | 0 .../persistence/src/persistence/_interface.py | 130 ++++++++++++++++++ .../persistence/src/persistence/_median.py | 10 ++ .../src/persistence/_mostrecent.py | 0 6 files changed, 242 insertions(+) create mode 100644 packages/bundled_models/persistence/README.md create mode 100644 packages/bundled_models/persistence/pyproject.toml create mode 100644 packages/bundled_models/persistence/src/persistence/__init__.py create mode 100644 packages/bundled_models/persistence/src/persistence/_interface.py create mode 100644 packages/bundled_models/persistence/src/persistence/_median.py create mode 100644 packages/bundled_models/persistence/src/persistence/_mostrecent.py diff --git a/packages/bundled_models/persistence/README.md b/packages/bundled_models/persistence/README.md new file mode 100644 index 00000000..b61c4a03 --- /dev/null +++ b/packages/bundled_models/persistence/README.md @@ -0,0 +1,45 @@ +# Persistence Model for use with the PyEarthTools Package + +**TODO: description** + +## Installation + +Clone the repository, then run +```shell +pip install -e . +``` + +## Training + +No training is required for this model. It computes persistence on-the-fly using historical data loaded via the PET pipeline. + +## Predictions / Inference + +You can generate persistence values out of the box using the `pet predict` command line API, or by using a Jupyter Notebook as demonstrated in the tutorial gallery. + +```shell +pet predict +``` + +and `Development/Persistence` should be visible. + +If so, you can now run some inference. 
+
+```shell
+pet predict --model Development/Persistence
+```
+
+When running the command, it will prompt for other required arguments.
+
+**TODO: description of required arguments**
+
+
+#### Example
+
+```shell
+pet predict --model Development/Persistence # TODO
+```
+
+## Acknowledgments
+
+Not applicable. Heuristically developed.
diff --git a/packages/bundled_models/persistence/pyproject.toml b/packages/bundled_models/persistence/pyproject.toml
new file mode 100644
index 00000000..acb42bae
--- /dev/null
+++ b/packages/bundled_models/persistence/pyproject.toml
@@ -0,0 +1,57 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+
+[project]
+name = "pyearthtools-bundled-persistence"
+version = "0.6.0"
+description = "Persistence Bundled Model"
+readme = "README.md"
+requires-python = ">=3.11, <3.14"
+keywords = ["fourcastnext"]  # TODO(review): looks copy-pasted — should this be "persistence"?
+maintainers = [
+    {name = "Tennessee Leeuwenburg", email = "tennessee.leeuwenburg@bom.gov.au"},
+    {name = "Nikeeth Ramanathan", email = "nikeeth.ramanathan@gmail.com"}
+]
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+    'pyearthtools.zoo>=0.5.0',
+    'pyearthtools.data>=0.5.0',
+    'pyearthtools.pipeline>=0.5.0',
+    'hydra-core',
+]
+
+
+[project.urls]
+homepage = "https://pyearthtools.readthedocs.io/"
+documentation = "https://pyearthtools.readthedocs.io/"
+repository = "https://github.com/ACCESS-Community-Hub/PyEarthTools"
+
+[project.entry-points."pyearthtools.zoo.model"]
+Global_FCNXT = "fourcastnext.registered_model:Persistence"  # TODO(review): entry-point name and module look copied from fourcastnext, not this package
+
+[tool.isort]
+profile = "black"
+
+[tool.black]
+line-length = 120
+
+[tool.mypy]
+warn_return_any = true
+warn_unused_configs = true
+
+[[tool.mypy.overrides]]
+ignore_missing_imports = true
+
+[tool.hatch.version]
+path = "src/pyearthtools/pipeline/__init__.py"  # TODO(review): hatch config under a setuptools build backend, and the path points at pyearthtools.pipeline — confirm
+ +[tool.hatch.build.targets.wheel] +packages = ["src/pyearthtools/"] diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py new file mode 100644 index 00000000..80717c24 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -0,0 +1,130 @@ +from enum import Enum +from dataclasses import dataclass + +import numpy as np +import xarray as xr + + +_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 +_DEFAULT_CHUNK_SIZE = + +@dataclass +class PersistenceDataChunk: + """ + Sample usage pattern: + 1. split dataset into chunks + 2. represent chunk as 'PersistenceDataChunk' + 3. perform computation + 4. grab output chunk and insert into destination dataset + + IMPORTANT: data should not be chunked over time. + """ + chunk: np.ndarray # ndarray including time axis. Sorted ascending in time. The latest + # data point is assumed to be the "reference" time. + time_axis: int # the time axis - this will be flattened + time_lookback: int # number of lookback indices + persistence_method: PersistenceMethod # which method to use to calculate persistence + + +class PersistenceMethod(Enum): + """ + Methods to use for persistence. + + MEDIAN_OF_THREE: + computes the median of the three most recent observations. + fallback = MOST_RECENT + + MOST_RECENT: + simplest form of persistence, will find the most recent non-`nan` value to use as persistence. + + If there are nans, previous observations are used instead, up until the `max_lookback` threshold + as determined by the "sparsity_multiplier". + + E.g. if the sparsity multiplier was "3" i.e. 
66.67% of the data is `nan`, median of three + (which needs exactly 3 non-nan values) will look for non-nan values up to 9 indices before the + reference index to find non-nan candidates. + + Most methods will fallback to MOST_RECENT if they fail. If MOST_RECENT fails, the result will be + nan, essentially marking the datapoint as "void" for comparisons. + """ + MOST_RECENT = 0 + MEDIAN_OF_THREE = 1 + + def num_time_indices_required(self): + match self: + case PersistenceData.MOST_RECENT: + return 1 + case PersistenceData.MEDIAN_OF_THREE: + return 3 + case _: + raise NotImplementedError("Invalid persistence method.") + + def min_lookback(self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER): + """ + The minimum amount of lookback required to compute the corresponding metric. + By default we assume a 50% sparsity and require at least double the number of values + required for the compuation. + """ + self.time_indices_required() * sparsity_multiplier + +@dataclass +class PersistenceChunker: + """ + Takes a lazy-loaded xarray dataarray (or similar) and slices it into chunks, processing them + either serially or in parallel. The result is merged into the full field. + + Chunking will happen over non-temporal dimensions as they are generally invariant to the + persistence computations (and even if they have spatial dependence this dependence is very + local in nature compared to the full field size). + + The process is: + + pivot on time dimension + -> chunk on non-time dimension + -> compute persistence for each chunk, aggregating over time + -> reshape into output form + + IMPORTANT: This method may not always speed things up or use less memory. It entirely depends on + the underlying data source being stored/chunked at rest, and how xarray decides to retrieve it. + + FUTUREWORK: usage of rust and/or parquet for dataloading and intermediate caching will be + explored in order to speed up this process. 
+ """ + da_lazy: xr.DataArray # lazy loaded data array + num_chunks: int # number of chunks to use + time_dimname: str # the time dimension name normally "time" + chunk_dimname: str = None # the dimension name to chunk along, or default to a non-time + # dimension + + def __post_init__(self): + # check time dimension + if self.time_dimname not in self.da_lazy.dims: + raise KeyError(f"PersistenceModel: time dimension {self.time_dimname} not found in input array") + self.time_idx = self.da_lazy.dims.index(time_dimname) + + # check chunk dimension, arbitrarily select it + # NOTE: modulus ensures "-1" corrects to last index. "-1" is usually fine in python, + # however, we're doing index comparisons below and need the non-zero equivilent. + self.chunk_idx = (self.time_idx - 1) % len(self.da_lazy.dims) + if self.chunk_dimname is not None: + if self.chunk_dimname not in self.da_lazy.dims: + raise KeyError(f"PersistenceModel: chunk dimension {self.chunk_dimname} not found in input array") + self.chunk_idx = self.da_lazy.dims.index(chunk_dimname) + + # time dimension cannot be used for chunking (since its needed for aggregation) + if self.time_idx == self.chunk_idx: + raise ValueError("PersistenceModel: cannot chunk over time dimension") + + if self.num_chunks < 1: + raise ValueError("PersistenceModel: number of chunks must be greater than or equal to 1") + + def generate_chunks(self): + """ + Generator that extracts chunks + """ + # --- TODO: below is psuedocode + start_slice, end_slice = self.get_chunk_slice(chunk_counter) + chunk_counter = 0 + while chunk_counter < self.num_chunks + yield self.da_lazy + chunk_counter += 1 diff --git a/packages/bundled_models/persistence/src/persistence/_median.py b/packages/bundled_models/persistence/src/persistence/_median.py new file mode 100644 index 00000000..646015ba --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/_median.py @@ -0,0 +1,10 @@ +import + +def py_median_of_three( + ds: xr.Dataset, + time_dim: 
str, + reference_time: datetime.datetime, +): + """ + Computes the median of three + """ diff --git a/packages/bundled_models/persistence/src/persistence/_mostrecent.py b/packages/bundled_models/persistence/src/persistence/_mostrecent.py new file mode 100644 index 00000000..e69de29b From c5e740c4109f0f0f0fda46284d4ec47cb7c7f12c Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Thu, 12 Feb 2026 12:51:55 +1100 Subject: [PATCH 02/28] fix basic interface object creation tests. Add initial draft of forcing dask scheduler to single-threaded --- .../bundled_models/persistence/.gitattributes | 2 + .../bundled_models/persistence/.gitignore | 3 + packages/bundled_models/persistence/pixi.lock | 2306 +++++++++++++++++ .../bundled_models/persistence/pyproject.toml | 56 +- .../bundled_models/persistence/report.xml | 1 + .../persistence/src/persistence/__init__.py | 11 + .../src/persistence/_daskconfig.py | 50 + .../persistence/src/persistence/_interface.py | 158 +- .../persistence/tests/test__interface.py | 95 + 9 files changed, 2621 insertions(+), 61 deletions(-) create mode 100644 packages/bundled_models/persistence/.gitattributes create mode 100644 packages/bundled_models/persistence/.gitignore create mode 100644 packages/bundled_models/persistence/pixi.lock create mode 100644 packages/bundled_models/persistence/report.xml create mode 100644 packages/bundled_models/persistence/src/persistence/_daskconfig.py create mode 100644 packages/bundled_models/persistence/tests/test__interface.py diff --git a/packages/bundled_models/persistence/.gitattributes b/packages/bundled_models/persistence/.gitattributes new file mode 100644 index 00000000..997504b4 --- /dev/null +++ b/packages/bundled_models/persistence/.gitattributes @@ -0,0 +1,2 @@ +# SCM syntax highlighting & preventing 3-way merges +pixi.lock merge=binary linguist-language=YAML linguist-generated=true -diff diff --git a/packages/bundled_models/persistence/.gitignore b/packages/bundled_models/persistence/.gitignore new 
file mode 100644 index 00000000..ae849e65 --- /dev/null +++ b/packages/bundled_models/persistence/.gitignore @@ -0,0 +1,3 @@ +# pixi environments +.pixi/* +!.pixi/config.toml diff --git a/packages/bundled_models/persistence/pixi.lock b/packages/bundled_models/persistence/pixi.lock new file mode 100644 index 00000000..515422be --- /dev/null +++ b/packages/bundled_models/persistence/pixi.lock @@ -0,0 +1,2306 @@ +version: 6 +environments: + dask: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/backports.zstd-1.3.0-py313h18e8e13_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py313hf159716_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.2-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cytoolz-1.1.0-py313h07c4f96_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/dask-core-2026.1.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distributed-2026.1.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.2-py313h7037e92_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-3.0.0-py313hbfd7664_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.2.2-py313h54dd161_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.12-hc97d973_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py313h3dea7bd_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tblib-3.2.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/toolz-1.1.0-pyhd8ed1ab_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.3-py313h07c4f96_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/xarray-2026.1.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zict-3.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - pypi: https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz + - pypi: https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/e4/fac19dc34cb686c96011388b813ff7b858a70681e5ce6ce7698e5021b0f4/geopandas-1.1.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3d/aa/898dec789a05731cd5a9f50605b7b44a72bd198fd0d4528e11fc610177cc/ipython-9.10.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/93/cf/be4e93afbfa0def2cd6fac9302071db0bd6d0617999ecbf53f92b9398de3/multiurl-0.3.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b3/f8/f47b90fbeaf36e112b1a93fc313d5f0bc9f0051ae8be734173787a00271a/pyearthtools_data-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f2/f8/beda8582d430075031ac8835aced207d7bc639469451c932fdf1c0b2ed5c/pyearthtools_pipeline-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/38/06/7ed1c4fad0195d7700b77df09dae83ce6658fa6e2d5bb0c92bec79d766d3/pyearthtools_training-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cf/fc/c774d872abe5ae0c4381c5cb1ed61240e682c44ed019f807e18be26a7882/pyearthtools_utils-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/45/1cb45ccac7c5f728a363d17a145443ed1f66962d3224b8e1166a4fd7bae1/pyearthtools_zoo-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/46/35/b874f79d03e9f900012cf609f7fff97b77164f2e14ee5aac282f8a999c1b/pyogrio-0.12.1-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/f8/85/c2b1706e51942de19076eff082f8495e57d5151364e78b5bef4af4a1d94a/pyproj-3.7.2-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/63/1e/12fbf2a3bb240161651c94bb5cdd0eae5d4e8cc6eaeceb74ab07b12a753d/scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f2/a2/83fc37e2a58090e3d2ff79175a95493c664bcd0b653dd75cb9134645a4e5/shapely-2.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl + - pypi: ./ + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-3.0.0-py313hbfd7664_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.12-hc97d973_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/xarray-2026.1.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - pypi: https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz + - pypi: https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/e4/fac19dc34cb686c96011388b813ff7b858a70681e5ce6ce7698e5021b0f4/geopandas-1.1.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3d/aa/898dec789a05731cd5a9f50605b7b44a72bd198fd0d4528e11fc610177cc/ipython-9.10.0-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/93/cf/be4e93afbfa0def2cd6fac9302071db0bd6d0617999ecbf53f92b9398de3/multiurl-0.3.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/b3/f8/f47b90fbeaf36e112b1a93fc313d5f0bc9f0051ae8be734173787a00271a/pyearthtools_data-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f2/f8/beda8582d430075031ac8835aced207d7bc639469451c932fdf1c0b2ed5c/pyearthtools_pipeline-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/38/06/7ed1c4fad0195d7700b77df09dae83ce6658fa6e2d5bb0c92bec79d766d3/pyearthtools_training-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cf/fc/c774d872abe5ae0c4381c5cb1ed61240e682c44ed019f807e18be26a7882/pyearthtools_utils-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/45/1cb45ccac7c5f728a363d17a145443ed1f66962d3224b8e1166a4fd7bae1/pyearthtools_zoo-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/46/35/b874f79d03e9f900012cf609f7fff97b77164f2e14ee5aac282f8a999c1b/pyogrio-0.12.1-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f8/85/c2b1706e51942de19076eff082f8495e57d5151364e78b5bef4af4a1d94a/pyproj-3.7.2-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/63/1e/12fbf2a3bb240161651c94bb5cdd0eae5d4e8cc6eaeceb74ab07b12a753d/scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f2/a2/83fc37e2a58090e3d2ff79175a95493c664bcd0b653dd75cb9134645a4e5/shapely-2.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl + - pypi: ./ + dev: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/backports.zstd-1.3.0-py313h18e8e13_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py313hf159716_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.2-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.13.4-py313h3dea7bd_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cytoolz-1.1.0-py313h07c4f96_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/dask-core-2026.1.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distributed-2026.1.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/executing-2.2.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.10.0-pyh53cf698_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/ipython_pygments_lexers-1.1.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jedi-0.19.2-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2 + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.2.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.2-py313h7037e92_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-3.0.0-py313hbfd7664_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.52-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.2.2-py313h54dd161_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pure_eval-0.2.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.0.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/python-3.13.12-hc97d973_100_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py313h3dea7bd_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.15.0-h40fa522_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/stack_data-0.6.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tblib-3.2.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/toolz-1.1.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.3-py313h07c4f96_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.6.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/xarray-2026.1.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/zict-3.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - pypi: https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz + - pypi: https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/e4/fac19dc34cb686c96011388b813ff7b858a70681e5ce6ce7698e5021b0f4/geopandas-1.1.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/93/cf/be4e93afbfa0def2cd6fac9302071db0bd6d0617999ecbf53f92b9398de3/multiurl-0.3.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b3/f8/f47b90fbeaf36e112b1a93fc313d5f0bc9f0051ae8be734173787a00271a/pyearthtools_data-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f2/f8/beda8582d430075031ac8835aced207d7bc639469451c932fdf1c0b2ed5c/pyearthtools_pipeline-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/38/06/7ed1c4fad0195d7700b77df09dae83ce6658fa6e2d5bb0c92bec79d766d3/pyearthtools_training-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cf/fc/c774d872abe5ae0c4381c5cb1ed61240e682c44ed019f807e18be26a7882/pyearthtools_utils-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a4/45/1cb45ccac7c5f728a363d17a145443ed1f66962d3224b8e1166a4fd7bae1/pyearthtools_zoo-0.5.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/46/35/b874f79d03e9f900012cf609f7fff97b77164f2e14ee5aac282f8a999c1b/pyogrio-0.12.1-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f8/85/c2b1706e51942de19076eff082f8495e57d5151364e78b5bef4af4a1d94a/pyproj-3.7.2-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/63/1e/12fbf2a3bb240161651c94bb5cdd0eae5d4e8cc6eaeceb74ab07b12a753d/scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/f2/a2/83fc37e2a58090e3d2ff79175a95493c664bcd0b653dd75cb9134645a4e5/shapely-2.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl + - pypi: ./ +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 + md5: d7c89558ba9fa0495403155b64376d81 + license: None + purls: [] + size: 2562 + timestamp: 1578324546067 +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + build_number: 16 + sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 + md5: 73aaf86a425cc6e73fcf236a5a46396d + depends: + - _libgcc_mutex 0.1 conda_forge + - libgomp >=7.5.0 + constrains: + - openmp_impl 9999 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 23621 + timestamp: 1650670423406 +- pypi: https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz + name: antlr4-python3-runtime + version: 4.9.3 + sha256: f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b + requires_dist: + - typing ; python_full_version < '3.5' +- pypi: 
https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl + name: asttokens + version: 3.0.1 + sha256: 15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a + requires_dist: + - astroid>=2,<5 ; extra == 'astroid' + - astroid>=2,<5 ; extra == 'test' + - pytest<9.0 ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-xdist ; extra == 'test' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda + sha256: ee4da0f3fe9d59439798ee399ef3e482791e48784873d546e706d0935f9ff010 + md5: 9673a61a297b00016442e022d689faa6 + depends: + - python >=3.10 + constrains: + - astroid >=2,<5 + license: Apache-2.0 + license_family: Apache + purls: + - pkg:pypi/asttokens?source=hash-mapping + size: 28797 + timestamp: 1763410017955 +- conda: https://conda.anaconda.org/conda-forge/linux-64/backports.zstd-1.3.0-py313h18e8e13_0.conda + sha256: 9552afbec37c4d8d0e83a5c4c6b3c7f4b8785f935094ce3881e0a249045909ce + md5: d9e90792551a527200637e23a915dd79 + depends: + - python + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - python_abi 3.13.* *_cp313 + - zstd >=1.5.7,<1.6.0a0 + license: BSD-3-Clause AND MIT AND EPL-2.0 + purls: + - pkg:pypi/backports-zstd?source=hash-mapping + size: 240943 + timestamp: 1767044981366 +- conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py313hf159716_1.conda + sha256: dadec2879492adede0a9af0191203f9b023f788c18efd45ecac676d424c458ae + md5: 6c4d3597cf43f3439a51b2b13e29a4ba + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + constrains: + - libbrotlicommon 1.2.0 hb03c661_1 + license: MIT + license_family: MIT + purls: + - pkg:pypi/brotli?source=hash-mapping + size: 367721 + timestamp: 1764017371123 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda + sha256: 
c30daba32ddebbb7ded490f0e371eae90f51e72db620554089103b4a6934b0d5 + md5: 51a19bba1b8ebfb60df25cde030b7ebc + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 260341 + timestamp: 1757437258798 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda + sha256: b5974ec9b50e3c514a382335efa81ed02b05906849827a34061c496f4defa0b2 + md5: bddacf101bb4dd0e51811cb69c7790e2 + depends: + - __unix + license: ISC + purls: [] + size: 146519 + timestamp: 1767500828366 +- pypi: https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl + name: certifi + version: 2026.1.4 + sha256: 9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: charset-normalizer + version: 3.4.4 + sha256: a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl + name: click + version: 8.3.1 + sha256: 981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6 + requires_dist: + - colorama ; sys_platform == 'win32' + requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda + sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715 + md5: ea8a6c3256897cc31263de9f455e25d9 + depends: + - python >=3.10 + - __unix + - python + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/click?source=hash-mapping + size: 97676 + timestamp: 1764518652276 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.2-pyhcf101f3_1.conda + sha256: 4c287c2721d8a34c94928be8fe0e9a85754e90189dd4384a31b1806856b50a67 + md5: 61b8078a0905b12529abc622406cb62c + depends: + - python >=3.10 + - python + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/cloudpickle?source=compressed-mapping + size: 27353 + timestamp: 1765303462831 +- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287 + md5: 962b9857ee8e7018c22f2776ffa0b2d7 + depends: + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/colorama?source=hash-mapping + size: 27011 + timestamp: 1733218222191 +- conda: https://conda.anaconda.org/conda-forge/linux-64/coverage-7.13.4-py313h3dea7bd_0.conda + sha256: 5b88b351c6a61ac25ed02e23cd37b25cc90e071f5cdfbc375b656356fb04ca5c + md5: 77e1fc7133e03ccd62070f2405c82ea9 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + - tomli + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/coverage?source=hash-mapping + size: 394748 + timestamp: 1770720450191 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cytoolz-1.1.0-py313h07c4f96_1.conda + sha256: a8ffc7cf31a698a57a46bf7977185ed1e644c5e35d4e166d8f260dca93af6ffb + md5: bcca9afd203fe05d9582249ac12762da + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + - toolz >=0.10.0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/cytoolz?source=hash-mapping + size: 590435 + timestamp: 1760905824293 +- conda: https://conda.anaconda.org/conda-forge/noarch/dask-core-2026.1.2-pyhcf101f3_0.conda + sha256: c8500be32e2c75b10fd7a0664b0e5abc956dece18a54774a53f357aeabe9e1b6 + md5: b20e7ce9afd59036ab194f3d1e27edf5 + depends: + - python >=3.10 + - click >=8.1 + - cloudpickle >=3.0.0 + - fsspec 
>=2021.9.0 + - packaging >=20.0 + - partd >=1.4.0 + - pyyaml >=5.3.1 + - toolz >=0.12.0 + - importlib-metadata >=4.13.0 + - python + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/dask?source=hash-mapping + size: 1063599 + timestamp: 1769829714443 +- pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + name: decorator + version: 5.2.1 + sha256: d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda + sha256: c17c6b9937c08ad63cb20a26f403a3234088e57d4455600974a0ce865cb14017 + md5: 9ce473d1d1be1cc3810856a48b3fab32 + depends: + - python >=3.9 + license: BSD-2-Clause + license_family: BSD + purls: + - pkg:pypi/decorator?source=hash-mapping + size: 14129 + timestamp: 1740385067843 +- conda: https://conda.anaconda.org/conda-forge/noarch/distributed-2026.1.2-pyhcf101f3_0.conda + sha256: 1cbc2ffaef515c43f37d4684942850e1184956a89b1c0651bb656c81bc11aaa1 + md5: 1eac93a6257796dd348d366a85f7f283 + depends: + - python >=3.10 + - click >=8.0 + - cloudpickle >=3.0.0 + - cytoolz >=0.12.0 + - dask-core >=2026.1.2,<2026.1.3.0a0 + - jinja2 >=2.10.3 + - locket >=1.0.0 + - msgpack-python >=1.0.2 + - packaging >=20.0 + - psutil >=5.8.0 + - pyyaml >=5.4.1 + - sortedcontainers >=2.0.5 + - tblib >=1.6.0 + - toolz >=0.12.0 + - tornado >=6.2.0 + - urllib3 >=1.26.5 + - zict >=3.0.0 + - python + constrains: + - openssl !=1.1.1e + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/distributed?source=hash-mapping + size: 844862 + timestamp: 1769888496327 +- pypi: https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl + name: einops + version: 0.8.2 + sha256: 54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193 + requires_python: '>=3.9' +- pypi: 
https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl + name: entrypoints + version: '0.4' + sha256: f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f + requires_python: '>=3.6' +- conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.1-pyhd8ed1ab_0.conda + sha256: ee6cf346d017d954255bbcbdb424cddea4d14e4ed7e9813e429db1d795d01144 + md5: 8e662bd460bda79b1ea39194e3c4c9ab + depends: + - python >=3.10 + - typing_extensions >=4.6.0 + license: MIT and PSF-2.0 + purls: + - pkg:pypi/exceptiongroup?source=hash-mapping + size: 21333 + timestamp: 1763918099466 +- conda: https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.2-pyhd8ed1ab_0.conda + sha256: 1acc6a420efc5b64c384c1f35f49129966f8a12c93b4bb2bdc30079e5dc9d8a8 + md5: a57b4be42619213a94f31d2c69c5dda7 + depends: + - python >=3.10 + license: MIT + license_family: MIT + purls: + - pkg:pypi/execnet?source=hash-mapping + size: 39499 + timestamp: 1762974150770 +- pypi: https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl + name: executing + version: 2.2.1 + sha256: 760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017 + requires_dist: + - asttokens>=2.1.0 ; extra == 'tests' + - ipython ; extra == 'tests' + - pytest ; extra == 'tests' + - coverage ; extra == 'tests' + - coverage-enable-subprocess ; extra == 'tests' + - littleutils ; extra == 'tests' + - rich ; python_full_version >= '3.11' and extra == 'tests' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/executing-2.2.1-pyhd8ed1ab_0.conda + sha256: 210c8165a58fdbf16e626aac93cc4c14dbd551a01d1516be5ecad795d2422cad + md5: ff9efb7f7469aed3c4a8106ffa29593c + depends: + - python >=3.10 + license: MIT + license_family: MIT + purls: + - pkg:pypi/executing?source=hash-mapping + size: 30753 + timestamp: 1756729456476 +- pypi: 
https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl + name: filelock + version: 3.20.3 + sha256: 4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1 + requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.2.0-pyhd8ed1ab_0.conda + sha256: 239b67edf1c5e5caed52cf36e9bed47cb21b37721779828c130e6b3fd9793c1b + md5: 496c6c9411a6284addf55c898d6ed8d7 + depends: + - python >=3.10 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/fsspec?source=compressed-mapping + size: 148757 + timestamp: 1770387898414 +- pypi: https://files.pythonhosted.org/packages/54/e4/fac19dc34cb686c96011388b813ff7b858a70681e5ce6ce7698e5021b0f4/geopandas-1.1.2-py3-none-any.whl + name: geopandas + version: 1.1.2 + sha256: 2bb0b1052cb47378addb4ba54c47f8d4642dcbda9b61375638274f49d9f0bb0d + requires_dist: + - numpy>=1.24 + - pyogrio>=0.7.2 + - packaging + - pandas>=2.0.0 + - pyproj>=3.5.0 + - shapely>=2.0.0 + - psycopg[binary]>=3.1.0 ; extra == 'all' + - sqlalchemy>=2.0 ; extra == 'all' + - geopy ; extra == 'all' + - matplotlib>=3.7 ; extra == 'all' + - mapclassify>=2.5 ; extra == 'all' + - xyzservices ; extra == 'all' + - folium ; extra == 'all' + - geoalchemy2 ; extra == 'all' + - pyarrow>=10.0.0 ; extra == 'all' + - scipy ; extra == 'all' + - pytest>=3.1.0 ; extra == 'dev' + - pytest-cov ; extra == 'dev' + - pytest-xdist ; extra == 'dev' + - codecov ; extra == 'dev' + - pre-commit ; extra == 'dev' + - ruff ; extra == 'dev' + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl + name: graphviz + version: '0.21' + sha256: 54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42 + requires_dist: + - build ; extra == 'dev' + - wheel ; extra == 'dev' + - twine ; extra == 'dev' + - flake8 ; extra == 'dev' + - flake8-pyproject ; 
extra == 'dev' + - pep8-naming ; extra == 'dev' + - tox>=3 ; extra == 'dev' + - pytest>=7,<8.1 ; extra == 'test' + - pytest-mock>=3 ; extra == 'test' + - pytest-cov ; extra == 'test' + - coverage ; extra == 'test' + - sphinx>=5,<7 ; extra == 'docs' + - sphinx-autodoc-typehints ; extra == 'docs' + - sphinx-rtd-theme>=0.2.5 ; extra == 'docs' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda + sha256: 84c64443368f84b600bfecc529a1194a3b14c3656ee2e832d15a20e0329b6da3 + md5: 164fc43f0b53b6e3a7bc7dce5e4f1dc9 + depends: + - python >=3.10 + - hyperframe >=6.1,<7 + - hpack >=4.1,<5 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/h2?source=hash-mapping + size: 95967 + timestamp: 1756364871835 +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda + sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba + md5: 0a802cb9888dd14eeefc611f05c40b6e + depends: + - python >=3.9 + license: MIT + license_family: MIT + purls: + - pkg:pypi/hpack?source=hash-mapping + size: 30731 + timestamp: 1737618390337 +- pypi: https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl + name: hydra-core + version: 1.3.2 + sha256: fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b + requires_dist: + - omegaconf>=2.2,<2.4 + - antlr4-python3-runtime==4.9.* + - packaging + - importlib-resources ; python_full_version < '3.9' +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda + sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8 + md5: 8e6923fc12f1fe8f8c4e5c9f343256ac + depends: + - python >=3.9 + license: MIT + license_family: MIT + purls: + - pkg:pypi/hyperframe?source=hash-mapping + size: 17397 + timestamp: 1737618427549 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda + sha256: 
142a722072fa96cf16ff98eaaf641f54ab84744af81754c292cb81e0881c0329 + md5: 186a18e3ba246eccfc7cff00cd19a870 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 12728445 + timestamp: 1767969922681 +- pypi: https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl + name: idna + version: '3.11' + sha256: 771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea + requires_dist: + - ruff>=0.6.2 ; extra == 'all' + - mypy>=1.11.2 ; extra == 'all' + - pytest>=8.3.2 ; extra == 'all' + - flake8>=7.1.1 ; extra == 'all' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda + sha256: c18ab120a0613ada4391b15981d86ff777b5690ca461ea7e9e49531e8f374745 + md5: 63ccfdc3a3ce25b027b8767eb722fca8 + depends: + - python >=3.9 + - zipp >=3.20 + - python + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/importlib-metadata?source=hash-mapping + size: 34641 + timestamp: 1747934053147 +- conda: https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.3.0-pyhd8ed1ab_0.conda + sha256: e1a9e3b1c8fe62dc3932a616c284b5d8cbe3124bbfbedcf4ce5c828cb166ee19 + md5: 9614359868482abba1bd15ce465e3c42 + depends: + - python >=3.10 + license: MIT + license_family: MIT + purls: + - pkg:pypi/iniconfig?source=compressed-mapping + size: 13387 + timestamp: 1760831448842 +- pypi: https://files.pythonhosted.org/packages/3d/aa/898dec789a05731cd5a9f50605b7b44a72bd198fd0d4528e11fc610177cc/ipython-9.10.0-py3-none-any.whl + name: ipython + version: 9.10.0 + sha256: c6ab68cc23bba8c7e18e9b932797014cc61ea7fd6f19de180ab9ba73e65ee58d + requires_dist: + - colorama>=0.4.4 ; sys_platform == 'win32' + - decorator>=4.3.2 + - ipython-pygments-lexers>=1.0.0 + - jedi>=0.18.1 + - matplotlib-inline>=0.1.5 + - pexpect>4.3 ; sys_platform != 'emscripten' and sys_platform != 'win32' + - 
prompt-toolkit>=3.0.41,<3.1.0 + - pygments>=2.11.0 + - stack-data>=0.6.0 + - traitlets>=5.13.0 + - typing-extensions>=4.6 ; python_full_version < '3.12' + - black ; extra == 'black' + - docrepr ; extra == 'doc' + - exceptiongroup ; extra == 'doc' + - intersphinx-registry ; extra == 'doc' + - ipykernel ; extra == 'doc' + - ipython[matplotlib,test] ; extra == 'doc' + - setuptools>=70.0 ; extra == 'doc' + - sphinx-toml==0.0.4 ; extra == 'doc' + - sphinx-rtd-theme>=0.1.8 ; extra == 'doc' + - sphinx>=8.0 ; extra == 'doc' + - typing-extensions ; extra == 'doc' + - pytest>=7.0.0 ; extra == 'test' + - pytest-asyncio>=1.0.0 ; extra == 'test' + - testpath>=0.2 ; extra == 'test' + - packaging>=20.1.0 ; extra == 'test' + - setuptools>=61.2 ; extra == 'test' + - ipython[test] ; extra == 'test-extra' + - curio ; extra == 'test-extra' + - jupyter-ai ; extra == 'test-extra' + - ipython[matplotlib] ; extra == 'test-extra' + - nbformat ; extra == 'test-extra' + - nbclient ; extra == 'test-extra' + - ipykernel>6.30 ; extra == 'test-extra' + - numpy>=1.27 ; extra == 'test-extra' + - pandas>2.1 ; extra == 'test-extra' + - trio>=0.1.0 ; extra == 'test-extra' + - matplotlib>3.9 ; extra == 'matplotlib' + - ipython[doc,matplotlib,terminal,test,test-extra] ; extra == 'all' + - argcomplete>=3.0 ; extra == 'all' + requires_python: '>=3.11' +- conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.10.0-pyh53cf698_0.conda + sha256: 12cb4db242ea1a2e5e60a51b20f16e9c8120a9eb5d013c641cbf827bf3bb78e1 + md5: 441ca4e203a62f7db2f29f190c02b9cf + depends: + - __unix + - pexpect >4.3 + - decorator >=4.3.2 + - ipython_pygments_lexers >=1.0.0 + - jedi >=0.18.1 + - matplotlib-inline >=0.1.5 + - prompt-toolkit >=3.0.41,<3.1.0 + - pygments >=2.11.0 + - python >=3.11 + - stack_data >=0.6.0 + - traitlets >=5.13.0 + - typing_extensions >=4.6 + - python + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/ipython?source=compressed-mapping + size: 647436 + timestamp: 1770040907512 +- pypi: 
https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + name: ipython-pygments-lexers + version: 1.1.1 + sha256: a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c + requires_dist: + - pygments + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/ipython_pygments_lexers-1.1.1-pyhd8ed1ab_0.conda + sha256: 894682a42a7d659ae12878dbcb274516a7031bbea9104e92f8e88c1f2765a104 + md5: bd80ba060603cc228d9d81c257093119 + depends: + - pygments + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/ipython-pygments-lexers?source=hash-mapping + size: 13993 + timestamp: 1737123723464 +- pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + name: jedi + version: 0.19.2 + sha256: a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9 + requires_dist: + - parso>=0.8.4,<0.9.0 + - jinja2==2.11.3 ; extra == 'docs' + - markupsafe==1.1.1 ; extra == 'docs' + - pygments==2.8.1 ; extra == 'docs' + - alabaster==0.7.12 ; extra == 'docs' + - babel==2.9.1 ; extra == 'docs' + - chardet==4.0.0 ; extra == 'docs' + - commonmark==0.8.1 ; extra == 'docs' + - docutils==0.17.1 ; extra == 'docs' + - future==0.18.2 ; extra == 'docs' + - idna==2.10 ; extra == 'docs' + - imagesize==1.2.0 ; extra == 'docs' + - mock==1.0.1 ; extra == 'docs' + - packaging==20.9 ; extra == 'docs' + - pyparsing==2.4.7 ; extra == 'docs' + - pytz==2021.1 ; extra == 'docs' + - readthedocs-sphinx-ext==2.1.4 ; extra == 'docs' + - recommonmark==0.5.0 ; extra == 'docs' + - requests==2.25.1 ; extra == 'docs' + - six==1.15.0 ; extra == 'docs' + - snowballstemmer==2.1.0 ; extra == 'docs' + - sphinx-rtd-theme==0.4.3 ; extra == 'docs' + - sphinx==1.8.5 ; extra == 'docs' + - sphinxcontrib-serializinghtml==1.1.4 ; extra == 'docs' + - sphinxcontrib-websupport==1.2.4 ; 
extra == 'docs' + - urllib3==1.26.4 ; extra == 'docs' + - flake8==5.0.4 ; extra == 'qa' + - mypy==0.971 ; extra == 'qa' + - types-setuptools==67.2.0.1 ; extra == 'qa' + - django ; extra == 'testing' + - attrs ; extra == 'testing' + - colorama ; extra == 'testing' + - docopt ; extra == 'testing' + - pytest<9.0.0 ; extra == 'testing' + requires_python: '>=3.6' +- conda: https://conda.anaconda.org/conda-forge/noarch/jedi-0.19.2-pyhd8ed1ab_1.conda + sha256: 92c4d217e2dc68983f724aa983cca5464dcb929c566627b26a2511159667dba8 + md5: a4f4c5dc9b80bc50e0d3dc4e6e8f1bd9 + depends: + - parso >=0.8.3,<0.9.0 + - python >=3.9 + license: Apache-2.0 AND MIT + purls: + - pkg:pypi/jedi?source=hash-mapping + size: 843646 + timestamp: 1733300981994 +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + sha256: fc9ca7348a4f25fed2079f2153ecdcf5f9cf2a0bc36c4172420ca09e1849df7b + md5: 04558c96691bed63104678757beb4f8d + depends: + - markupsafe >=2.0 + - python >=3.10 + - python + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/jinja2?source=compressed-mapping + size: 120685 + timestamp: 1764517220861 +- pypi: https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl + name: joblib + version: 1.5.3 + sha256: 5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713 + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 + md5: 12bd9a3f089ee6c9266a37dab82afabd + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 725507 + timestamp: 1770267139900 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + build_number: 5 + sha256: 
18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c + md5: c160954f7418d7b6e87eaf05a8913fa9 + depends: + - libopenblas >=0.3.30,<0.3.31.0a0 + - libopenblas >=0.3.30,<1.0a0 + constrains: + - mkl <2026 + - liblapack 3.11.0 5*_openblas + - libcblas 3.11.0 5*_openblas + - blas 2.305 openblas + - liblapacke 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18213 + timestamp: 1765818813880 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + build_number: 5 + sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 + md5: 6636a2b6f1a87572df2970d3ebc87cc0 + depends: + - libblas 3.11.0 5_h4a7cf45_openblas + constrains: + - liblapacke 3.11.0 5*_openblas + - blas 2.305 openblas + - liblapack 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18194 + timestamp: 1765818837135 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda + sha256: 1e1b08f6211629cbc2efe7a5bca5953f8f6b3cae0eeb04ca4dacee1bd4e2db2f + md5: 8b09ae86839581147ef2e5c5e229d164 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.3.* + license: MIT + license_family: MIT + purls: [] + size: 76643 + timestamp: 1763549731408 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_17.conda + sha256: 43860222cf3abf04ded0cf24541a105aa388e0e1d4d6ca46258e186d4e87ae3e + md5: 3c281169ea25b987311400d7a7e28445 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_17 + - libgomp 15.2.0 he0feb66_17 + 
license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 1040478 + timestamp: 1770252533873 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda + sha256: 1604c083dd65bc91e68b6cfe32c8610395088cb96af1acaf71f0dcaf83ac58f7 + md5: a6c682ac611cb1fa4d73478f9e6efb06 + depends: + - libgfortran5 15.2.0 h68bc16d_17 + constrains: + - libgfortran-ng ==15.2.0=*_17 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27515 + timestamp: 1770252591906 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_17.conda + sha256: b1c77b85da9a3e204de986f59e262268805c6a35dffdf3953f1b98407db2aef3 + md5: 202fdf8cad9eea704c2b0d823d1732bf + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=15.2.0 + constrains: + - libgfortran 15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 2480824 + timestamp: 1770252563579 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_17.conda + sha256: b961b5dd9761907a7179678b58a69bb4fc16b940eb477f635aea3aec0a3f17a6 + md5: 51b78c6a757575c0d12f4401ffc67029 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 603334 + timestamp: 1770252441199 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda + build_number: 5 + sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 + md5: b38076eb5c8e40d0106beda6f95d7609 + depends: + - libblas 3.11.0 5_h4a7cf45_openblas + constrains: + - blas 2.305 openblas + - liblapacke 3.11.0 5*_openblas + - libcblas 3.11.0 5*_openblas + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 18200 + timestamp: 1765818857876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda + sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb + 
md5: c7c83eecbb72d88b940c249af56c8b17 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.2.* + license: 0BSD + purls: [] + size: 113207 + timestamp: 1768752626120 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 + md5: be43915efc66345cccb3c310b6ed0374 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libgfortran + - libgfortran5 >=14.3.0 + constrains: + - openblas >=0.3.30,<0.3.31.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 5927939 + timestamp: 1763114673331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + sha256: 04596fcee262a870e4b7c9807224680ff48d4d0cc0dac076a602503d3dc6d217 + md5: da5be73701eecd0e8454423fd6ffcf30 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.2,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + license: blessing + purls: [] + size: 942808 + timestamp: 1768147973361 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda + sha256: 50c48cd3716a2e58e8e2e02edc78fef2d08fffe1e3b1ed40eb5f87e7e2d07889 + md5: 24c2fe35fa45cd71214beba6f337c071 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_17 + constrains: + - libstdcxx-ng ==15.2.0=*_17 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 5852406 + timestamp: 1770252584235 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + sha256: 
1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee + md5: db409b7c1720428638e7c0d509d3e1b5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 40311 + timestamp: 1766271528534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + md5: edb0dca6bc32e4f4789199455a1dbeb8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - zlib 1.3.1 *_2 + license: Zlib + license_family: Other + purls: [] + size: 60963 + timestamp: 1727963148474 +- conda: https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2 + sha256: 9afe0b5cfa418e8bdb30d8917c5a6cec10372b037924916f1f85b9f4899a67a6 + md5: 91e27ef3d05cc772ce627e51cff111c4 + depends: + - python >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* + license: BSD-2-Clause + license_family: BSD + purls: + - pkg:pypi/locket?source=hash-mapping + size: 8250 + timestamp: 1650660473123 +- conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda + sha256: a530a411bdaaf0b1e4de8869dfaca46cb07407bc7dc0702a9e231b0e5ce7ca85 + md5: c14389156310b8ed3520d84f854be1ee + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + constrains: + - jinja2 >=3.0.0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/markupsafe?source=hash-mapping + size: 25909 + timestamp: 1759055357045 +- pypi: https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl + name: matplotlib-inline + version: 0.2.1 + sha256: d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76 + requires_dist: + - traitlets + - flake8 ; extra == 'test' + - nbdime ; extra == 'test' + - nbval ; extra == 'test' + - notebook ; extra == 'test' + - pytest ; extra == 'test' + 
requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.2.1-pyhd8ed1ab_0.conda + sha256: 9d690334de0cd1d22c51bc28420663f4277cfa60d34fa5cad1ce284a13f1d603 + md5: 00e120ce3e40bad7bfc78861ce3c4a25 + depends: + - python >=3.10 + - traitlets + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/matplotlib-inline?source=hash-mapping + size: 15175 + timestamp: 1761214578417 +- conda: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.2-py313h7037e92_1.conda + sha256: fac37e267dd1d07527f0b078ffe000916e80e8c89cfe69d466f5775b88e93df2 + md5: cd1cfde0ea3bca6c805c73ffa988b12a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + license: Apache-2.0 + license_family: Apache + purls: + - pkg:pypi/msgpack?source=hash-mapping + size: 103129 + timestamp: 1762504205590 +- pypi: https://files.pythonhosted.org/packages/93/cf/be4e93afbfa0def2cd6fac9302071db0bd6d0617999ecbf53f92b9398de3/multiurl-0.3.7-py3-none-any.whl + name: multiurl + version: 0.3.7 + sha256: 054f42974064f103be0ed55b43f0c32fc435a47dc7353a9adaffa643b99fa380 + requires_dist: + - requests + - tqdm + - pytz + - python-dateutil +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 + md5: 47e340acb35de30501a76c7c799c41d7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: X11 AND BSD-3-Clause + purls: [] + size: 891641 + timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda + sha256: 2eb8be25a7504f058a153a84be70471e0ebbf6bd0411ae2b6d34904b89d86fe3 + md5: ca9c6ba4beac38cb3d0a85afde27f94c + depends: + - python + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - libstdcxx >=14 + - liblapack >=3.9.0,<4.0a0 + - libcblas >=3.9.0,<4.0a0 + - python_abi 3.13.* *_cp313 + - libblas >=3.9.0,<4.0a0 + 
constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/numpy?source=hash-mapping + size: 8857152 + timestamp: 1770098515258 +- pypi: https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl + name: omegaconf + version: 2.3.0 + sha256: 7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b + requires_dist: + - antlr4-python3-runtime==4.9.* + - pyyaml>=5.1.0 + - dataclasses ; python_full_version == '3.6.*' + requires_python: '>=3.6' +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c + md5: f61eb8cd60ff9057122a3d338b99c00f + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3164551 + timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda + sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 + md5: b76541e68fea4d511b1ac46a28dcd2c6 + depends: + - python >=3.8 + - python + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/packaging?source=compressed-mapping + size: 72010 + timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-3.0.0-py313hbfd7664_0.conda + sha256: 05719fdfacdf97206a901621d79ab103c34905973ec8a18627825d5adab7a1b0 + md5: ab6d05e915ab2ae4c41d275b14592151 + depends: + - python + - numpy >=1.26.0 + - python-dateutil >=2.8.2 + - __glibc >=2.17,<3.0.a0 + - libstdcxx >=14 + - libgcc >=14 + - python_abi 3.13.* *_cp313 + - numpy >=1.23,<3 + constrains: + - adbc-driver-postgresql >=1.2.0 + - adbc-driver-sqlite >=1.2.0 + - beautifulsoup4 >=4.12.3 + - blosc >=1.21.3 + - bottleneck >=1.4.2 + - fastparquet >=2024.11.0 + - fsspec >=2024.10.0 + - gcsfs >=2024.10.0 + - html5lib >=1.1 + - hypothesis >=6.116.0 
+ - jinja2 >=3.1.5 + - lxml >=5.3.0 + - matplotlib >=3.9.3 + - numba >=0.60.0 + - numexpr >=2.10.2 + - odfpy >=1.4.1 + - openpyxl >=3.1.5 + - psycopg2 >=2.9.10 + - pyarrow >=13.0.0 + - pyiceberg >=0.8.1 + - pymysql >=1.1.1 + - pyqt5 >=5.15.9 + - pyreadstat >=1.2.8 + - pytables >=3.10.1 + - pytest >=8.3.4 + - pytest-xdist >=3.6.1 + - python-calamine >=0.3.0 + - pytz >=2024.2 + - pyxlsb >=1.0.10 + - qtpy >=2.4.2 + - scipy >=1.14.1 + - s3fs >=2024.10.0 + - sqlalchemy >=2.0.36 + - tabulate >=0.9.0 + - xarray >=2024.10.0 + - xlrd >=2.0.1 + - xlsxwriter >=3.2.0 + - zstandard >=0.23.0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/pandas?source=hash-mapping + size: 14952243 + timestamp: 1769076307505 +- pypi: https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl + name: parso + version: 0.8.6 + sha256: 2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff + requires_dist: + - pytest ; extra == 'testing' + - docopt ; extra == 'testing' + - flake8==5.0.4 ; extra == 'qa' + - zuban==0.5.1 ; extra == 'qa' + - types-setuptools==67.2.0.1 ; extra == 'qa' + requires_python: '>=3.6' +- conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.6-pyhcf101f3_0.conda + sha256: 42b2d77ccea60752f3aa929a6413a7835aaacdbbde679f2f5870a744fa836b94 + md5: 97c1ce2fffa1209e7afb432810ec6e12 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/parso?source=compressed-mapping + size: 82287 + timestamp: 1770676243987 +- conda: https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda + sha256: 472fc587c63ec4f6eba0cc0b06008a6371e0a08a5986de3cf4e8024a47b4fe6c + md5: 0badf9c54e24cecfb0ad2f99d680c163 + depends: + - locket + - python >=3.9 + - toolz + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/partd?source=hash-mapping + size: 20884 + timestamp: 1715026639309 +- pypi: 
https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + name: pexpect + version: 4.9.0 + sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 + requires_dist: + - ptyprocess>=0.5 +- conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda + sha256: 202af1de83b585d36445dc1fda94266697341994d1a3328fabde4989e1b3d07a + md5: d0d408b1f18883a944376da5cf8101ea + depends: + - ptyprocess >=0.5 + - python >=3.9 + license: ISC + purls: + - pkg:pypi/pexpect?source=hash-mapping + size: 53561 + timestamp: 1733302019362 +- pypi: https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + name: pillow + version: 12.1.1 + sha256: 47b94983da0c642de92ced1702c5b6c292a84bd3a8e1d1702ff923f183594717 + requires_dist: + - furo ; extra == 'docs' + - olefile ; extra == 'docs' + - sphinx>=8.2 ; extra == 'docs' + - sphinx-autobuild ; extra == 'docs' + - sphinx-copybutton ; extra == 'docs' + - sphinx-inline-tabs ; extra == 'docs' + - sphinxext-opengraph ; extra == 'docs' + - olefile ; extra == 'fpx' + - olefile ; extra == 'mic' + - arro3-compute ; extra == 'test-arrow' + - arro3-core ; extra == 'test-arrow' + - nanoarrow ; extra == 'test-arrow' + - pyarrow ; extra == 'test-arrow' + - check-manifest ; extra == 'tests' + - coverage>=7.4.2 ; extra == 'tests' + - defusedxml ; extra == 'tests' + - markdown2 ; extra == 'tests' + - olefile ; extra == 'tests' + - packaging ; extra == 'tests' + - pyroma>=5 ; extra == 'tests' + - pytest ; extra == 'tests' + - pytest-cov ; extra == 'tests' + - pytest-timeout ; extra == 'tests' + - pytest-xdist ; extra == 'tests' + - trove-classifiers>=2024.10.12 ; extra == 'tests' + - defusedxml ; extra == 'xmp' + requires_python: '>=3.10' +- conda: 
https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + sha256: e14aafa63efa0528ca99ba568eaf506eb55a0371d12e6250aaaa61718d2eb62e + md5: d7585b6550ad04c8c5e21097ada2888e + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/pluggy?source=compressed-mapping + size: 25877 + timestamp: 1764896838868 +- pypi: https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl + name: prompt-toolkit + version: 3.0.52 + sha256: 9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955 + requires_dist: + - wcwidth + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.52-pyha770c72_0.conda + sha256: 4817651a276016f3838957bfdf963386438c70761e9faec7749d411635979bae + md5: edb16f14d920fb3faf17f5ce582942d6 + depends: + - python >=3.10 + - wcwidth + constrains: + - prompt_toolkit 3.0.52 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/prompt-toolkit?source=hash-mapping + size: 273927 + timestamp: 1756321848365 +- conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.2.2-py313h54dd161_0.conda + sha256: f19fd682d874689dfde20bf46d7ec1a28084af34583e0405685981363af47c91 + md5: 25fe6e02c2083497b3239e21b49d8093 + depends: + - python + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python_abi 3.13.* *_cp313 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/psutil?source=hash-mapping + size: 228663 + timestamp: 1769678153829 +- pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + name: ptyprocess + version: 0.7.0 + sha256: 4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 +- conda: https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd8ed1ab_1.conda + sha256: 
a7713dfe30faf17508ec359e0bc7e0983f5d94682492469bd462cdaae9c64d83 + md5: 7d9daffbb8d8e0af0f769dbbcd173a54 + depends: + - python >=3.9 + license: ISC + purls: + - pkg:pypi/ptyprocess?source=hash-mapping + size: 19457 + timestamp: 1733302371990 +- pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + name: pure-eval + version: 0.2.3 + sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 + requires_dist: + - pytest ; extra == 'tests' +- conda: https://conda.anaconda.org/conda-forge/noarch/pure_eval-0.2.3-pyhd8ed1ab_1.conda + sha256: 71bd24600d14bb171a6321d523486f6a06f855e75e547fa0cb2a0953b02047f0 + md5: 3bfdfb8dbcdc4af1ae3f9a8eb3948f04 + depends: + - python >=3.9 + license: MIT + license_family: MIT + purls: + - pkg:pypi/pure-eval?source=hash-mapping + size: 16668 + timestamp: 1733569518868 +- pypi: ./ + name: pyearthtools-bundled-persistence + version: 0.6.0 + sha256: 88d20e73ba2c4cbde71d4a3e3381e5dd361a52f021fe36e79d09d3f9ed1b4cf4 + requires_dist: + - pyearthtools-zoo>=0.5.0 + - pyearthtools-data>=0.5.0 + - pyearthtools-pipeline>=0.5.0 + - hydra-core + requires_python: '>=3.11,<3.14' +- pypi: https://files.pythonhosted.org/packages/b3/f8/f47b90fbeaf36e112b1a93fc313d5f0bc9f0051ae8be734173787a00271a/pyearthtools_data-0.5.1-py3-none-any.whl + name: pyearthtools-data + version: 0.5.1 + sha256: f930e2ff804686d94699c0a6cdc5bf3675f9f8df0f8abb4494198fe6ab1a3fbc + requires_dist: + - click + - filelock + - geopandas + - pyearthtools-utils>=0.5.0 + - pyyaml + - shapely + - tqdm + - urllib3 + - xarray[complete] + - cdsapi ; extra == 'all' + - eccodes ; extra == 'all' + - ecmwf-opendata ; extra == 'all' + - gcsfs ; extra == 'all' + - intake ; extra == 'all' + - intake-esm ; extra == 'all' + - zarr==2.* ; extra == 'all' + - cdsapi ; extra == 'download' + - eccodes ; extra == 'download' + - ecmwf-opendata ; extra == 'download' + - gcsfs ; extra == 'download' + - 
zarr==2.* ; extra == 'download' + - intake ; extra == 'intake' + - intake-esm ; extra == 'intake' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/f2/f8/beda8582d430075031ac8835aced207d7bc639469451c932fdf1c0b2ed5c/pyearthtools_pipeline-0.5.1-py3-none-any.whl + name: pyearthtools-pipeline + version: 0.5.1 + sha256: 7a02dd6dd91226452ffbc71cf43d8ec16118cd3fb456f8e9180446bd72a4c417 + requires_dist: + - einops + - graphviz + - pandas + - pyearthtools-data>=0.5.0 + - pyearthtools-utils>=0.5.0 + - xarray + - dask ; extra == 'all' + - distributed ; extra == 'all' + - healpy ; extra == 'all' + - pyearthtools-data[all] ; extra == 'all' + - reproject ; extra == 'all' + - dask ; extra == 'distributed' + - distributed ; extra == 'distributed' + - healpy ; extra == 'remapping' + - reproject ; extra == 'remapping' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/38/06/7ed1c4fad0195d7700b77df09dae83ce6658fa6e2d5bb0c92bec79d766d3/pyearthtools_training-0.5.1-py3-none-any.whl + name: pyearthtools-training + version: 0.5.1 + sha256: 14a999fb404182615cfabf62e1279276178ef56e672b801cfa3e7f12049f9350 + requires_dist: + - einops + - pyearthtools-pipeline>=0.5.0 + - pyearthtools-utils>=0.5.0 + - scikit-learn + - scipy + - lightning ; extra == 'all' + - piqa ; extra == 'all' + - scikit-learn ; extra == 'all' + - tensorboard ; extra == 'all' + - tensorly ; extra == 'all' + - torch ; extra == 'all' + - xgboost ; extra == 'all' + - lightning ; extra == 'lightning' + - piqa ; extra == 'lightning' + - tensorboard ; extra == 'lightning' + - tensorly ; extra == 'lightning' + - torch ; extra == 'lightning' + - onnx ; extra == 'onnx' + - onnxruntime ; extra == 'onnx' + - onnxruntime-gpu ; extra == 'onnx-gpu' + - lightning ; extra == 'pytorch' + - piqa ; extra == 'pytorch' + - tensorboard ; extra == 'pytorch' + - tensorly ; extra == 'pytorch' + - torch ; extra == 'pytorch' + - scikit-learn ; extra == 'xgboost' + - xgboost ; extra == 'xgboost' + 
requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/cf/fc/c774d872abe5ae0c4381c5cb1ed61240e682c44ed019f807e18be26a7882/pyearthtools_utils-0.5.1-py3-none-any.whl + name: pyearthtools-utils + version: 0.5.1 + sha256: 17eb312fb26edc3d38d1e2da1b23a482b89383c84d7e10de83ff8940b8a701b2 + requires_dist: + - ipython + - numpy + - pillow + - pyyaml + - scikit-learn + - tqdm + - xarray + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/a4/45/1cb45ccac7c5f728a363d17a145443ed1f66962d3224b8e1166a4fd7bae1/pyearthtools_zoo-0.5.1-py3-none-any.whl + name: pyearthtools-zoo + version: 0.5.1 + sha256: fa6960043c621366aa020e85ab4d4b3097242f0a624cb603454f85c5d5563b9c + requires_dist: + - click + - entrypoints + - multiurl + - pyearthtools-data>=0.5.0 + - pyearthtools-pipeline>=0.5.0 + - pyearthtools-training>=0.5.0 + - pyearthtools-utils>=0.5.0 + - tqdm + - black ; extra == 'testing' + - coverage ; extra == 'testing' + - pytest ; extra == 'testing' + - pytest-cov ; extra == 'testing' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl + name: pygments + version: 2.19.2 + sha256: 86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b + requires_dist: + - colorama>=0.4.6 ; extra == 'windows-terminal' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a + md5: 6b6ece66ebcae2d5f326c77ef2c5a066 + depends: + - python >=3.9 + license: BSD-2-Clause + license_family: BSD + purls: + - pkg:pypi/pygments?source=hash-mapping + size: 889287 + timestamp: 1750615908735 +- pypi: https://files.pythonhosted.org/packages/46/35/b874f79d03e9f900012cf609f7fff97b77164f2e14ee5aac282f8a999c1b/pyogrio-0.12.1-cp313-cp313-manylinux_2_28_x86_64.whl + name: pyogrio + version: 0.12.1 + sha256: 
0622bc1a186421547660271083079b38d42e6f868802936d8538c0b379f1ab6b + requires_dist: + - certifi + - numpy + - packaging + - cython>=3.1 ; extra == 'dev' + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-benchmark ; extra == 'benchmark' + - geopandas ; extra == 'geopandas' + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/f8/85/c2b1706e51942de19076eff082f8495e57d5151364e78b5bef4af4a1d94a/pyproj-3.7.2-cp313-cp313-manylinux_2_28_x86_64.whl + name: pyproj + version: 3.7.2 + sha256: 5141a538ffdbe4bfd157421828bb2e07123a90a7a2d6f30fa1462abcfb5ce681 + requires_dist: + - certifi + requires_python: '>=3.11' +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda + sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8 + md5: 461219d1a5bd61342293efa2c0c90eac + depends: + - __unix + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/pysocks?source=hash-mapping + size: 21085 + timestamp: 1733217331982 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda + sha256: 9e749fb465a8bedf0184d8b8996992a38de351f7c64e967031944978de03a520 + md5: 2b694bad8a50dc2f712f5368de866480 + depends: + - pygments >=2.7.2 + - python >=3.10 + - iniconfig >=1.0.1 + - packaging >=22 + - pluggy >=1.5,<2 + - tomli >=1 + - colorama >=0.4 + - exceptiongroup >=1 + - python + constrains: + - pytest-faulthandler >=2 + license: MIT + license_family: MIT + purls: + - pkg:pypi/pytest?source=hash-mapping + size: 299581 + timestamp: 1765062031645 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-cov-7.0.0-pyhcf101f3_1.conda + sha256: d0f45586aad48ef604590188c33c83d76e4fc6370ac569ba0900906b24fd6a26 + md5: 6891acad5e136cb62a8c2ed2679d6528 + depends: + - coverage >=7.10.6 + - pluggy >=1.2 + - pytest >=7 + - python >=3.10 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/pytest-cov?source=hash-mapping + size: 29016 + 
timestamp: 1757612051022 +- conda: https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.8.0-pyhd8ed1ab_0.conda + sha256: b7b58a5be090883198411337b99afb6404127809c3d1c9f96e99b59f36177a96 + md5: 8375cfbda7c57fbceeda18229be10417 + depends: + - execnet >=2.1 + - pytest >=7.0.0 + - python >=3.9 + constrains: + - psutil >=3.0 + license: MIT + license_family: MIT + purls: + - pkg:pypi/pytest-xdist?source=hash-mapping + size: 39300 + timestamp: 1751452761594 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.12-hc97d973_100_cp313.conda + build_number: 100 + sha256: 8a08fe5b7cb5a28aa44e2994d18dbf77f443956990753a4ca8173153ffb6eb56 + md5: 4c875ed0e78c2d407ec55eadffb8cf3d + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.3,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.51.2,<4.0a0 + - libuuid >=2.41.3,<3.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.5,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + purls: [] + size: 37364553 + timestamp: 1770272309861 + python_site_packages_path: lib/python3.13/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda + sha256: d6a17ece93bbd5139e02d2bd7dbfa80bee1a4261dced63f65f679121686bf664 + md5: 5b8d21249ff20967101ffa321cab24e8 + depends: + - python >=3.9 + - six >=1.5 + - python + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/python-dateutil?source=hash-mapping + size: 233310 + timestamp: 1751104122689 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + build_number: 8 + sha256: 210bffe7b121e651419cb196a2a63687b087497595c9be9d20ebe97dd06060a7 + md5: 94305520c52a4aa3f6c2b1ff6008d9f8 + constrains: + - python 3.13.* *_cp313 + license: BSD-3-Clause + license_family: 
BSD + purls: [] + size: 7002 + timestamp: 1752805902938 +- pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + name: pytz + version: '2025.2' + sha256: 5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00 +- pypi: https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: pyyaml + version: 6.0.3 + sha256: 0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6 + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py313h3dea7bd_1.conda + sha256: ef7df29b38ef04ec67a8888a4aa039973eaa377e8c4b59a7be0a1c50cd7e4ac6 + md5: f256753e840c3cd3766488c9437a8f8b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + - yaml >=0.2.5,<0.3.0a0 + license: MIT + license_family: MIT + purls: + - pkg:pypi/pyyaml?source=compressed-mapping + size: 201616 + timestamp: 1770223543730 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 345073 + timestamp: 1765813471974 +- pypi: https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl + name: requests + version: 2.32.5 + sha256: 2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6 + requires_dist: + - charset-normalizer>=2,<4 + - idna>=2.5,<4 + - urllib3>=1.21.1,<3 + - certifi>=2017.4.17 + - pysocks>=1.5.6,!=1.5.7 ; extra == 'socks' + - chardet>=3.0.2,<6 ; extra == 'use-chardet-on-py3' + 
requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.15.0-h40fa522_0.conda + noarch: python + sha256: fc456645570586c798d2da12fe723b38ea0d0901373fd9959cab914cbb19518b + md5: fe90be2abf12b301dde984719a02ca0b + depends: + - python + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - __glibc >=2.17 + license: MIT + license_family: MIT + purls: + - pkg:pypi/ruff?source=compressed-mapping + size: 9103793 + timestamp: 1770153712370 +- pypi: https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + name: scikit-learn + version: 1.8.0 + sha256: 8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e + requires_dist: + - numpy>=1.24.1 + - scipy>=1.10.0 + - joblib>=1.3.0 + - threadpoolctl>=3.2.0 + - numpy>=1.24.1 ; extra == 'build' + - scipy>=1.10.0 ; extra == 'build' + - cython>=3.1.2 ; extra == 'build' + - meson-python>=0.17.1 ; extra == 'build' + - numpy>=1.24.1 ; extra == 'install' + - scipy>=1.10.0 ; extra == 'install' + - joblib>=1.3.0 ; extra == 'install' + - threadpoolctl>=3.2.0 ; extra == 'install' + - matplotlib>=3.6.1 ; extra == 'benchmark' + - pandas>=1.5.0 ; extra == 'benchmark' + - memory-profiler>=0.57.0 ; extra == 'benchmark' + - matplotlib>=3.6.1 ; extra == 'docs' + - scikit-image>=0.22.0 ; extra == 'docs' + - pandas>=1.5.0 ; extra == 'docs' + - seaborn>=0.13.0 ; extra == 'docs' + - memory-profiler>=0.57.0 ; extra == 'docs' + - sphinx>=7.3.7 ; extra == 'docs' + - sphinx-copybutton>=0.5.2 ; extra == 'docs' + - sphinx-gallery>=0.17.1 ; extra == 'docs' + - numpydoc>=1.2.0 ; extra == 'docs' + - pillow>=10.1.0 ; extra == 'docs' + - pooch>=1.8.0 ; extra == 'docs' + - sphinx-prompt>=1.4.0 ; extra == 'docs' + - sphinxext-opengraph>=0.9.1 ; extra == 'docs' + - plotly>=5.18.0 ; extra == 'docs' + - polars>=0.20.30 ; extra == 'docs' + - sphinx-design>=0.6.0 ; extra == 'docs' + - 
sphinxcontrib-sass>=0.3.4 ; extra == 'docs' + - pydata-sphinx-theme>=0.15.3 ; extra == 'docs' + - sphinx-remove-toctrees>=1.0.0.post1 ; extra == 'docs' + - towncrier>=24.8.0 ; extra == 'docs' + - matplotlib>=3.6.1 ; extra == 'examples' + - scikit-image>=0.22.0 ; extra == 'examples' + - pandas>=1.5.0 ; extra == 'examples' + - seaborn>=0.13.0 ; extra == 'examples' + - pooch>=1.8.0 ; extra == 'examples' + - plotly>=5.18.0 ; extra == 'examples' + - matplotlib>=3.6.1 ; extra == 'tests' + - pandas>=1.5.0 ; extra == 'tests' + - pytest>=7.1.2 ; extra == 'tests' + - pytest-cov>=2.9.0 ; extra == 'tests' + - ruff>=0.11.7 ; extra == 'tests' + - mypy>=1.15 ; extra == 'tests' + - pyamg>=5.0.0 ; extra == 'tests' + - polars>=0.20.30 ; extra == 'tests' + - pyarrow>=12.0.0 ; extra == 'tests' + - numpydoc>=1.2.0 ; extra == 'tests' + - pooch>=1.8.0 ; extra == 'tests' + - conda-lock==3.0.1 ; extra == 'maintenance' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/63/1e/12fbf2a3bb240161651c94bb5cdd0eae5d4e8cc6eaeceb74ab07b12a753d/scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + name: scipy + version: 1.17.0 + sha256: 6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752 + requires_dist: + - numpy>=1.26.4,<2.7 + - pytest>=8.0.0 ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest-xdist ; extra == 'test' + - asv ; extra == 'test' + - mpmath ; extra == 'test' + - gmpy2 ; extra == 'test' + - threadpoolctl ; extra == 'test' + - scikit-umfpack ; extra == 'test' + - pooch ; extra == 'test' + - hypothesis>=6.30 ; extra == 'test' + - array-api-strict>=2.3.1 ; extra == 'test' + - cython ; extra == 'test' + - meson ; extra == 'test' + - ninja ; sys_platform != 'emscripten' and extra == 'test' + - sphinx>=5.0.0,<8.2.0 ; extra == 'doc' + - intersphinx-registry ; extra == 'doc' + - pydata-sphinx-theme>=0.15.2 ; extra == 'doc' + - sphinx-copybutton ; extra == 'doc' + - sphinx-design>=0.4.0 ; 
extra == 'doc' + - matplotlib>=3.5 ; extra == 'doc' + - numpydoc ; extra == 'doc' + - jupytext ; extra == 'doc' + - myst-nb>=1.2.0 ; extra == 'doc' + - pooch ; extra == 'doc' + - jupyterlite-sphinx>=0.19.1 ; extra == 'doc' + - jupyterlite-pyodide-kernel ; extra == 'doc' + - linkify-it-py ; extra == 'doc' + - tabulate ; extra == 'doc' + - click<8.3.0 ; extra == 'dev' + - spin ; extra == 'dev' + - mypy==1.10.0 ; extra == 'dev' + - typing-extensions ; extra == 'dev' + - types-psutil ; extra == 'dev' + - pycodestyle ; extra == 'dev' + - ruff>=0.12.0 ; extra == 'dev' + - cython-lint>=0.12.2 ; extra == 'dev' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/f2/a2/83fc37e2a58090e3d2ff79175a95493c664bcd0b653dd75cb9134645a4e5/shapely-2.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + name: shapely + version: 2.1.2 + sha256: 7ed1a5bbfb386ee8332713bf7508bc24e32d24b74fc9a7b9f8529a55db9f4ee6 + requires_dist: + - numpy>=1.21 + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - scipy-doctest ; extra == 'test' + - numpydoc==1.1.* ; extra == 'docs' + - matplotlib ; extra == 'docs' + - sphinx ; extra == 'docs' + - sphinx-book-theme ; extra == 'docs' + - sphinx-remove-toctrees ; extra == 'docs' + requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d + md5: 3339e3b65d58accf4ca4fb8748ab16b3 + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/six?source=hash-mapping + size: 18455 + timestamp: 1753199211006 +- conda: https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda + sha256: d1e3e06b5cf26093047e63c8cc77b70d970411c5cbc0cb1fad461a8a8df599f7 + md5: 0401a17ae845fa72c7210e206ec5647d + depends: + - python >=3.9 + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/sortedcontainers?source=hash-mapping + size: 
28657 + timestamp: 1738440459037 +- pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + name: stack-data + version: 0.6.3 + sha256: d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 + requires_dist: + - executing>=1.2.0 + - asttokens>=2.1.0 + - pure-eval + - pytest ; extra == 'tests' + - typeguard ; extra == 'tests' + - pygments ; extra == 'tests' + - littleutils ; extra == 'tests' + - cython ; extra == 'tests' +- conda: https://conda.anaconda.org/conda-forge/noarch/stack_data-0.6.3-pyhd8ed1ab_1.conda + sha256: 570da295d421661af487f1595045760526964f41471021056e993e73089e9c41 + md5: b1b505328da7a6b246787df4b5a49fbc + depends: + - asttokens + - executing + - pure_eval + - python >=3.9 + license: MIT + license_family: MIT + purls: + - pkg:pypi/stack-data?source=hash-mapping + size: 26988 + timestamp: 1733569565672 +- conda: https://conda.anaconda.org/conda-forge/noarch/tblib-3.2.2-pyhcf101f3_0.conda + sha256: 6b549360f687ee4d11bf85a6d6a276a30f9333df1857adb0fe785f0f8e9bcd60 + md5: f88bb644823094f436792f80fba3207e + depends: + - python >=3.10 + - python + license: BSD-2-Clause + license_family: BSD + purls: + - pkg:pypi/tblib?source=hash-mapping + size: 19397 + timestamp: 1762956379123 +- pypi: https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl + name: threadpoolctl + version: 3.6.0 + sha256: 43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + purls: [] + size: 
3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.4.0-pyhcf101f3_0.conda + sha256: 62940c563de45790ba0f076b9f2085a842a65662268b02dd136a8e9b1eaf47a8 + md5: 72e780e9aa2d0a3295f59b1874e3768b + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/tomli?source=compressed-mapping + size: 21453 + timestamp: 1768146676791 +- conda: https://conda.anaconda.org/conda-forge/noarch/toolz-1.1.0-pyhd8ed1ab_1.conda + sha256: 4e379e1c18befb134247f56021fdf18e112fb35e64dd1691858b0a0f3bea9a45 + md5: c07a6153f8306e45794774cf9b13bd32 + depends: + - python >=3.10 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/toolz?source=hash-mapping + size: 53978 + timestamp: 1760707830681 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.3-py313h07c4f96_0.conda + sha256: 6006d4e5a6ff99be052c939e43adee844a38f2dc148f44a7c11aa0011fd3d811 + md5: 82da2dcf1ea3e298f2557b50459809e0 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + license: Apache-2.0 + license_family: Apache + purls: + - pkg:pypi/tornado?source=hash-mapping + size: 878109 + timestamp: 1765458900582 +- pypi: https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl + name: tqdm + version: 4.67.3 + sha256: ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf + requires_dist: + - colorama ; sys_platform == 'win32' + - importlib-metadata ; python_full_version < '3.8' + - pytest>=6 ; extra == 'dev' + - pytest-cov ; extra == 'dev' + - pytest-timeout ; extra == 'dev' + - pytest-asyncio>=0.24 ; extra == 'dev' + - nbval ; extra == 'dev' + - requests ; extra == 'discord' + - slack-sdk ; extra == 'slack' + - requests ; extra == 'telegram' + - ipywidgets>=6 ; extra == 'notebook' + requires_python: '>=3.7' +- pypi: 
https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + name: traitlets + version: 5.14.3 + sha256: b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f + requires_dist: + - myst-parser ; extra == 'docs' + - pydata-sphinx-theme ; extra == 'docs' + - sphinx ; extra == 'docs' + - argcomplete>=3.0.3 ; extra == 'test' + - mypy>=1.7.0 ; extra == 'test' + - pre-commit ; extra == 'test' + - pytest-mock ; extra == 'test' + - pytest-mypy-testing ; extra == 'test' + - pytest>=7.0,<8.2 ; extra == 'test' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959 + md5: 019a7385be9af33791c989871317e1ed + depends: + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/traitlets?source=hash-mapping + size: 110051 + timestamp: 1733367480074 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 + md5: 0caa1af407ecff61170c9437a808404d + depends: + - python >=3.10 + - python + license: PSF-2.0 + license_family: PSF + purls: + - pkg:pypi/typing-extensions?source=hash-mapping + size: 51692 + timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + purls: [] + size: 119135 + timestamp: 1767016325805 +- pypi: https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl + name: urllib3 + version: 2.6.3 + sha256: bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4 + requires_dist: + - brotli>=1.2.0 ; 
platform_python_implementation == 'CPython' and extra == 'brotli' + - brotlicffi>=1.2.0.0 ; platform_python_implementation != 'CPython' and extra == 'brotli' + - h2>=4,<5 ; extra == 'h2' + - pysocks>=1.5.6,!=1.5.7,<2.0 ; extra == 'socks' + - backports-zstd>=1.0.0 ; python_full_version < '3.14' and extra == 'zstd' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda + sha256: af641ca7ab0c64525a96fd9ad3081b0f5bcf5d1cbb091afb3f6ed5a9eee6111a + md5: 9272daa869e03efe68833e3dc7a02130 + depends: + - backports.zstd >=1.0.0 + - brotli-python >=1.2.0 + - h2 >=4,<5 + - pysocks >=1.5.6,<2.0,!=1.5.7 + - python >=3.10 + license: MIT + license_family: MIT + purls: + - pkg:pypi/urllib3?source=hash-mapping + size: 103172 + timestamp: 1767817860341 +- pypi: https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl + name: wcwidth + version: 0.6.0 + sha256: 1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.6.0-pyhd8ed1ab_0.conda + sha256: e298b508b2473c4227206800dfb14c39e4b14fd79d4636132e9e1e4244cdf4aa + md5: c3197f8c0d5b955c904616b716aca093 + depends: + - python >=3.10 + license: MIT + license_family: MIT + purls: + - pkg:pypi/wcwidth?source=compressed-mapping + size: 71550 + timestamp: 1770634638503 +- conda: https://conda.anaconda.org/conda-forge/noarch/xarray-2026.1.0-pyhcf101f3_0.conda + sha256: 878d190db1a78f1e3fe90497e053a0dc0941937e82378cc990f43115ffe2bee6 + md5: 397276eff153e81b0e7128acc56deb32 + depends: + - python >=3.11 + - numpy >=1.26 + - packaging >=24.1 + - pandas >=2.2 + - python + constrains: + - bottleneck >=1.4 + - cartopy >=0.23 + - cftime >=1.6 + - dask-core >=2024.6 + - distributed >=2024.6 + - flox >=0.9 + - h5netcdf >=1.3 + - h5py >=3.11 + - hdf5 >=1.14 + - iris >=3.9 + - matplotlib-base >=3.8 + - nc-time-axis 
>=1.4 + - netcdf4 >=1.6.0 + - numba >=0.60 + - numbagg >=0.8 + - pint >=0.24 + - pydap >=3.5.0 + - scipy >=1.13 + - seaborn-base >=0.13 + - sparse >=0.15 + - toolz >=0.12 + - zarr >=2.18 + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/xarray?source=compressed-mapping + size: 1010206 + timestamp: 1769665430320 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad + md5: a77f85f77be52ff59391544bfe73390a + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: MIT + license_family: MIT + purls: [] + size: 85189 + timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zict-3.0.0-pyhd8ed1ab_1.conda + sha256: 5488542dceeb9f2874e726646548ecc5608060934d6f9ceaa7c6a48c61f9cc8d + md5: e52c2ef711ccf31bb7f70ca87d144b9e + depends: + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/zict?source=hash-mapping + size: 36341 + timestamp: 1733261642963 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + sha256: b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae + md5: 30cd29cb87d819caead4d55184c1d115 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/zipp?source=hash-mapping + size: 24194 + timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 601375 + timestamp: 1764777111296 diff --git a/packages/bundled_models/persistence/pyproject.toml b/packages/bundled_models/persistence/pyproject.toml index acb42bae..15b2f5d9 100644 --- a/packages/bundled_models/persistence/pyproject.toml +++ 
b/packages/bundled_models/persistence/pyproject.toml @@ -2,24 +2,23 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" - [project] name = "pyearthtools-bundled-persistence" version = "0.6.0" description = "Persistence Bundled Model" readme = "README.md" requires-python = ">=3.11, <3.14" -keywords = ["fourcastnext"] +keywords = ["persistence", "pyearthtools", "models"] maintainers = [ - {name = "Tennessee Leeuwenburg", email = "tennessee.leeuwenburg@bom.gov.au"} - {name = "Nikeeth Ramanathan", email = "nikeeth.ramanathan@gmail.com"} + {name = "Tennessee Leeuwenburg", email = "tennessee.leeuwenburg@bom.gov.au"}, + {name = "Nikeeth Ramanathan", email = "nikeeth.ramanathan@gmail.com"}, ] classifiers = [ - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] dependencies = [ 'pyearthtools.zoo>=0.5.0', @@ -27,7 +26,13 @@ dependencies = [ 'pyearthtools.pipeline>=0.5.0', 'hydra-core', ] - +[dependency-groups] +dev = [ + "pytest>=8.4.2", + "ruff", + "pytest-cov", + "pytest-xdist", +] [project.urls] homepage = "https://pyearthtools.readthedocs.io/" @@ -35,7 +40,7 @@ documentation = "https://pyearthtools.readthedocs.io/" repository = "https://github.com/ACCESS-Community-Hub/PyEarthTools" [project.entry-points."pyearthtools.zoo.model"] -Global_FCNXT = "fourcastnext.registered_model:Persistence" +Global_PERSIST = "persistence.registered_model:Persistence" [tool.isort] profile = "black" @@ -51,7 +56,36 @@ warn_unused_configs = true ignore_missing_imports = true [tool.hatch.version] +# TODO: is this the right path? 
path = "src/pyearthtools/pipeline/__init__.py" [tool.hatch.build.targets.wheel] packages = ["src/pyearthtools/"] + +[tool.pixi.workspace] +channels = ["conda-forge"] +platforms = ["linux-64"] + +[tool.pixi.pypi-dependencies] +pyearthtools-bundled-persistence = { path = ".", editable = true } + +[tool.pixi.tasks] + +[tool.pixi.dependencies] +python = ">=3.11,<3.14" +xarray = ">=2026.1.0,<2027" + +[tool.pixi.feature.testing.dependencies] +pytest = ">=9.0.2,<10" +pytest-cov = ">=7.0.0,<8" +pytest-xdist = ">=3.8.0,<4" +ruff = ">=0.15.0,<0.16" +ipython = ">=9.10.0,<10" + +[tool.pixi.feature.dask.dependencies] +dask-core = "*" +distributed = "*" + +[tool.pixi.environments] +dask = ["dask"] +dev = ["dask", "testing"] diff --git a/packages/bundled_models/persistence/report.xml b/packages/bundled_models/persistence/report.xml new file mode 100644 index 00000000..608b3e16 --- /dev/null +++ b/packages/bundled_models/persistence/report.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py index e69de29b..a2fa5d0e 100644 --- a/packages/bundled_models/persistence/src/persistence/__init__.py +++ b/packages/bundled_models/persistence/src/persistence/__init__.py @@ -0,0 +1,11 @@ +from persistence._interface import ( + PersistenceMethod, + PersistenceDataChunk, + PersistenceChunker, +) + +__all__ = [ + "PersistenceMethod", + "PersistenceDataChunk", + "PersistenceChunker", +] diff --git a/packages/bundled_models/persistence/src/persistence/_daskconfig.py b/packages/bundled_models/persistence/src/persistence/_daskconfig.py new file mode 100644 index 00000000..0375f29b --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/_daskconfig.py @@ -0,0 +1,50 @@ +from contextlib import contextmanager + + +# default scheduler string to set "single-threaded" mode. 
+_STR_DASK_SYNC_SCHEDULER="synchronous" + +@contextmanager +def _set_synchronous_dask(): + """ + Wrapper to set `dask` to single-threaded mode. Note: "single-threaded" in `dask`-land + (specifically) is the same as "synchronous". + + This handles the case where dask is _not_ installed. In which case it does a pass-through. + + Example: + def do_stuff(...): + # I can now (optionally) fork other processes here - without confusing dask. + ... + + with _set_synchronous_dask(): + do_stuff(...) + """ + try: + import dask + + # store state - note: scheduler config is not guarenteed to exist by default + flag_nonexistant_scheduler_config = False + state_scheduler_type = None + try: + state_scheduler_type = dask.config.get("scheduler") + except KeyError: + flag_nonexistant_scheduler_config = True + + # set state to desired config + dask.config.set(scheduler=_STR_DASK_SYNC_SCHEDULER) + + # release scope to caller context + yield + + # retrieve current stack after context execution + if flag_nonexistant_scheduler_config: + # scheduler state did not exist so delete it, note: "Not exist" is different from "None" + del dask.config['scheduler'] + else: + # otherwise revert it to normal + dask.config.set(scheduler=state_scheduler_type) + + except ImportError: + yield + diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index 80717c24..9f5f3b5a 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -1,30 +1,29 @@ from enum import Enum from dataclasses import dataclass +from collections.abc import Callable +from contextlib import contextmanager import numpy as np import xarray as xr -_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 -_DEFAULT_CHUNK_SIZE = -@dataclass -class PersistenceDataChunk: - """ - Sample usage pattern: - 1. split dataset into chunks - 2. 
represent chunk as 'PersistenceDataChunk' - 3. perform computation - 4. grab output chunk and insert into destination dataset +_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 # 50% sparsity is reasonable, though some data like + # precipitation may be more sparse than this +_MAX_NUM_CHUNKS = 1000 # unlikely to have more than 1000 processes for persistence, due to + # diminishing returns - even on a supercomputer - IMPORTANT: data should not be chunked over time. - """ - chunk: np.ndarray # ndarray including time axis. Sorted ascending in time. The latest - # data point is assumed to be the "reference" time. - time_axis: int # the time axis - this will be flattened - time_lookback: int # number of lookback indices - persistence_method: PersistenceMethod # which method to use to calculate persistence +_mod_index: Callable[[int, int], np.uint] = np.mod +_mod_index.__doc__ = ( +""" +Maps negative integer to a positive integer element in a ring. The ring has a domain of +`[0, (cardinality - 1)]`. +sample usage: get the positive equivilent index for an array. E.g. "-1" in python is the last +element, this will standardize it to len(.) - 1. The reason for doing this is to make sure index +comparisons are accurately represented. 
+""" +) class PersistenceMethod(Enum): """ @@ -52,12 +51,12 @@ class PersistenceMethod(Enum): def num_time_indices_required(self): match self: - case PersistenceData.MOST_RECENT: + case PersistenceMethod.MOST_RECENT: return 1 - case PersistenceData.MEDIAN_OF_THREE: + case PersistenceMethod.MEDIAN_OF_THREE: return 3 case _: - raise NotImplementedError("Invalid persistence method.") + raise NotImplementedError("PersistenceMethod: Invalid persistence method.") def min_lookback(self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER): """ @@ -65,7 +64,28 @@ def min_lookback(self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPL By default we assume a 50% sparsity and require at least double the number of values required for the compuation. """ - self.time_indices_required() * sparsity_multiplier + if sparsity_multiplier < 1: + raise ValueError("PersistenceMethod: Sparsity multiplier must be >= 1") + + return self.num_time_indices_required() * sparsity_multiplier + + + +@dataclass +class PersistenceDataChunk: + """ + Sample usage pattern: + 1. split dataset into chunks + 2. represent chunk as 'PersistenceDataChunk' + 3. perform computation + 4. grab output chunk and insert into destination dataset + + IMPORTANT: data should not be chunked over time. + """ + arr_chunk: np.ndarray # ndarray including time axis. Sorted ascending in time. The latest + # data point is assumed to be the "reference" time. + idx_time: int # the time axis - this will be flattened + method: PersistenceMethod # which method to use to calculate persistence @dataclass class PersistenceChunker: @@ -90,41 +110,79 @@ class PersistenceChunker: FUTUREWORK: usage of rust and/or parquet for dataloading and intermediate caching will be explored in order to speed up this process. 
""" - da_lazy: xr.DataArray # lazy loaded data array - num_chunks: int # number of chunks to use - time_dimname: str # the time dimension name normally "time" - chunk_dimname: str = None # the dimension name to chunk along, or default to a non-time - # dimension + da_lazy: xr.DataArray # lazy loaded data array + num_chunks: int # number of chunks to use + method: PersistenceMethod # the persistence method to use (needed for lookback slicing) + idx_time: int = None # axis index for time + idx_chunk: int = None # axis index for chunk + dimname_time: str = None # the time dimension name normally "time" + dimname_chunk: str = None # the dimension name to chunk along, or default to a non-time + # dimension def __post_init__(self): - # check time dimension - if self.time_dimname not in self.da_lazy.dims: - raise KeyError(f"PersistenceModel: time dimension {self.time_dimname} not found in input array") - self.time_idx = self.da_lazy.dims.index(time_dimname) - - # check chunk dimension, arbitrarily select it - # NOTE: modulus ensures "-1" corrects to last index. "-1" is usually fine in python, - # however, we're doing index comparisons below and need the non-zero equivilent. 
- self.chunk_idx = (self.time_idx - 1) % len(self.da_lazy.dims) - if self.chunk_dimname is not None: - if self.chunk_dimname not in self.da_lazy.dims: - raise KeyError(f"PersistenceModel: chunk dimension {self.chunk_dimname} not found in input array") - self.chunk_idx = self.da_lazy.dims.index(chunk_dimname) - - # time dimension cannot be used for chunking (since its needed for aggregation) - if self.time_idx == self.chunk_idx: - raise ValueError("PersistenceModel: cannot chunk over time dimension") - + # --- handle time dimension --- + if self.idx_time is None: + if self.dimname_time not in self.da_lazy.dims: + raise KeyError(f"PersistenceChunker: time dimension {self.dimname_time} not found in input array") + self.idx_time = self.da_lazy.dims.index(self.dimname_time) + + # --- handle chunk dimension --- + if self.idx_chunk is None: + if self.dimname_chunk is None: + # --- default chunk dimension --- + # attempt to choose the previous index as the chunk index, so that it doesn't + # overlap with the time index. modulo operation is used so that negative indices + # are cycled. 
+ self.idx_chunk = _modidx(self.idx_time - 1, self.da_lazy.dims) + dimkeys = self.dims.keys() + self.dimname_chunk = dimkeys[self.idx_chunk] + else: + # --- check and update chunk dimension --- + if self.dimname_chunk not in self.da_lazy.dims: + raise KeyError(f"PersistenceChunker: chunk dimension {self.dimname_chunk} not found in input array") + self.idx_chunk = self.da_lazy.dims.index(self.dimname_chunk) + + # --- check chunk/time index compatibilty --- + if self.idx_time == self.idx_chunk: + raise ValueError("PersistenceChunker: cannot chunk over time dimension") + + # --- check chunk size --- if self.num_chunks < 1: - raise ValueError("PersistenceModel: number of chunks must be greater than or equal to 1") + raise ValueError("PersistenceChunker: number of chunks must be greater than or equal to 1") + + if self.num_chunks > self.da_lazy.shape[self.idx_chunk]: + raise ValueError("PersistenceChunker: num_chunks must be less than the axis length") + + if self.num_chunks > _MAX_NUM_CHUNKS: + raise ValueError(f"PersistenceChunker: num_chunks is too large. Must be <{_MAX_NUM_CHUNKS}") + + # safety (tests only): check that indices are appropriately set + assert self.idx_chunk is not None + assert self.idx_time is not None + + # safety: these are usually handled by the underlying compute indexers, + # which will spit out an error if the user inputs the wrong index. + assert self.idx_time in self.da_lazy + assert self.idx_chunk in self.da_lazy def generate_chunks(self): """ - Generator that extracts chunks + Generator that extracts chunks from the underlying data array. 
+ sample application: + - minimize memory bloat (lazy loading chunks) + - better utilization of CPU by splitting embarassingly parallel dimensions across processes + (multi processing) """ - # --- TODO: below is psuedocode - start_slice, end_slice = self.get_chunk_slice(chunk_counter) + + # --- iterate and split chunks --- chunk_counter = 0 - while chunk_counter < self.num_chunks - yield self.da_lazy + while chunk_counter < self.num_chunks: + slice_chunk = self._get_chunk_slice(chunk_counter) + slice_time = self._get_time_slice(chunk_counter) + # --- yield reference to array --- + # still lazy at this point, until it is loaded and moved/copied into the forked process. + yield self.da_lazy.isel({ + self.dimname_time: slice_time, + self.dimname_chunk: slice_chunk, + }) chunk_counter += 1 diff --git a/packages/bundled_models/persistence/tests/test__interface.py b/packages/bundled_models/persistence/tests/test__interface.py new file mode 100644 index 00000000..9641c215 --- /dev/null +++ b/packages/bundled_models/persistence/tests/test__interface.py @@ -0,0 +1,95 @@ +""" +TODO: + + 1. Test that data chunking works as expected, given a time index + 2. Test that data retrieval (lookback) works as expected given sparsity multiplier + 3. (mock) Test median of 3 is activated as expected. (call - not computation) + 4. (mock) Test latest is activated as expected + 5. 
test that appropriate errors are thrown for various invalid inputs +""" + +import numpy as np +import xarray as xr +import persistence as pet_persist + + +def test_persistence_method_obj(): + """ + Basic test to check object creation: PersistenceMethod + """ + persistence_mostrecent = pet_persist.PersistenceMethod.MOST_RECENT + persistence_median = pet_persist.PersistenceMethod.MEDIAN_OF_THREE + + # sense checks - mostrecent + assert persistence_mostrecent.num_time_indices_required() == 1 + assert persistence_mostrecent.min_lookback() == 2 + assert persistence_mostrecent.min_lookback(3) == 3 # 3 * 1 + + # sense checks - median + assert persistence_median.num_time_indices_required() == 3 + assert persistence_median.min_lookback() == 6 + assert persistence_median.min_lookback(50) == 150 # 3 * 50 + +def test_persistence_data_chunk_obj(): + arr_chunk = np.random.randint(0, 10, (2,5,8)) + persistence_method = pet_persist.PersistenceMethod.MOST_RECENT + idx_time: int = 1 # len = 5 + datachunk = pet_persist.PersistenceDataChunk( + arr_chunk=arr_chunk, + idx_time=idx_time, + method=persistence_method, + ) + assert datachunk.arr_chunk.shape.index(5) == datachunk.idx_time + assert datachunk.method.min_lookback() == 2 + +def test_persistence_chunker_obj(): + """ + Basic test to check object creation: PersistenceChunker + """ + # --- index variant --- + da = xr.DataArray( + np.random.randint(0, 10, (2,5,8)), + dims=["x0", "time", "x2"], + ) + idx_time: int = 1 # len = 5 + idx_chunk: int = 2 # len = 8 + num_chunks: int = 4 # each chunk is 2x5x2 + persistence_method = pet_persist.PersistenceMethod.MOST_RECENT + + chunker = pet_persist.PersistenceChunker( + da_lazy=da, + idx_time=idx_time, + idx_chunk=idx_chunk, + num_chunks=num_chunks, + method=persistence_method, + ) + + # sense checks + assert da.shape.index(5) == chunker.idx_time + assert da.shape.index(8) == chunker.idx_chunk + assert chunker.num_chunks == 4 + assert chunker.method.num_time_indices_required() == 1 + + # --- 
name variant --- + da = xr.DataArray( + np.random.randint(0, 10, (2,5,8)), + dims=["x0", "time", "x2"], + ) + dimname_time: str = "time" # len = 5 + dimname_chunk: str = "x2" # len = 8 + num_chunks: int = 4 # each chunk is 2x5x2 + persistence_method = pet_persist.PersistenceMethod.MOST_RECENT + + chunker = pet_persist.PersistenceChunker( + da_lazy=da, + dimname_time=dimname_time, + dimname_chunk=dimname_chunk, + num_chunks=num_chunks, + method=persistence_method, + ) + + # sense checks + assert da.shape.index(5) == chunker.idx_time + assert da.shape.index(8) == chunker.idx_chunk + assert chunker.num_chunks == 4 + assert chunker.method.num_time_indices_required() == 1 From 102f3785ba3c94d6a28c615d46bfd2f7ebb6f634 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 16 Feb 2026 14:53:28 +1100 Subject: [PATCH 03/28] fix dask shinnenigans from tests --- .../bundled_models/persistence/.gitignore | 1 + packages/bundled_models/persistence/pixi.lock | 1065 ++++++++++++++++- .../bundled_models/persistence/pyproject.toml | 4 + .../src/persistence/_daskconfig.py | 26 +- .../persistence/tests/test__daskconfig.py | 120 ++ .../persistence/tests/test__interface.py | 8 +- 6 files changed, 1194 insertions(+), 30 deletions(-) create mode 100644 packages/bundled_models/persistence/tests/test__daskconfig.py diff --git a/packages/bundled_models/persistence/.gitignore b/packages/bundled_models/persistence/.gitignore index ae849e65..75bc4918 100644 --- a/packages/bundled_models/persistence/.gitignore +++ b/packages/bundled_models/persistence/.gitignore @@ -1,3 +1,4 @@ # pixi environments .pixi/* !.pixi/config.toml +report.xml diff --git a/packages/bundled_models/persistence/pixi.lock b/packages/bundled_models/persistence/pixi.lock index 515422be..c76a1f28 100644 --- a/packages/bundled_models/persistence/pixi.lock +++ b/packages/bundled_models/persistence/pixi.lock @@ -11,9 +11,28 @@ environments: linux-64: - conda: 
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.3-hef928c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.13-h2c9d079_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.6-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-h8b1a151_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.7-h28f887f_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.7-ha8fc4e3_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.23.3-hdaf4b65_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.3-hc63082f_11.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.11.3-h06ab39a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-h8b1a151_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-h8b1a151_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.35.4-h8824e59_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.606-h20b40b1_10.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.16.2-h206d751_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.13.3-hed0cdb0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.16.0-hdd73cc9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.12.0-ha7a2c86_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.14.0-h52c5a47_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/backports.zstd-1.3.0-py313h18e8e13_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py313hf159716_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.2-pyhcf101f3_1.conda @@ -21,46 +40,90 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/dask-core-2026.1.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/distributed-2026.1.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.2-h33c6efd_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260107.1-cxx17_h7b12aa8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-23.0.0-h2603568_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-23.0.0-h635bf11_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-23.0.0-h53684a4_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-23.0.0-h635bf11_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-23.0.0-hb4dd7c2_3_cpu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.2.0-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_17.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_17.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.39.0-h9d11ab5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.39.0-hdbdcf42_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.78.0-h1d1128b_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-h9692893_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-23.0.0-h7376487_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-6.33.5-h2b00c02_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.11.05-h0dc7533_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.2.0-hdf11a46_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.22.0-h454ac66_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.11.3-hfe17d71_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.1-hca6bf5a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.1-he237659_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.2-py313h7037e92_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h54a6638_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/orc-2.2.2-hbb90d81_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-3.0.0-py313hbfd7664_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.2.2-py313h54dd161_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/pyarrow-23.0.0-py313h78bf25f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-23.0.0-py313h98bfbea_0_cpu.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.12-hc97d973_100_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py313h3dea7bd_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.11.05-h5301d42_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/s2n-1.6.2-he8a4886_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.2-h03e3b7b_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tblib-3.2.2-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda @@ -72,6 +135,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zict-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - pypi: https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz - pypi: 
https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl @@ -223,9 +287,28 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.3-hef928c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.13-h2c9d079_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.6-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-h8b1a151_9.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.7-h28f887f_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.7-ha8fc4e3_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.23.3-hdaf4b65_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.3-hc63082f_11.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.11.3-h06ab39a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-h8b1a151_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-h8b1a151_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.35.4-h8824e59_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.606-h20b40b1_10.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.16.2-h206d751_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.13.3-hed0cdb0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.16.0-hdd73cc9_1.conda 
+ - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.12.0-ha7a2c86_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.14.0-h52c5a47_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/backports.zstd-1.3.0-py313h18e8e13_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py313hf159716_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.2-pyhcf101f3_1.conda @@ -239,6 +322,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/executing-2.2.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2026.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda @@ -249,40 +334,79 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/ipython_pygments_lexers-1.1.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jedi-0.19.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260107.1-cxx17_h7b12aa8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-23.0.0-h2603568_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-23.0.0-h635bf11_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-23.0.0-h53684a4_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-23.0.0-h635bf11_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-23.0.0-hb4dd7c2_3_cpu.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.2.0-hb03c661_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_17.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_17.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.39.0-h9d11ab5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.39.0-hdbdcf42_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.78.0-h1d1128b_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-h9692893_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-23.0.0-h7376487_3_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-6.33.5-h2b00c02_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.11.05-h0dc7533_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda + 
- conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.2.0-hdf11a46_17.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.22.0-h454ac66_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.11.3-hfe17d71_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.1-hca6bf5a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.1-he237659_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.2.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.2-py313h7037e92_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h54a6638_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/orc-2.2.2-hbb90d81_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-3.0.0-py313hbfd7664_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/parso-0.8.6-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhf9edf01_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.52-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.2.2-py313h54dd161_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ptyprocess-0.7.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pure_eval-0.2.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-23.0.0-py313h78bf25f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-23.0.0-py313h98bfbea_0_cpu.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytest-9.0.2-pyhcf101f3_0.conda @@ -292,9 +416,12 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py313h3dea7bd_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.11.05-h5301d42_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ruff-0.15.0-h40fa522_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/s2n-1.6.2-he8a4886_1.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.2-h03e3b7b_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/stack_data-0.6.3-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tblib-3.2.2-pyhcf101f3_0.conda @@ -311,6 +438,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zict-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - pypi: https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz - pypi: https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl @@ -393,6 +521,269 @@ packages: - pkg:pypi/asttokens?source=hash-mapping size: 28797 timestamp: 1763410017955 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.3-hef928c7_0.conda + sha256: d9c5babed03371448bb0dc91a1573c80d278d1222a3b0accef079ed112e584f9 + md5: bdd464b33f6540ed70845b946c11a7b8 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-http >=0.10.7,<0.10.8.0a0 + - aws-c-sdkutils >=0.2.4,<0.2.5.0a0 + - aws-c-cal >=0.9.13,<0.9.14.0a0 + - aws-c-io >=0.23.3,<0.23.4.0a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 133443 + timestamp: 1764765235190 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.13-h2c9d079_1.conda + sha256: 
f21d648349a318f4ae457ea5403d542ba6c0e0343b8642038523dd612b2a5064 + md5: 3c3d02681058c3d206b562b2e3bc337f + depends: + - __glibc >=2.17,<3.0.a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + - libgcc >=14 + - openssl >=3.5.4,<4.0a0 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 56230 + timestamp: 1764593147526 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.6-hb03c661_0.conda + sha256: 926a5b9de0a586e88669d81de717c8dd3218c51ce55658e8a16af7e7fe87c833 + md5: e36ad70a7e0b48f091ed6902f04c23b8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 239605 + timestamp: 1763585595898 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-h8b1a151_9.conda + sha256: 96edccb326b8c653c8eb95a356e01d4aba159da1a97999577b7dd74461b040b4 + md5: f7ec84186dfe7a9e3a9f9e5a4d023e75 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 22272 + timestamp: 1764593718823 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.7-h28f887f_1.conda + sha256: a5b151db1c8373b6ca2dacea65bc8bda02791a43685eebfa4ea987bb1a758ca9 + md5: 7b8e3f846353b75db163ad93248e5f9d + depends: + - libgcc >=14 + - libstdcxx >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-io >=0.23.3,<0.23.4.0a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + - aws-checksums >=0.2.7,<0.2.8.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 58806 + timestamp: 1764675439822 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.7-ha8fc4e3_5.conda + sha256: 5527224d6e0813e37426557d38cb04fed3753d6b1e544026cfbe2654f5e556be + md5: 3028f20dacafc00b22b88b324c8956cc + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-cal >=0.9.13,<0.9.14.0a0 + - aws-c-io >=0.23.3,<0.23.4.0a0 + - aws-c-compression >=0.3.1,<0.3.2.0a0 + - aws-c-common 
>=0.12.6,<0.12.7.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 224580 + timestamp: 1764675497060 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.23.3-hdaf4b65_5.conda + sha256: 07d7f2a4493ada676084c3f4313da1fab586cf0a7302572c5d8dde6606113bf4 + md5: 132e8f8f40f0ffc0bbde12bb4e8dd1a1 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + - s2n >=1.6.2,<1.6.3.0a0 + - aws-c-cal >=0.9.13,<0.9.14.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 181361 + timestamp: 1765168239856 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.3-hc63082f_11.conda + sha256: fb102b0346a1f5c4f3bb680ec863c529b0333fa4119d78768c3e8a5d1cc2c812 + md5: 6a653aefdc5d83a4f959869d1759e6e3 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-io >=0.23.3,<0.23.4.0a0 + - aws-c-http >=0.10.7,<0.10.8.0a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 216454 + timestamp: 1764681745427 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.11.3-h06ab39a_1.conda + sha256: 8de2292329dce2fd512413d83988584d616582442a07990f67670f9bc793a98b + md5: 3689a4290319587e3b54a4f9e68f70c8 + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + - openssl >=3.5.4,<4.0a0 + - aws-c-io >=0.23.3,<0.23.4.0a0 + - aws-c-http >=0.10.7,<0.10.8.0a0 + - aws-c-auth >=0.9.3,<0.9.4.0a0 + - aws-checksums >=0.2.7,<0.2.8.0a0 + - aws-c-cal >=0.9.13,<0.9.14.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 151382 + timestamp: 1765174166541 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-h8b1a151_4.conda + sha256: 9d62c5029f6f8219368a8665f0a549da572dc777f52413b7d75609cacdbc02cc + md5: c7e3e08b7b1b285524ab9d74162ce40b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - aws-c-common >=0.12.6,<0.12.7.0a0 + license: Apache-2.0 + license_family: 
APACHE + purls: [] + size: 59383 + timestamp: 1764610113765 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-h8b1a151_5.conda + sha256: a8693d2e06903a09e98fe724ed5ec32e7cd1b25c405d754f0ab7efb299046f19 + md5: 68da5b56dde41e172b7b24f071c4b392 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - aws-c-common >=0.12.6,<0.12.7.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 76915 + timestamp: 1764593731486 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.35.4-h8824e59_0.conda + sha256: 524fc8aa2645e5701308b865bf5c523257feabc6dfa7000cb8207ccfbb1452a1 + md5: 113b9d9913280474c0868b0e290c0326 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + - aws-c-event-stream >=0.5.7,<0.5.8.0a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + - aws-c-cal >=0.9.13,<0.9.14.0a0 + - aws-c-sdkutils >=0.2.4,<0.2.5.0a0 + - aws-c-io >=0.23.3,<0.23.4.0a0 + - aws-c-auth >=0.9.3,<0.9.4.0a0 + - aws-c-http >=0.10.7,<0.10.8.0a0 + - aws-c-mqtt >=0.13.3,<0.13.4.0a0 + - aws-c-s3 >=0.11.3,<0.11.4.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 408804 + timestamp: 1765200263609 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.606-h20b40b1_10.conda + sha256: e0d81b7dd6d054d457a1c54d17733d430d96dc5ca9b2ca69a72eb41c3fc8c9bf + md5: 937d1d4c233adc6eeb2ac3d6e9a73e53 + depends: + - libstdcxx >=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - libcurl >=8.17.0,<9.0a0 + - aws-c-common >=0.12.6,<0.12.7.0a0 + - aws-crt-cpp >=0.35.4,<0.35.5.0a0 + - libzlib >=1.3.1,<2.0a0 + - aws-c-event-stream >=0.5.7,<0.5.8.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 3472674 + timestamp: 1765257107074 +- conda: https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.16.2-h206d751_0.conda + sha256: 321d1070905e467b6bc6f5067b97c1868d7345c272add82b82e08a0224e326f0 + md5: 5492abf806c45298ae642831c670bba0 + depends: + - __glibc >=2.17,<3.0.a0 + - libcurl 
>=8.18.0,<9.0a0 + - libgcc >=14 + - libstdcxx >=14 + - openssl >=3.5.4,<4.0a0 + license: MIT + license_family: MIT + purls: [] + size: 348729 + timestamp: 1768837519361 +- conda: https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.13.3-hed0cdb0_1.conda + sha256: 2beb6ae8406f946b8963a67e72fe74453e1411c5ae7e992978340de6c512d13c + md5: 68bfb556bdf56d56e9f38da696e752ca + depends: + - __glibc >=2.17,<3.0.a0 + - azure-core-cpp >=1.16.2,<1.16.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + - openssl >=3.5.5,<4.0a0 + license: MIT + license_family: MIT + purls: [] + size: 250511 + timestamp: 1770344967948 +- conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.16.0-hdd73cc9_1.conda + sha256: cef75b91bdd5a65c560b501df78905437cc2090a64b4c5ecd7da9e08e9e9af7c + md5: 939d9ce324e51961c7c4c0046733dbb7 + depends: + - __glibc >=2.17,<3.0.a0 + - azure-core-cpp >=1.16.2,<1.16.3.0a0 + - azure-storage-common-cpp >=12.12.0,<12.12.1.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 579825 + timestamp: 1770321459546 +- conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.12.0-ha7a2c86_1.conda + sha256: ef7d1cae36910b21385d0816f8524a84dee1513e0306927e41a6bd32b5b9a0d0 + md5: 6400f73fe5ebe19fe7aca3616f1f1de7 + depends: + - __glibc >=2.17,<3.0.a0 + - azure-core-cpp >=1.16.2,<1.16.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libxml2 + - libxml2-16 >=2.14.6 + - openssl >=3.5.5,<4.0a0 + license: MIT + license_family: MIT + purls: [] + size: 150405 + timestamp: 1770240307002 +- conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.14.0-h52c5a47_1.conda + sha256: 55aa8ad5217d358e0ccf4a715bd1f9bafef3cd1c2ea4021f0e916f174c20f8e3 + md5: 6d10339800840562b7dad7775f5d2c16 + depends: + - __glibc >=2.17,<3.0.a0 + - azure-core-cpp >=1.16.2,<1.16.3.0a0 + - azure-storage-blobs-cpp >=12.16.0,<12.16.1.0a0 + - azure-storage-common-cpp >=12.12.0,<12.12.1.0a0 + - 
libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + purls: [] + size: 302524 + timestamp: 1770384269834 - conda: https://conda.anaconda.org/conda-forge/linux-64/backports.zstd-1.3.0-py313h18e8e13_0.conda sha256: 9552afbec37c4d8d0e83a5c4c6b3c7f4b8785f935094ce3881e0a249045909ce md5: d9e90792551a527200637e23a915dd79 @@ -435,6 +826,17 @@ packages: purls: [] size: 260341 timestamp: 1757437258798 +- conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.6-hb03c661_0.conda + sha256: cc9accf72fa028d31c2a038460787751127317dcfa991f8d1f1babf216bb454e + md5: 920bb03579f15389b9e512095ad995b7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 207882 + timestamp: 1765214722852 - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.1.4-hbd8a1cb_0.conda sha256: b5974ec9b50e3c514a382335efa81ed02b05906849827a34061c496f4defa0b2 md5: bddacf101bb4dd0e51811cb69c7790e2 @@ -693,6 +1095,30 @@ packages: - pre-commit ; extra == 'dev' - ruff ; extra == 'dev' requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda + sha256: 6c33bf0c4d8f418546ba9c250db4e4221040936aef8956353bc764d4877bc39a + md5: d411fc29e338efb48c5fd4576d71d881 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 119654 + timestamp: 1726600001928 +- conda: https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda + sha256: dc824dc1d0aa358e28da2ecbbb9f03d932d976c8dca11214aa1dcdfcbd054ba2 + md5: ff862eebdfeb2fd048ae9dc92510baca + depends: + - gflags >=2.2.2,<2.3.0a0 + - libgcc-ng >=12 + - libstdcxx-ng >=12 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 143452 + timestamp: 1718284177264 - pypi: https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl name: 
graphviz version: '0.21' @@ -961,6 +1387,32 @@ packages: version: 1.5.3 sha256: 5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713 requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda + sha256: 0960d06048a7185d3542d850986d807c6e37ca2e644342dd0c72feefcf26c2a4 + md5: b38117a3c920364aff79f870c984b4a3 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + purls: [] + size: 134088 + timestamp: 1754905959823 +- conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.22.2-ha1258a1_0.conda + sha256: 3e307628ca3527448dd1cb14ad7bb9d04d1d28c7d4c5f97ba196ae984571dd25 + md5: fb53fb07ce46a575c5d004bbc96032c2 + depends: + - __glibc >=2.17,<3.0.a0 + - keyutils >=1.6.3,<2.0a0 + - libedit >=3.1.20250104,<3.2.0a0 + - libedit >=3.1.20250104,<4.0a0 + - libgcc >=14 + - libstdcxx >=14 + - openssl >=3.5.5,<4.0a0 + license: MIT + license_family: MIT + purls: [] + size: 1386730 + timestamp: 1769769569681 - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_101.conda sha256: 565941ac1f8b0d2f2e8f02827cbca648f4d18cd461afc31f15604cd291b5c5f3 md5: 12bd9a3f089ee6c9266a37dab82afabd @@ -974,6 +1426,127 @@ packages: purls: [] size: 725507 timestamp: 1770267139900 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20260107.1-cxx17_h7b12aa8_0.conda + sha256: a7a4481a4d217a3eadea0ec489826a69070fcc3153f00443aa491ed21527d239 + md5: 6f7b4302263347698fd24565fbf11310 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - libabseil-static =20260107.1=cxx17* + - abseil-cpp =20260107.1 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 1384817 + timestamp: 1770863194876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-23.0.0-h2603568_3_cpu.conda + build_number: 3 + sha256: 249572775ce68f418392b2e4fd08a6adcd1c1c75bf4c870145a96d61f71d08ff + md5: 
4952208743759431df21f01aba7466dd + depends: + - __glibc >=2.17,<3.0.a0 + - aws-crt-cpp >=0.35.4,<0.35.5.0a0 + - aws-sdk-cpp >=1.11.606,<1.11.607.0a0 + - azure-core-cpp >=1.16.2,<1.16.3.0a0 + - azure-identity-cpp >=1.13.3,<1.13.4.0a0 + - azure-storage-blobs-cpp >=12.16.0,<12.16.1.0a0 + - azure-storage-files-datalake-cpp >=12.14.0,<12.14.1.0a0 + - bzip2 >=1.0.8,<2.0a0 + - glog >=0.7.1,<0.8.0a0 + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libbrotlidec >=1.2.0,<1.3.0a0 + - libbrotlienc >=1.2.0,<1.3.0a0 + - libgcc >=14 + - libgoogle-cloud >=2.39.0,<2.40.0a0 + - libgoogle-cloud-storage >=2.39.0,<2.40.0a0 + - libopentelemetry-cpp >=1.21.0,<1.22.0a0 + - libprotobuf >=6.33.5,<6.33.6.0a0 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - lz4-c >=1.10.0,<1.11.0a0 + - orc >=2.2.2,<2.2.3.0a0 + - snappy >=1.2.2,<1.3.0a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - parquet-cpp <0.0a0 + - apache-arrow-proc =*=cpu + - arrow-cpp <0.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 6482745 + timestamp: 1770642318900 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-23.0.0-h635bf11_3_cpu.conda + build_number: 3 + sha256: 85104db18ecf79a5f2498434843fdd525fe77befe5cdb0a26950f542afe2f850 + md5: c2415c2264b6b5e4ef45019ce6aa9579 + depends: + - __glibc >=2.17,<3.0.a0 + - libarrow 23.0.0 h2603568_3_cpu + - libarrow-compute 23.0.0 h53684a4_3_cpu + - libgcc >=14 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 612674 + timestamp: 1770642525144 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-23.0.0-h53684a4_3_cpu.conda + build_number: 3 + sha256: c3d47ea6e732c178d0d276b9e14578fbc4ec519baf9b47af1a4f7c9184787cd5 + md5: 8ffa55113b6ade32fe4a51d480f0b806 + depends: + - __glibc >=2.17,<3.0.a0 + - libarrow 23.0.0 h2603568_3_cpu + - libgcc >=14 + - libre2-11 >=2025.11.5 + - libstdcxx >=14 + - libutf8proc >=2.11.3,<2.12.0a0 + - re2 + license: Apache-2.0 + license_family: 
APACHE + purls: [] + size: 3007250 + timestamp: 1770642389976 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-23.0.0-h635bf11_3_cpu.conda + build_number: 3 + sha256: fb0de4d207633cdb9e1cb80c67b292eef04dde3d81c61741c825be2a6510ea1e + md5: 22beeb3b36026e14f509a8b62ca58f1a + depends: + - __glibc >=2.17,<3.0.a0 + - libarrow 23.0.0 h2603568_3_cpu + - libarrow-acero 23.0.0 h635bf11_3_cpu + - libarrow-compute 23.0.0 h53684a4_3_cpu + - libgcc >=14 + - libparquet 23.0.0 h7376487_3_cpu + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 611552 + timestamp: 1770642619988 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-23.0.0-hb4dd7c2_3_cpu.conda + build_number: 3 + sha256: d91e8f99b17dcc1d9f387d5119163a34f7486daaac39f9e766c0890be8ad0826 + md5: c582146e900636a8db83955cc15eadd5 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libarrow 23.0.0 h2603568_3_cpu + - libarrow-acero 23.0.0 h635bf11_3_cpu + - libarrow-dataset 23.0.0 h635bf11_3_cpu + - libgcc >=14 + - libprotobuf >=6.33.5,<6.33.6.0a0 + - libstdcxx >=14 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 522978 + timestamp: 1770642651554 - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda build_number: 5 sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c @@ -992,6 +1565,41 @@ packages: purls: [] size: 18213 timestamp: 1765818813880 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.2.0-hb03c661_1.conda + sha256: 318f36bd49ca8ad85e6478bd8506c88d82454cc008c1ac1c6bf00a3c42fa610e + md5: 72c8fd1af66bd67bf580645b426513ed + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 79965 + timestamp: 1764017188531 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.2.0-hb03c661_1.conda + sha256: 
12fff21d38f98bc446d82baa890e01fd82e3b750378fedc720ff93522ffb752b + md5: 366b40a69f0ad6072561c1d09301c886 + depends: + - __glibc >=2.17,<3.0.a0 + - libbrotlicommon 1.2.0 hb03c661_1 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 34632 + timestamp: 1764017199083 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.2.0-hb03c661_1.conda + sha256: a0c15c79997820bbd3fbc8ecf146f4fe0eca36cc60b62b63ac6cf78857f1dd0d + md5: 4ffbb341c8b616aa2494b6afb26a0c5f + depends: + - __glibc >=2.17,<3.0.a0 + - libbrotlicommon 1.2.0 hb03c661_1 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 298378 + timestamp: 1764017210931 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda build_number: 5 sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 @@ -1007,6 +1615,68 @@ packages: purls: [] size: 18194 timestamp: 1765818837135 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 + sha256: fd1d153962764433fe6233f34a72cdeed5dcf8a883a85769e8295ce940b5b0c5 + md5: c965a5aa0d5c1c37ffc62dff36e28400 + depends: + - libgcc-ng >=9.4.0 + - libstdcxx-ng >=9.4.0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 20440 + timestamp: 1633683576494 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.18.0-hcf29cc6_1.conda + sha256: c84e8dccb65ad5149c0121e4b54bdc47fa39303fd5f4979b8c44bb51b39a369b + md5: 1707cdd636af2ff697b53186572c9f77 + depends: + - __glibc >=2.17,<3.0.a0 + - krb5 >=1.22.2,<1.23.0a0 + - libgcc >=14 + - libnghttp2 >=1.67.0,<2.0a0 + - libssh2 >=1.11.1,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.5,<4.0a0 + - zstd >=1.5.7,<1.6.0a0 + license: curl + license_family: MIT + purls: [] + size: 463621 + timestamp: 1770892808818 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + sha256: 
d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 + md5: c277e0a4d549b03ac1e9d6cbbe3d017b + depends: + - ncurses + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - ncurses >=6.5,<7.0a0 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 134676 + timestamp: 1738479519902 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + sha256: 1cd6048169fa0395af74ed5d8f1716e22c19a81a8a36f934c110ca3ad4dd27b4 + md5: 172bf1cd1ff8629f2b1179945ed45055 + depends: + - libgcc-ng >=12 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 112766 + timestamp: 1702146165126 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda + sha256: 2e14399d81fb348e9d231a82ca4d816bf855206923759b69ad006ba482764131 + md5: a1cfcc585f0c42bf8d5546bb1dfb668d + depends: + - libgcc-ng >=12 + - openssl >=3.1.1,<4.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 427426 + timestamp: 1685725977222 - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.3-hecca717_0.conda sha256: 1e1b08f6211629cbc2efe7a5bca5953f8f6b3cae0eeb04ca4dacee1bd4e2db2f md5: 8b09ae86839581147ef2e5c5e229d164 @@ -1045,6 +1715,16 @@ packages: purls: [] size: 1040478 timestamp: 1770252533873 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_17.conda + sha256: bdfe50501e4a2d904a5eae65a7ae26e2b7a29b473ab084ad55d96080b966502e + md5: 1478bfa85224a65ab096d69ffd2af1e5 + depends: + - libgcc 15.2.0 he0feb66_17 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27541 + timestamp: 1770252546553 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_17.conda sha256: 1604c083dd65bc91e68b6cfe32c8610395088cb96af1acaf71f0dcaf83ac58f7 md5: a6c682ac611cb1fa4d73478f9e6efb06 @@ -1080,6 +1760,76 @@ packages: purls: [] size: 603334 timestamp: 1770252441199 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.39.0-h9d11ab5_1.conda + sha256: 44f8e354431d2336475465ec8d71df7f3dea1397e70df0718c2ac75137976c63 + md5: cd398eb8374fb626a710b7a35b7ffa98 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libcurl >=8.18.0,<9.0a0 + - libgcc >=14 + - libgrpc >=1.78.0,<1.79.0a0 + - libprotobuf >=6.33.5,<6.33.6.0a0 + - libstdcxx >=14 + - openssl >=3.5.5,<4.0a0 + constrains: + - libgoogle-cloud 2.39.0 *_1 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 1307253 + timestamp: 1770461665848 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.39.0-hdbdcf42_1.conda + sha256: 2cce946ebf40b0b5fdb3e82c8a9f90ca28cd62abd281b20713067cc69a75c441 + md5: 384a1730ea66a72692e377cb45996d61 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil + - libcrc32c >=1.1.2,<1.2.0a0 + - libcurl + - libgcc >=14 + - libgoogle-cloud 2.39.0 h9d11ab5_1 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - openssl + license: Apache-2.0 + license_family: Apache + purls: [] + size: 803453 + timestamp: 1770461856392 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.78.0-h1d1128b_1.conda + sha256: f6861217d6c4bf96283738ba8d55782fccb577513a6cd346abc60cf88d1795df + md5: 66055700c90b50c0405a4e515bb4fe3c + depends: + - __glibc >=2.17,<3.0.a0 + - c-ares >=1.34.6,<2.0a0 + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libgcc >=14 + - libprotobuf >=6.33.5,<6.33.6.0a0 + - libre2-11 >=2025.11.5 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.5,<4.0a0 + - re2 + constrains: + - grpc-cpp =1.78.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 6992089 + timestamp: 1770260975908 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda + sha256: c467851a7312765447155e071752d7bf9bf44d610a5687e32706f480aad2833f + md5: 915f5995e94f60e9a4826e0b0920ee88 + depends: + - __glibc 
>=2.17,<3.0.a0 + - libgcc >=14 + license: LGPL-2.1-only + purls: [] + size: 790176 + timestamp: 1754908768807 - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda build_number: 5 sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 @@ -1118,6 +1868,23 @@ packages: purls: [] size: 92400 timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda + sha256: a4a7dab8db4dc81c736e9a9b42bdfd97b087816e029e221380511960ac46c690 + md5: b499ce4b026493a13774bcf0f4c33849 + depends: + - __glibc >=2.17,<3.0.a0 + - c-ares >=1.34.5,<2.0a0 + - libev >=4.33,<4.34.0a0 + - libev >=4.33,<5.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.2,<4.0a0 + license: MIT + license_family: MIT + purls: [] + size: 666600 + timestamp: 1756834976695 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 md5: be43915efc66345cccb3c310b6ed0374 @@ -1133,6 +1900,81 @@ packages: purls: [] size: 5927939 timestamp: 1763114673331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-h9692893_2.conda + sha256: 59663bdd97ac6d8ce8a83bf80e18c14c4ac5ca536ef1a2de4bc9080a45dc501a + md5: c3de1cc30bc11edbc98aed352381449d + depends: + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libcurl >=8.18.0,<9.0a0 + - libgrpc >=1.78.0,<1.79.0a0 + - libopentelemetry-cpp-headers 1.21.0 ha770c72_2 + - libprotobuf >=6.33.5,<6.33.6.0a0 + - libzlib >=1.3.1,<2.0a0 + - nlohmann_json + - prometheus-cpp >=1.3.0,<1.4.0a0 + constrains: + - cpp-opentelemetry-sdk =1.21.0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 896630 + timestamp: 1770452315175 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_2.conda + sha256: 
b2b2122f214c417851ba280009aea040e546665c43de737690c2610055a255e3 + md5: 253e70376a8ae74f9d99d44712b3e087 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 362214 + timestamp: 1770452273268 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-23.0.0-h7376487_3_cpu.conda + build_number: 3 + sha256: 8f9f1885cbfb20de14c18d55cd69c8076e003f845658ad17a967eb28f8fb9bf1 + md5: e3eef5f398cccdd73d3ff2e3c8ec0793 + depends: + - __glibc >=2.17,<3.0.a0 + - libarrow 23.0.0 h2603568_3_cpu + - libgcc >=14 + - libstdcxx >=14 + - libthrift >=0.22.0,<0.22.1.0a0 + - openssl >=3.5.5,<4.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 1392223 + timestamp: 1770642492655 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-6.33.5-h2b00c02_0.conda + sha256: afbf195443269ae10a940372c1d37cda749355d2bd96ef9587a962abd87f2429 + md5: 11ac478fa72cf12c214199b8a96523f4 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 3638698 + timestamp: 1769749419271 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.11.05-h0dc7533_1.conda + sha256: 138fc85321a8c0731c1715688b38e2be4fb71db349c9ab25f685315095ae70ff + md5: ced7f10b6cfb4389385556f47c0ad949 + depends: + - __glibc >=2.17,<3.0.a0 + - libabseil * cxx17* + - libabseil >=20260107.0,<20260108.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - re2 2025.11.05.* + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 213122 + timestamp: 1768190028309 - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.51.2-hf4e2dac_0.conda sha256: 04596fcee262a870e4b7c9807224680ff48d4d0cc0dac076a602503d3dc6d217 md5: da5be73701eecd0e8454423fd6ffcf30 @@ -1145,6 +1987,19 @@ packages: purls: [] size: 942808 timestamp: 1768147973361 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + sha256: fa39bfd69228a13e553bd24601332b7cfeb30ca11a3ca50bb028108fe90a7661 + md5: eecce068c7e4eddeb169591baac20ac4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.0,<4.0a0 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 304790 + timestamp: 1745608545575 - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_17.conda sha256: 50c48cd3716a2e58e8e2e02edc78fef2d08fffe1e3b1ed40eb5f87e7e2d07889 md5: 24c2fe35fa45cd71214beba6f337c071 @@ -1158,6 +2013,42 @@ packages: purls: [] size: 5852406 timestamp: 1770252584235 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.2.0-hdf11a46_17.conda + sha256: ca3fb322dab3373946b1064da686ec076f5b1b9caf0a2823dad00d0b0f704928 + md5: ea12f5a6bf12c88c06750d9803e1a570 + depends: + - libstdcxx 15.2.0 h934c35e_17 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 27573 + timestamp: 1770252638797 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.22.0-h454ac66_1.conda + sha256: 4888b9ea2593c36ca587a5ebe38d0a56a0e6d6a9e4bb7da7d9a326aaaca7c336 + md5: 8ed82d90e6b1686f5e98f8b7825a15ef + depends: + - __glibc >=2.17,<3.0.a0 + - libevent >=2.1.12,<2.1.13.0a0 + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.1,<4.0a0 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 424208 + timestamp: 1753277183984 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.11.3-hfe17d71_0.conda + sha256: ecbf4b7520296ed580498dc66a72508b8a79da5126e1d6dc650a7087171288f9 + md5: 1247168fe4a0b8912e3336bccdbf98a5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + purls: [] + size: 85969 + timestamp: 1768735071295 - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda sha256: 
1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee md5: db409b7c1720428638e7c0d509d3e1b5 @@ -1169,6 +2060,39 @@ packages: purls: [] size: 40311 timestamp: 1766271528534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.1-he237659_1.conda + sha256: 047be059033c394bd32ae5de66ce389824352120b3a7c0eff980195f7ed80357 + md5: 417955234eccd8f252b86a265ccdab7f + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.1,<79.0a0 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.1,<6.0a0 + - libxml2-16 2.15.1 hca6bf5a_1 + - libzlib >=1.3.1,<2.0a0 + license: MIT + license_family: MIT + purls: [] + size: 45402 + timestamp: 1766327161688 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.1-hca6bf5a_1.conda + sha256: 8331284bf9ae641b70cdc0e5866502dd80055fc3b9350979c74bb1d192e8e09e + md5: 3fdd8d99683da9fe279c2f4cecd1e048 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.1,<79.0a0 + - libgcc >=14 + - libiconv >=1.18,<2.0a0 + - liblzma >=5.8.1,<6.0a0 + - libzlib >=1.3.1,<2.0a0 + constrains: + - libxml2 2.15.1 + license: MIT + license_family: MIT + purls: [] + size: 555747 + timestamp: 1766327145986 - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 md5: edb0dca6bc32e4f4789199455a1dbeb8 @@ -1193,6 +2117,18 @@ packages: - pkg:pypi/locket?source=hash-mapping size: 8250 timestamp: 1650660473123 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda + sha256: 47326f811392a5fd3055f0f773036c392d26fdb32e4d8e7a8197eed951489346 + md5: 9de5350a85c4a20c685259b889aa6393 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 167055 + timestamp: 1733741040117 - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py313h3dea7bd_0.conda sha256: 
a530a411bdaaf0b1e4de8869dfaca46cb07407bc7dc0702a9e231b0e5ce7ca85 md5: c14389156310b8ed3520d84f854be1ee @@ -1267,6 +2203,16 @@ packages: purls: [] size: 891641 timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h54a6638_1.conda + sha256: fd2cbd8dfc006c72f45843672664a8e4b99b2f8137654eaae8c3d46dca776f63 + md5: 16c2a0e9c4a166e53632cfca4f68d020 + constrains: + - nlohmann_json-abi ==3.12.0 + license: MIT + license_family: MIT + purls: [] + size: 136216 + timestamp: 1758194284857 - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.2-py313hf6604e3_1.conda sha256: 2eb8be25a7504f058a153a84be70471e0ebbf6bd0411ae2b6d34904b89d86fe3 md5: ca9c6ba4beac38cb3d0a85afde27f94c @@ -1308,6 +2254,24 @@ packages: purls: [] size: 3164551 timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/linux-64/orc-2.2.2-hbb90d81_1.conda + sha256: c59d22c4e555c09259c52da96f1576797fcb4fba5665073e9c1907393309172d + md5: 9269175175f18091b8844c8e9f213205 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libprotobuf >=6.33.5,<6.33.6.0a0 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - lz4-c >=1.10.0,<1.11.0a0 + - snappy >=1.2.2,<1.3.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 1319627 + timestamp: 1770452421607 - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 md5: b76541e68fea4d511b1ac46a28dcd2c6 @@ -1474,6 +2438,21 @@ packages: - pkg:pypi/pluggy?source=compressed-mapping size: 25877 timestamp: 1764896838868 +- conda: https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda + sha256: 013669433eb447548f21c3c6b16b2ed64356f726b5f77c1b39d5ba17a8a4b8bc + md5: a83f6a2fdc079e643237887a37460668 + depends: + - __glibc >=2.17,<3.0.a0 + - libcurl >=8.10.1,<9.0a0 + - libgcc >=13 + - libstdcxx >=13 + - libzlib 
>=1.3.1,<2.0a0 + - zlib + license: MIT + license_family: MIT + purls: [] + size: 199544 + timestamp: 1730769112346 - pypi: https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl name: prompt-toolkit version: 3.0.52 @@ -1540,10 +2519,47 @@ packages: - pkg:pypi/pure-eval?source=hash-mapping size: 16668 timestamp: 1733569518868 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-23.0.0-py313h78bf25f_0.conda + sha256: 43636b4ce58c57f3aeab182238b47cb8b860d2cc0544c184612c15ee294be154 + md5: a6e89cb214f318db9548b791ba27f862 + depends: + - libarrow-acero 23.0.0.* + - libarrow-dataset 23.0.0.* + - libarrow-substrait 23.0.0.* + - libparquet 23.0.0.* + - pyarrow-core 23.0.0 *_0_* + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + license: Apache-2.0 + license_family: APACHE + purls: [] + size: 27332 + timestamp: 1769291558903 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-23.0.0-py313h98bfbea_0_cpu.conda + sha256: 30247f262175f7408c7856735c529a9402356f85b8f99cc54c86bbcd7600a2c0 + md5: c8d1ba76789588fdf7fddc213a25137e + depends: + - __glibc >=2.17,<3.0.a0 + - libarrow 23.0.0.* *cpu + - libarrow-compute 23.0.0.* *cpu + - libgcc >=14 + - libstdcxx >=14 + - libzlib >=1.3.1,<2.0a0 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + constrains: + - apache-arrow-proc * cpu + - numpy >=1.23,<3 + license: Apache-2.0 + license_family: APACHE + purls: + - pkg:pypi/pyarrow?source=hash-mapping + size: 4776275 + timestamp: 1770672664641 - pypi: ./ name: pyearthtools-bundled-persistence version: 0.6.0 - sha256: 88d20e73ba2c4cbde71d4a3e3381e5dd361a52f021fe36e79d09d3f9ed1b4cf4 + sha256: dbb05e58245981cba55647de8b43a09454295a58a5faf8299d4d98ba3f3c6ae3 requires_dist: - pyearthtools-zoo>=0.5.0 - pyearthtools-data>=0.5.0 @@ -1841,6 +2857,16 @@ packages: - pkg:pypi/pyyaml?source=compressed-mapping size: 201616 timestamp: 1770223543730 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/re2-2025.11.05-h5301d42_1.conda + sha256: 3fc684b81631348540e9a42f6768b871dfeab532d3f47d5c341f1f83e2a2b2b2 + md5: 66a715bc01c77d43aca1f9fcb13dde3c + depends: + - libre2-11 2025.11.05 h0dc7533_1 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 27469 + timestamp: 1768190052132 - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 md5: d7d95fc8287ea7bf33e0e7116d2b95ec @@ -1881,6 +2907,18 @@ packages: - pkg:pypi/ruff?source=compressed-mapping size: 9103793 timestamp: 1770153712370 +- conda: https://conda.anaconda.org/conda-forge/linux-64/s2n-1.6.2-he8a4886_1.conda + sha256: dec76e9faa3173579d34d226dbc91892417a80784911daf8e3f0eb9bad19d7a6 + md5: bade189a194e66b93c03021bd36c337b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - openssl >=3.5.4,<4.0a0 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 394197 + timestamp: 1765160261434 - pypi: https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl name: scikit-learn version: 1.8.0 @@ -2011,6 +3049,19 @@ packages: - pkg:pypi/six?source=hash-mapping size: 18455 timestamp: 1753199211006 +- conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.2-h03e3b7b_1.conda + sha256: 48f3f6a76c34b2cfe80de9ce7f2283ecb55d5ed47367ba91e8bb8104e12b8f11 + md5: 98b6c9dc80eb87b2519b97bcf7e578dd + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - libstdcxx >=14 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 45829 + timestamp: 1762948049098 - conda: https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda sha256: d1e3e06b5cf26093047e63c8cc77b70d970411c5cbc0cb1fad461a8a8df599f7 md5: 0401a17ae845fa72c7210e206ec5647d @@ -2293,6 +3344,18 @@ packages: 
- pkg:pypi/zipp?source=hash-mapping size: 24194 timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda + sha256: 5d7c0e5f0005f74112a34a7425179f4eb6e73c92f5d109e6af4ddeca407c92ab + md5: c9f075ab2f33b3bbee9e62d4ad0a6cd8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib 1.3.1 hb9d3cd8_2 + license: Zlib + license_family: Other + purls: [] + size: 92286 + timestamp: 1727963153079 - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 diff --git a/packages/bundled_models/persistence/pyproject.toml b/packages/bundled_models/persistence/pyproject.toml index 15b2f5d9..69009d60 100644 --- a/packages/bundled_models/persistence/pyproject.toml +++ b/packages/bundled_models/persistence/pyproject.toml @@ -85,6 +85,10 @@ ipython = ">=9.10.0,<10" [tool.pixi.feature.dask.dependencies] dask-core = "*" distributed = "*" +pyarrow = ">=23.0.0,<24" + +[tool.pixi.feature.dev.dependencies] +pyarrow = "*" [tool.pixi.environments] dask = ["dask"] diff --git a/packages/bundled_models/persistence/src/persistence/_daskconfig.py b/packages/bundled_models/persistence/src/persistence/_daskconfig.py index 0375f29b..171f522f 100644 --- a/packages/bundled_models/persistence/src/persistence/_daskconfig.py +++ b/packages/bundled_models/persistence/src/persistence/_daskconfig.py @@ -22,29 +22,11 @@ def do_stuff(...): """ try: import dask - - # store state - note: scheduler config is not guarenteed to exist by default - flag_nonexistant_scheduler_config = False - state_scheduler_type = None - try: - state_scheduler_type = dask.config.get("scheduler") - except KeyError: - flag_nonexistant_scheduler_config = True - + import dask.config + import dask.distributed # set state to desired config - dask.config.set(scheduler=_STR_DASK_SYNC_SCHEDULER) - - # release scope to caller context - yield - - # 
retrieve current stack after context execution - if flag_nonexistant_scheduler_config: - # scheduler state did not exist so delete it, note: "Not exist" is different from "None" - del dask.config['scheduler'] - else: - # otherwise revert it to normal - dask.config.set(scheduler=state_scheduler_type) - + with dask.config.set(scheduler=_STR_DASK_SYNC_SCHEDULER): + yield except ImportError: yield diff --git a/packages/bundled_models/persistence/tests/test__daskconfig.py b/packages/bundled_models/persistence/tests/test__daskconfig.py new file mode 100644 index 00000000..0fffe574 --- /dev/null +++ b/packages/bundled_models/persistence/tests/test__daskconfig.py @@ -0,0 +1,120 @@ +""" +Tests that dask is actually in synchronous/signle-threaded mode +""" +from dataclasses import dataclass +import numpy as np +import persistence as pet_persist +import persistence._daskconfig as pet_daskconfig + +@dataclass +class _PyTestThreadInfo(): + id_thread_kern: int # usually same as process id + id_thread_py: int # python read id + id_process: int # process id for current worker + num_cpus: int # number of cpus + + +def _fn_dask_get_thread_info(count): + return _make_thread_info() + +def _cmp_thread_info(thread_info_a: _PyTestThreadInfo, thread_info_b: _PyTestThreadInfo) -> int: + """ + Works like strcmp, thread info is the same => return 0, otherwise they are different. + """ + # Each critera will return 0 if they are equal or 1 if they are not. A larger number implies + # that there is larger discrepency. + # NOTE: cpu checks is not strictly required, but helpful to know, since it is not an expected + # scenario unless running multi-node. 
+ count_diff = ( + int(thread_info_a.id_thread_kern != thread_info_b.id_thread_kern) + + int(thread_info_a.id_thread_py != thread_info_b.id_thread_py) + + int(thread_info_a.id_process != thread_info_b.id_process) + + int(thread_info_a.num_cpus != thread_info_b.num_cpus) + ) + return count_diff + +def _is_multithreaded_compute(list_thread_info) -> bool: + """ + Returns true if the list of thread_info have different threads or processes. + """ + ref_thread_info = list_thread_info[0] + flag_has_different_threads = False + for i, v in enumerate(list_thread_info): + # ignore reference (i == 0) and update flag if a difference is spotted + if i != 0 and _cmp_thread_info(v, ref_thread_info) != 0: + flag_has_different_threads = True + break + return flag_has_different_threads + +def _make_thread_info(): + """ + Creates the current thread info for the given context. This shouldn't be a fixture, it needs to + be called internally by a worker in the test. + """ + import threading + import os + obj_thread_py: threading.Thread = threading.current_thread() + return _PyTestThreadInfo( + id_thread_kern=obj_thread_py.native_id, + id_thread_py=obj_thread_py.ident, + id_process=os.getpid(), + num_cpus=os.cpu_count(), + ) + +def test_dask_single_threaded(): + """ + Set single threaded mode and check that the thread ids are the same for each worker. + """ + import dask + import dask.dataframe as _dd + import dask.array as _da + main_thread_info: _PyTestThreadInfo = _make_thread_info() + + # we still set multiprocess here to check if our context manager is working as expected. 
+ dask.config.config["scheduler"] = "processes" + dask.config.refresh() + + # partition task of processing 100 items by number of ccpus + _chunks = min(main_thread_info.num_cpus, 100), + _dask_df = _dd.io.from_dask_array( + _da.from_array(np.arange(100), chunks=_chunks), + columns=["x"], + ) + + # run computation in context manager + with pet_daskconfig._set_synchronous_dask(): + results = _dask_df.apply(_fn_dask_get_thread_info, axis=1, meta=(None, 'object')).compute() + assert not _is_multithreaded_compute(results) + +def test_dask_default_multithreaded(): + """ + Tests dask without singlethreaded context management. + """ + # NOTE: this namespacing does not guarentee dask is out of scope in other tests + import dask + import dask.config + import dask.distributed + import dask.dataframe as _dd + import dask.array as _da + + # intentionally set to multiprocess mode (which is usually the case with e.g. xarray) + + main_thread_info: _PyTestThreadInfo = _make_thread_info() + dask.config.config["scheduler"] = "processes" + dask.config.refresh() + + # partition task of processing 100 items by number of ccpus + _chunks = min(main_thread_info.num_cpus, 100), + _dask_df = _dd.io.from_dask_array( + _da.from_array(np.arange(100), chunks=_chunks), + columns=["x"], + ) + # get results + results = _dask_df.apply(_fn_dask_get_thread_info, axis=1, meta=(None, 'object')).compute() + + # --- check if there are sufficient threads on system + if len(results) <= 1: + print("Insufficient cores/threads to do multi-process tests") + return + + assert _is_multithreaded_compute(results) diff --git a/packages/bundled_models/persistence/tests/test__interface.py b/packages/bundled_models/persistence/tests/test__interface.py index 9641c215..a7ae965e 100644 --- a/packages/bundled_models/persistence/tests/test__interface.py +++ b/packages/bundled_models/persistence/tests/test__interface.py @@ -1,11 +1,5 @@ """ -TODO: - - 1. Test that data chunking works as expected, given a time index - 2. 
Test that data retrieval (lookback) works as expected given sparsity multiplier - 3. (mock) Test median of 3 is activated as expected. (call - not computation) - 4. (mock) Test latest is activated as expected - 5. test that appropriate errors are thrown for various invalid inputs +Basic suite of tests that make sure that the interface objects work as expected. """ import numpy as np From 190f73ce98af134c6fb826d5d459c1452471041a Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 16 Feb 2026 15:45:46 +1100 Subject: [PATCH 04/28] report.xml: remove accidental commit --- packages/bundled_models/persistence/report.xml | 1 - 1 file changed, 1 deletion(-) delete mode 100644 packages/bundled_models/persistence/report.xml diff --git a/packages/bundled_models/persistence/report.xml b/packages/bundled_models/persistence/report.xml deleted file mode 100644 index 608b3e16..00000000 --- a/packages/bundled_models/persistence/report.xml +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file From 705bd19e30b5879da2e67f3f6e3c3598fd15e70e Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 16 Feb 2026 15:52:39 +1100 Subject: [PATCH 05/28] ruff: minor formatting --- .../src/persistence/_daskconfig.py | 13 ++- .../persistence/src/persistence/_interface.py | 89 ++++++++++++------- .../persistence/tests/test__daskconfig.py | 33 +++++-- .../persistence/tests/test__interface.py | 16 ++-- 4 files changed, 102 insertions(+), 49 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/_daskconfig.py b/packages/bundled_models/persistence/src/persistence/_daskconfig.py index 171f522f..ea6a751d 100644 --- a/packages/bundled_models/persistence/src/persistence/_daskconfig.py +++ b/packages/bundled_models/persistence/src/persistence/_daskconfig.py @@ -2,7 +2,8 @@ # default scheduler string to set "single-threaded" mode. 
-_STR_DASK_SYNC_SCHEDULER="synchronous" +_STR_DASK_SYNC_SCHEDULER = "synchronous" + @contextmanager def _set_synchronous_dask(): @@ -12,21 +13,29 @@ def _set_synchronous_dask(): This handles the case where dask is _not_ installed. In which case it does a pass-through. + IMPORTANT: never nest this context manager or call dask.config.reset() or attempt to update any + configs inside this context. Doing so may invalidate the "synchronous" setting. + Example: def do_stuff(...): # I can now (optionally) fork other processes here - without confusing dask. + # IMPORTANT: I shouldn't try to reintroduce parallelism using dask here ... with _set_synchronous_dask(): do_stuff(...) """ try: + # this import order is important for the "distributed" configs to be recognized import dask import dask.config + + # NOTE: if you don't have dask.distributed, this setting may not work as intended. + # so you will have to manually deal with it in the compute level. import dask.distributed + # set state to desired config with dask.config.set(scheduler=_STR_DASK_SYNC_SCHEDULER): yield except ImportError: yield - diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index 9f5f3b5a..31ffa3df 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -7,15 +7,17 @@ import xarray as xr - -_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 # 50% sparsity is reasonable, though some data like - # precipitation may be more sparse than this -_MAX_NUM_CHUNKS = 1000 # unlikely to have more than 1000 processes for persistence, due to - # diminishing returns - even on a supercomputer +_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = ( + 2 # 50% sparsity is reasonable, though some data like +) +# precipitation may be more sparse than this +_MAX_NUM_CHUNKS = ( + 1000 # unlikely to have more than 1000 processes for persistence, due to 
+) +# diminishing returns - even on a supercomputer _mod_index: Callable[[int, int], np.uint] = np.mod -_mod_index.__doc__ = ( -""" +_mod_index.__doc__ = """ Maps negative integer to a positive integer element in a ring. The ring has a domain of `[0, (cardinality - 1)]`. @@ -23,7 +25,7 @@ element, this will standardize it to len(.) - 1. The reason for doing this is to make sure index comparisons are accurately represented. """ -) + class PersistenceMethod(Enum): """ @@ -46,6 +48,7 @@ class PersistenceMethod(Enum): Most methods will fallback to MOST_RECENT if they fail. If MOST_RECENT fails, the result will be nan, essentially marking the datapoint as "void" for comparisons. """ + MOST_RECENT = 0 MEDIAN_OF_THREE = 1 @@ -56,11 +59,15 @@ def num_time_indices_required(self): case PersistenceMethod.MEDIAN_OF_THREE: return 3 case _: - raise NotImplementedError("PersistenceMethod: Invalid persistence method.") + raise NotImplementedError( + "PersistenceMethod: Invalid persistence method." + ) - def min_lookback(self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER): + def min_lookback( + self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER + ): """ - The minimum amount of lookback required to compute the corresponding metric. + The minimum amount of lookback required to compute the corresponding metric. By default we assume a 50% sparsity and require at least double the number of values required for the compuation. """ @@ -70,7 +77,6 @@ def min_lookback(self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPL return self.num_time_indices_required() * sparsity_multiplier - @dataclass class PersistenceDataChunk: """ @@ -82,11 +88,15 @@ class PersistenceDataChunk: IMPORTANT: data should not be chunked over time. """ - arr_chunk: np.ndarray # ndarray including time axis. Sorted ascending in time. The latest - # data point is assumed to be the "reference" time. + + arr_chunk: ( + np.ndarray + ) # ndarray including time axis. 
Sorted ascending in time. The latest + # data point is assumed to be the "reference" time. idx_time: int # the time axis - this will be flattened method: PersistenceMethod # which method to use to calculate persistence + @dataclass class PersistenceChunker: """ @@ -110,20 +120,27 @@ class PersistenceChunker: FUTUREWORK: usage of rust and/or parquet for dataloading and intermediate caching will be explored in order to speed up this process. """ - da_lazy: xr.DataArray # lazy loaded data array - num_chunks: int # number of chunks to use - method: PersistenceMethod # the persistence method to use (needed for lookback slicing) - idx_time: int = None # axis index for time - idx_chunk: int = None # axis index for chunk - dimname_time: str = None # the time dimension name normally "time" - dimname_chunk: str = None # the dimension name to chunk along, or default to a non-time - # dimension + + da_lazy: xr.DataArray # lazy loaded data array + num_chunks: int # number of chunks to use + method: ( + PersistenceMethod # the persistence method to use (needed for lookback slicing) + ) + idx_time: int = None # axis index for time + idx_chunk: int = None # axis index for chunk + dimname_time: str = None # the time dimension name normally "time" + dimname_chunk: str = ( + None # the dimension name to chunk along, or default to a non-time + ) + # dimension def __post_init__(self): # --- handle time dimension --- if self.idx_time is None: if self.dimname_time not in self.da_lazy.dims: - raise KeyError(f"PersistenceChunker: time dimension {self.dimname_time} not found in input array") + raise KeyError( + f"PersistenceChunker: time dimension {self.dimname_time} not found in input array" + ) self.idx_time = self.da_lazy.dims.index(self.dimname_time) # --- handle chunk dimension --- @@ -139,7 +156,9 @@ def __post_init__(self): else: # --- check and update chunk dimension --- if self.dimname_chunk not in self.da_lazy.dims: - raise KeyError(f"PersistenceChunker: chunk dimension 
{self.dimname_chunk} not found in input array") + raise KeyError( + f"PersistenceChunker: chunk dimension {self.dimname_chunk} not found in input array" + ) self.idx_chunk = self.da_lazy.dims.index(self.dimname_chunk) # --- check chunk/time index compatibilty --- @@ -148,13 +167,19 @@ def __post_init__(self): # --- check chunk size --- if self.num_chunks < 1: - raise ValueError("PersistenceChunker: number of chunks must be greater than or equal to 1") + raise ValueError( + "PersistenceChunker: number of chunks must be greater than or equal to 1" + ) if self.num_chunks > self.da_lazy.shape[self.idx_chunk]: - raise ValueError("PersistenceChunker: num_chunks must be less than the axis length") + raise ValueError( + "PersistenceChunker: num_chunks must be less than the axis length" + ) if self.num_chunks > _MAX_NUM_CHUNKS: - raise ValueError(f"PersistenceChunker: num_chunks is too large. Must be <{_MAX_NUM_CHUNKS}") + raise ValueError( + f"PersistenceChunker: num_chunks is too large. Must be <{_MAX_NUM_CHUNKS}" + ) # safety (tests only): check that indices are appropriately set assert self.idx_chunk is not None @@ -181,8 +206,10 @@ def generate_chunks(self): slice_time = self._get_time_slice(chunk_counter) # --- yield reference to array --- # still lazy at this point, until it is loaded and moved/copied into the forked process. 
- yield self.da_lazy.isel({ - self.dimname_time: slice_time, - self.dimname_chunk: slice_chunk, - }) + yield self.da_lazy.isel( + { + self.dimname_time: slice_time, + self.dimname_chunk: slice_chunk, + } + ) chunk_counter += 1 diff --git a/packages/bundled_models/persistence/tests/test__daskconfig.py b/packages/bundled_models/persistence/tests/test__daskconfig.py index 0fffe574..c1737cb1 100644 --- a/packages/bundled_models/persistence/tests/test__daskconfig.py +++ b/packages/bundled_models/persistence/tests/test__daskconfig.py @@ -1,23 +1,28 @@ """ Tests that dask is actually in synchronous/signle-threaded mode """ + from dataclasses import dataclass import numpy as np import persistence as pet_persist import persistence._daskconfig as pet_daskconfig + @dataclass -class _PyTestThreadInfo(): +class _PyTestThreadInfo: id_thread_kern: int # usually same as process id - id_thread_py: int # python read id - id_process: int # process id for current worker - num_cpus: int # number of cpus + id_thread_py: int # python read id + id_process: int # process id for current worker + num_cpus: int # number of cpus def _fn_dask_get_thread_info(count): return _make_thread_info() -def _cmp_thread_info(thread_info_a: _PyTestThreadInfo, thread_info_b: _PyTestThreadInfo) -> int: + +def _cmp_thread_info( + thread_info_a: _PyTestThreadInfo, thread_info_b: _PyTestThreadInfo +) -> int: """ Works like strcmp, thread info is the same => return 0, otherwise they are different. """ @@ -33,6 +38,7 @@ def _cmp_thread_info(thread_info_a: _PyTestThreadInfo, thread_info_b: _PyTestThr ) return count_diff + def _is_multithreaded_compute(list_thread_info) -> bool: """ Returns true if the list of thread_info have different threads or processes. @@ -46,6 +52,7 @@ def _is_multithreaded_compute(list_thread_info) -> bool: break return flag_has_different_threads + def _make_thread_info(): """ Creates the current thread info for the given context. 
This shouldn't be a fixture, it needs to @@ -53,6 +60,7 @@ def _make_thread_info(): """ import threading import os + obj_thread_py: threading.Thread = threading.current_thread() return _PyTestThreadInfo( id_thread_kern=obj_thread_py.native_id, @@ -61,6 +69,7 @@ def _make_thread_info(): num_cpus=os.cpu_count(), ) + def test_dask_single_threaded(): """ Set single threaded mode and check that the thread ids are the same for each worker. @@ -68,6 +77,7 @@ def test_dask_single_threaded(): import dask import dask.dataframe as _dd import dask.array as _da + main_thread_info: _PyTestThreadInfo = _make_thread_info() # we still set multiprocess here to check if our context manager is working as expected. @@ -75,7 +85,7 @@ def test_dask_single_threaded(): dask.config.refresh() # partition task of processing 100 items by number of ccpus - _chunks = min(main_thread_info.num_cpus, 100), + _chunks = (min(main_thread_info.num_cpus, 100),) _dask_df = _dd.io.from_dask_array( _da.from_array(np.arange(100), chunks=_chunks), columns=["x"], @@ -83,9 +93,12 @@ def test_dask_single_threaded(): # run computation in context manager with pet_daskconfig._set_synchronous_dask(): - results = _dask_df.apply(_fn_dask_get_thread_info, axis=1, meta=(None, 'object')).compute() + results = _dask_df.apply( + _fn_dask_get_thread_info, axis=1, meta=(None, "object") + ).compute() assert not _is_multithreaded_compute(results) + def test_dask_default_multithreaded(): """ Tests dask without singlethreaded context management. 
@@ -104,13 +117,15 @@ def test_dask_default_multithreaded(): dask.config.refresh() # partition task of processing 100 items by number of ccpus - _chunks = min(main_thread_info.num_cpus, 100), + _chunks = (min(main_thread_info.num_cpus, 100),) _dask_df = _dd.io.from_dask_array( _da.from_array(np.arange(100), chunks=_chunks), columns=["x"], ) # get results - results = _dask_df.apply(_fn_dask_get_thread_info, axis=1, meta=(None, 'object')).compute() + results = _dask_df.apply( + _fn_dask_get_thread_info, axis=1, meta=(None, "object") + ).compute() # --- check if there are sufficient threads on system if len(results) <= 1: diff --git a/packages/bundled_models/persistence/tests/test__interface.py b/packages/bundled_models/persistence/tests/test__interface.py index a7ae965e..8d58563b 100644 --- a/packages/bundled_models/persistence/tests/test__interface.py +++ b/packages/bundled_models/persistence/tests/test__interface.py @@ -9,7 +9,7 @@ def test_persistence_method_obj(): """ - Basic test to check object creation: PersistenceMethod + Basic test to check object creation: PersistenceMethod """ persistence_mostrecent = pet_persist.PersistenceMethod.MOST_RECENT persistence_median = pet_persist.PersistenceMethod.MEDIAN_OF_THREE @@ -24,8 +24,9 @@ def test_persistence_method_obj(): assert persistence_median.min_lookback() == 6 assert persistence_median.min_lookback(50) == 150 # 3 * 50 + def test_persistence_data_chunk_obj(): - arr_chunk = np.random.randint(0, 10, (2,5,8)) + arr_chunk = np.random.randint(0, 10, (2, 5, 8)) persistence_method = pet_persist.PersistenceMethod.MOST_RECENT idx_time: int = 1 # len = 5 datachunk = pet_persist.PersistenceDataChunk( @@ -36,18 +37,19 @@ def test_persistence_data_chunk_obj(): assert datachunk.arr_chunk.shape.index(5) == datachunk.idx_time assert datachunk.method.min_lookback() == 2 + def test_persistence_chunker_obj(): """ Basic test to check object creation: PersistenceChunker """ # --- index variant --- da = xr.DataArray( - 
np.random.randint(0, 10, (2,5,8)), + np.random.randint(0, 10, (2, 5, 8)), dims=["x0", "time", "x2"], ) idx_time: int = 1 # len = 5 idx_chunk: int = 2 # len = 8 - num_chunks: int = 4 # each chunk is 2x5x2 + num_chunks: int = 4 # each chunk is 2x5x2 persistence_method = pet_persist.PersistenceMethod.MOST_RECENT chunker = pet_persist.PersistenceChunker( @@ -63,15 +65,15 @@ def test_persistence_chunker_obj(): assert da.shape.index(8) == chunker.idx_chunk assert chunker.num_chunks == 4 assert chunker.method.num_time_indices_required() == 1 - + # --- name variant --- da = xr.DataArray( - np.random.randint(0, 10, (2,5,8)), + np.random.randint(0, 10, (2, 5, 8)), dims=["x0", "time", "x2"], ) dimname_time: str = "time" # len = 5 dimname_chunk: str = "x2" # len = 8 - num_chunks: int = 4 # each chunk is 2x5x2 + num_chunks: int = 4 # each chunk is 2x5x2 persistence_method = pet_persist.PersistenceMethod.MOST_RECENT chunker = pet_persist.PersistenceChunker( From e7b24fe540c43920b4a96dd59c894b43ef0cbb15 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 16 Feb 2026 16:23:27 +1100 Subject: [PATCH 06/28] ruff: format + fix missing imports on other test. 
--- .../bundled_models/persistence/pyproject.toml | 3 -- .../persistence/src/persistence/_interface.py | 46 ++++++++++--------- .../persistence/tests/test__daskconfig.py | 2 + 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/packages/bundled_models/persistence/pyproject.toml b/packages/bundled_models/persistence/pyproject.toml index 69009d60..56c79cff 100644 --- a/packages/bundled_models/persistence/pyproject.toml +++ b/packages/bundled_models/persistence/pyproject.toml @@ -87,9 +87,6 @@ dask-core = "*" distributed = "*" pyarrow = ">=23.0.0,<24" -[tool.pixi.feature.dev.dependencies] -pyarrow = "*" - [tool.pixi.environments] dask = ["dask"] dev = ["dask", "testing"] diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index 31ffa3df..0528a069 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -6,15 +6,11 @@ import numpy as np import xarray as xr - -_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = ( - 2 # 50% sparsity is reasonable, though some data like -) -# precipitation may be more sparse than this -_MAX_NUM_CHUNKS = ( - 1000 # unlikely to have more than 1000 processes for persistence, due to -) -# diminishing returns - even on a supercomputer +# 50% sparsity is reasonable, though some data like precipitation may be more sparse than this +_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 +# unlikely to have more than 1000 processes for persistence, due to diminishing returns - even on a +# supercomputer +_MAX_NUM_CHUNKS = 1000 _mod_index: Callable[[int, int], np.uint] = np.mod _mod_index.__doc__ = """ @@ -121,18 +117,26 @@ class PersistenceChunker: explored in order to speed up this process. 
""" - da_lazy: xr.DataArray # lazy loaded data array - num_chunks: int # number of chunks to use - method: ( - PersistenceMethod # the persistence method to use (needed for lookback slicing) - ) - idx_time: int = None # axis index for time - idx_chunk: int = None # axis index for chunk - dimname_time: str = None # the time dimension name normally "time" - dimname_chunk: str = ( - None # the dimension name to chunk along, or default to a non-time - ) - # dimension + # lazy loaded data array + da_lazy: xr.DataArray + + # number of chunks to use + num_chunks: int + + # the method - determines how much data needs to be loaded + method: PersistenceMethod + + # axis index for time + idx_time: int = None + + # axis index for chunk + idx_chunk: int = None + + # the time dimension name normally "time" + dimname_time: str = None + + # the dimension name to chunk along, or default to a non-time + dimname_chunk: str = None def __post_init__(self): # --- handle time dimension --- diff --git a/packages/bundled_models/persistence/tests/test__daskconfig.py b/packages/bundled_models/persistence/tests/test__daskconfig.py index c1737cb1..74d43565 100644 --- a/packages/bundled_models/persistence/tests/test__daskconfig.py +++ b/packages/bundled_models/persistence/tests/test__daskconfig.py @@ -75,6 +75,8 @@ def test_dask_single_threaded(): Set single threaded mode and check that the thread ids are the same for each worker. 
""" import dask + import dask.config + import dask.distributed import dask.dataframe as _dd import dask.array as _da From b6c8120ce67eea275b33d703dbde58ed7603cbf2 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 16 Feb 2026 17:58:07 +1100 Subject: [PATCH 07/28] add simple imputation logic for missing values so that median is easier to compute --- .../persistence/src/persistence/__init__.py | 3 ++ .../persistence/src/persistence/_impute.py | 31 ++++++++++++++ .../persistence/src/persistence/_interface.py | 5 +++ .../persistence/src/persistence/_median.py | 21 +++++----- .../persistence/tests/test__impute.py | 41 +++++++++++++++++++ 5 files changed, 91 insertions(+), 10 deletions(-) create mode 100644 packages/bundled_models/persistence/src/persistence/_impute.py create mode 100644 packages/bundled_models/persistence/tests/test__impute.py diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py index a2fa5d0e..87de4174 100644 --- a/packages/bundled_models/persistence/src/persistence/__init__.py +++ b/packages/bundled_models/persistence/src/persistence/__init__.py @@ -4,8 +4,11 @@ PersistenceChunker, ) +from persistence._impute import SimpleImpute + __all__ = [ "PersistenceMethod", "PersistenceDataChunk", "PersistenceChunker", + "SimpleImpute", ] diff --git a/packages/bundled_models/persistence/src/persistence/_impute.py b/packages/bundled_models/persistence/src/persistence/_impute.py new file mode 100644 index 00000000..2896f0c1 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/_impute.py @@ -0,0 +1,31 @@ +""" +This module handles imputation of missing data using very simple techniques. + +Only mean is currently supported. 
+""" + +from dataclasses import dataclass +import numpy as np + + +@dataclass(frozen=True) +class SimpleImpute: + arr: np.ndarray + + def impute_mean(self) -> np.ndarray: + """ + To keep the imputation representative of the data but yet simple we can do a simple + mean interpolation over the data slab. + + NOTE: This is non-deterministic depending on the data chunking strategy. + """ + nanmask = np.isnan(self.arr) + if not nanmask.any() or nanmask.all(): + # if nothing is missing or everything is missing, return the original array as-is + return self.arr + else: + # otherwise, replace missing values with the mean of the slab + # NOTE: the following flattens the array by default if axis isn't specified + fillval = np.nanmean(self.arr) + arr_imputed = np.where(nanmask, fillval, self.arr) + return arr_imputed diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index 0528a069..e1c20dd3 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -1,3 +1,8 @@ +""" +Module that contains the interface required to "hook" into other pipeline methods in order to run +Persistence as a model. 
+""" + from enum import Enum from dataclasses import dataclass from collections.abc import Callable diff --git a/packages/bundled_models/persistence/src/persistence/_median.py b/packages/bundled_models/persistence/src/persistence/_median.py index 646015ba..3aae5f2d 100644 --- a/packages/bundled_models/persistence/src/persistence/_median.py +++ b/packages/bundled_models/persistence/src/persistence/_median.py @@ -1,10 +1,11 @@ -import - -def py_median_of_three( - ds: xr.Dataset, - time_dim: str, - reference_time: datetime.datetime, -): - """ - Computes the median of three - """ +# WIP: +# import +# +# def py_median_of_three( +# ds: xr.Dataset, +# time_dim: str, +# reference_time: datetime.datetime, +# ): +# """ +# Computes the median of three +# """ diff --git a/packages/bundled_models/persistence/tests/test__impute.py b/packages/bundled_models/persistence/tests/test__impute.py new file mode 100644 index 00000000..64675d5e --- /dev/null +++ b/packages/bundled_models/persistence/tests/test__impute.py @@ -0,0 +1,41 @@ +""" +This suite tests the simple imputer +""" + +import persistence as pet_persist +import numpy as np + + +def test_temporal_imputation_no_missing(): + """ + Nothing should change if there's no missing value + """ + arr_no_missing = np.full((5, 4, 3), 1, dtype=np.float64) + imputer = pet_persist.SimpleImpute(arr_no_missing) + arr_ret = imputer.impute_mean() + assert np.allclose(arr_ret, arr_no_missing, equal_nan=True) + + +def test_temporal_imputation_some_missing(): + """ + if some missing, then the nanmean is used to impute. 
+ """ + # have no missing array for reference + arr_no_missing = np.full((5, 4, 3), 1, dtype=np.float64) + # put some nans in a random slab + arr_some_missing = np.full((5, 4, 3), 1, dtype=np.float64) + arr_some_missing[1:3, 0:3, 0] = np.nan + imputer = pet_persist.SimpleImpute(arr_some_missing) + arr_ret = imputer.impute_mean() + assert np.allclose(arr_ret, arr_no_missing, equal_nan=True) + assert np.sum(arr_ret) == 5 * 4 * 3 # (all ones) + + +def test_temporal_imputation_all_nans(): + """ + If all nan => don't alter original array. + """ + arr_all_missing = np.full((5, 4, 3), np.nan, dtype=np.float64) + imputer = pet_persist.SimpleImpute(arr_all_missing) + arr_ret = imputer.impute_mean() + assert np.allclose(arr_ret, arr_all_missing, equal_nan=True) From ab63c97351246fa1783997de8288f2d21bda7834 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 16 Feb 2026 18:15:52 +1100 Subject: [PATCH 08/28] minor adjustment of docs --- .../persistence/src/persistence/_interface.py | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index e1c20dd3..30840887 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -37,17 +37,23 @@ class PersistenceMethod(Enum): fallback = MOST_RECENT MOST_RECENT: - simplest form of persistence, will find the most recent non-`nan` value to use as persistence. + uses the most-recent value as persistence. If there are nans, previous observations are used instead, up until the `max_lookback` threshold as determined by the "sparsity_multiplier". E.g. if the sparsity multiplier was "3" i.e. 66.67% of the data is `nan`, median of three - (which needs exactly 3 non-nan values) will look for non-nan values up to 9 indices before the - reference index to find non-nan candidates. 
+ (which needs exactly 3 non-nan values) will look for non-nan values up to 9 indices prior in + order to fill any missing values. - Most methods will fallback to MOST_RECENT if they fail. If MOST_RECENT fails, the result will be - nan, essentially marking the datapoint as "void" for comparisons. + For now the imputation is kept simple and uses `mean` for speed reasons. e.g. for median of 3 + with a sparsity multiplier of 3, this would be the mean over a 9 by N slab - with N being the + cardinality of the remaining dimensions in a give data array or chunk. + + FUTUREWORK: + Simpler imputations work better with complex learning models. Given that the persistence + models are not at all complex, a clustering algorithm like KNN (by using e.g. kd-trees, for + multiple dimensions) would work better. But this is out of scope for now. """ MOST_RECENT = 0 @@ -89,13 +95,17 @@ class PersistenceDataChunk: IMPORTANT: data should not be chunked over time. """ + # ndarray with a mandatory time axis. Sorted ascending in time. The latest data point is assumed + # to be the "reference" time. + # NOTE: this API may change in the future depending on how temporal indexing is handled in the + # pipeline. + arr_chunk: np.ndarray + + # the time axis - this will be flattened + idx_time: int - arr_chunk: ( - np.ndarray - ) # ndarray including time axis. Sorted ascending in time. The latest - # data point is assumed to be the "reference" time. 
- idx_time: int # the time axis - this will be flattened - method: PersistenceMethod # which method to use to calculate persistence + # the method to use to calculate persistence + method: PersistenceMethod @dataclass From 104c56d4ef38ca6bad0d06610ed136c8337e0173 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Tue, 17 Feb 2026 13:11:01 +1100 Subject: [PATCH 09/28] conversion between common data types as inputs --- packages/bundled_models/persistence/pixi.lock | 2 +- .../persistence/src/persistence/__init__.py | 1 + .../persistence/src/persistence/_datatypes.py | 90 +++++++++++ .../persistence/src/persistence/_interface.py | 4 +- .../src/persistence/persistence_impl.py | 19 +++ .../persistence/tests/test__datatypes.py | 140 ++++++++++++++++++ 6 files changed, 254 insertions(+), 2 deletions(-) create mode 100644 packages/bundled_models/persistence/src/persistence/_datatypes.py create mode 100644 packages/bundled_models/persistence/src/persistence/persistence_impl.py create mode 100644 packages/bundled_models/persistence/tests/test__datatypes.py diff --git a/packages/bundled_models/persistence/pixi.lock b/packages/bundled_models/persistence/pixi.lock index c76a1f28..808e8d15 100644 --- a/packages/bundled_models/persistence/pixi.lock +++ b/packages/bundled_models/persistence/pixi.lock @@ -2559,7 +2559,7 @@ packages: - pypi: ./ name: pyearthtools-bundled-persistence version: 0.6.0 - sha256: dbb05e58245981cba55647de8b43a09454295a58a5faf8299d4d98ba3f3c6ae3 + sha256: 9e80485d242be8dba6bef64a468c827221089a186635053b1b09cd445c386b8d requires_dist: - pyearthtools-zoo>=0.5.0 - pyearthtools-data>=0.5.0 diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py index 87de4174..8eb621e1 100644 --- a/packages/bundled_models/persistence/src/persistence/__init__.py +++ b/packages/bundled_models/persistence/src/persistence/__init__.py @@ -5,6 +5,7 @@ ) from persistence._impute import 
SimpleImpute +from persistence._datatypes import PetDataset __all__ = [ "PersistenceMethod", diff --git a/packages/bundled_models/persistence/src/persistence/_datatypes.py b/packages/bundled_models/persistence/src/persistence/_datatypes.py new file mode 100644 index 00000000..8a10ea8a --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/_datatypes.py @@ -0,0 +1,90 @@ +""" +Common data array/set transformations supported by the persistence model, the main usecase is to map +a function to each data variable independently. This is a common pattern as more often than not we +wouldn't be intermixing variables in basic pre-processing steps. + +TODO: this should be somewhere more common +""" + +from typing import Union, Generic +from collections.abc import Callable +import xarray as xr +import numpy as np +import numpy.typing as npt + +PetDataArrayLike = Union[xr.DataArray, xr.Dataset, npt.ArrayLike] + + +class PetDataset: + def __init__( + self, + arraylike: PetDataArrayLike, + dummy_varname="_dummyvarname", # used for xarray dataarrays and numpy arrays + dimnames: list[str] = None, # used only for numpy arrays + ): + """ + Takes a PetDataArrayLike and converts it to a PetDataset which is compatible with the + `map_each_var` computation. + + `dimnames` is only relevant for numpy - and only if using name-based indexing for retrieving + e.g. 
time dimension + """ + self.ds = PetDataset.from_arrlike(arraylike, dummy_varname, dimnames) + + @staticmethod + def from_np_array(arraylike: npt.ArrayLike, dummy_varname, dimnames) -> xr.Dataset: + return PetDataset.from_xr_dataarray( + xr.DataArray(np.asarray(arraylike), dims=dimnames), dummy_varname + ) + + @staticmethod + def from_xr_dataarray(arraylike: xr.DataArray, dummy_varname) -> xr.Dataset: + return xr.Dataset({dummy_varname: arraylike}) + + @staticmethod + def from_xr_dataset(arraylike: xr.Dataset) -> xr.Dataset: + return arraylike + + @staticmethod + def from_arrlike(arraylike, dummy_varname, dimnames) -> xr.Dataset: + # Order is important here, For example: + # xr.DataArray may be a npt.ArrayLike, but not the other way around. If we swap the order, + # the xr.DataArray constructor will never be reached. + + msg_type_error = """ + The provided data does not have a supported array type, supported array types are: + xr.DataArray, xr.Dataset and np.ndarray. + """ + + if isinstance(arraylike, xr.Dataset): + return PetDataset.from_xr_dataset(arraylike) + + if isinstance(arraylike, xr.DataArray): + return PetDataset.from_xr_dataarray(arraylike, dummy_varname) + + if isinstance(arraylike, (np.ndarray, list, tuple)): + return PetDataset.from_np_array(arraylike, dummy_varname, dimnames) + + # unsupported type + raise TypeError(msg_type_error) + + def map_each_var( + self, _fn: Callable[[xr.DataArray, ...], xr.DataArray], *_fn_args, **_fn_kwargs + ) -> xr.Dataset: + """ + Applies a function over each data array in the dataset. The return type will be dataset. + + The return type of each function operation itself will be per variable (dataarray). + + Only functions that have common structure associated to the variables in the Dataset will + work properly. + + IMPORTANT: global attributes and special variables may not be preserved. This operation is + destructive and for intermediate computation purposes only. 
+ """ + dict_res = {} + + for k_var, v_da in self.ds.data_vars.items(): + dict_res[k_var] = _fn(v_da, *_fn_args, **_fn_kwargs) + + return xr.Dataset(dict_res) diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index 30840887..1f6e8971 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from collections.abc import Callable from contextlib import contextmanager +from typing import Union, Generic import numpy as np import xarray as xr @@ -95,6 +96,7 @@ class PersistenceDataChunk: IMPORTANT: data should not be chunked over time. """ + # ndarray with a mandatory time axis. Sorted ascending in time. The latest data point is assumed # to be the "reference" time. # NOTE: this API may change in the future depending on how temporal indexing is handled in the @@ -102,7 +104,7 @@ class PersistenceDataChunk: arr_chunk: np.ndarray # the time axis - this will be flattened - idx_time: int + idx_time: int # the method to use to calculate persistence method: PersistenceMethod diff --git a/packages/bundled_models/persistence/src/persistence/persistence_impl.py b/packages/bundled_models/persistence/src/persistence/persistence_impl.py new file mode 100644 index 00000000..d45b8960 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/persistence_impl.py @@ -0,0 +1,19 @@ +def _compute_persistence_single(): + """ + Calculate the persistence of observation + + (C, M, D_(TxN), I) -> D_T + + where: + D = data provided - usually observations + (must include time dimension, may have multiple dimensions) + C = chunk strategy + (or none if doing it all in one go) + M = persistence method + (defaults to most recent observation) + I = simple imputation of missing values + (optional) + + Use imputation only if data is sparse and 
predictable. 
+    """
+    raise NotImplementedError()
diff --git a/packages/bundled_models/persistence/tests/test__datatypes.py b/packages/bundled_models/persistence/tests/test__datatypes.py
new file mode 100644
index 00000000..b179ad1c
--- /dev/null
+++ b/packages/bundled_models/persistence/tests/test__datatypes.py
@@ -0,0 +1,140 @@
+"""
+This test suite tests the use of PetDataset to create a common datatype construction for numpy and
+xarray (dataarrays and datasets).
+
+NOTES:
+- Since numpy and xarray dataarrays cannot be completely representable by datasets, they will either
+  be given dummy variables and dimension names, or user-specified variable and dimension names.
+  Creating a common interface to handle all this is tricky.
+- While these dummy names are always options when creating a PetDataset, they should not affect
+  higher types - e.g. datasets will never be overwritten with the _dummyvarname or "dims()" (because
+  it may have several variables with different dimensions).
+"""
+
+import xarray as xr
+import numpy as np
+import persistence as pet_persist
+
+
+def _dummy_sum_fn(x: xr.DataArray, y: int, z: int = 5) -> xr.DataArray:
+    """
+    Dummy function to test mapping, should return a data array, first argument must be a data array. 
+ Can take other arguments that may be required for the computation + """ + return x.sum() + y - z + + +def test_petdataset_type_homomorphism_numpy(): + """ + Test type mapping with numpy arrays + """ + # defaults + test_data = np.ones((5, 2, 3)) + pet_ds = pet_persist.PetDataset(test_data) + res_ds = pet_ds.map_each_var(_dummy_sum_fn, 5) + assert "_dummyvarname" in pet_ds.ds.data_vars + # y = 5 + # z = 5 (default) + # sum = 5 * 2 * 3 = 30 + assert res_ds["_dummyvarname"] == 30 + + # with dummy array naming + pet_ds = pet_persist.PetDataset(test_data, dummy_varname="new_dummy_name") + res_ds = pet_ds.map_each_var(_dummy_sum_fn, 5, z=2) + assert "new_dummy_name" in pet_ds.ds.data_vars + # y = 5 + # z = 2 + # sum = 5 * 2 * 3 = 30 + # res = sum + 5 - 2 = 33 + assert res_ds["new_dummy_name"] == 33 + + # with dimension naming + pet_ds = pet_persist.PetDataset(test_data, dimnames=["x", "time", "y"]) + res_ds = pet_ds.map_each_var(_dummy_sum_fn, y=-10, z=-15) + # y = 5 + # z = 2 + # sum = 5 * 2 * 3 = 30 + # res = sum - 10 - (-15) = 35 + assert res_ds["_dummyvarname"] == 35 + assert set(pet_ds.ds.dims) == set(["x", "time", "y"]) + + +def test_petdataset_type_homomorphism_da(): + """ + Test type mapping with data arrays + """ + # defaults + test_data = xr.DataArray(np.ones((5, 2, 3)), dims=["the", "last", "resort"]) + pet_ds = pet_persist.PetDataset(test_data) + res_ds = pet_ds.map_each_var(_dummy_sum_fn, 5) + assert "_dummyvarname" in pet_ds.ds.data_vars + # y = 5 + # z = 5 (default) + # sum = 5 * 2 * 3 = 30 + assert res_ds["_dummyvarname"] == 30 + + # with dummy array naming + pet_ds = pet_persist.PetDataset(test_data, dummy_varname="new_dummy_name") + res_ds = pet_ds.map_each_var(_dummy_sum_fn, 5, z=2) + assert "new_dummy_name" in pet_ds.ds.data_vars + # y = 5 + # z = 2 + # sum = 5 * 2 * 3 = 30 + # res = sum + 5 - 2 = 33 + assert res_ds["new_dummy_name"] == 33 + + # with dimension naming + pet_ds = pet_persist.PetDataset(test_data, dimnames=["x", "time", "y"]) + res_ds = 
pet_ds.map_each_var(_dummy_sum_fn, y=-10, z=-15) + # y = 5 + # z = 2 + # sum = 5 * 2 * 3 = 30 + # res = sum - 10 - (-15) = 35 + assert res_ds["_dummyvarname"] == 35 + # dimnames should have no effect on dataarrays + assert set(pet_ds.ds.dims) == set(["the", "last", "resort"]) + + +def test_petdataset_type_homomorphism_ds(): + """ + Test type mapping with datasets + """ + # defaults + test_data = xr.Dataset( + { + "potato": xr.DataArray( + np.ones((5, 2, 3)), + dims=["the", "last", "resort"], + ), + "tomato": xr.DataArray( + np.ones((2, 1, 2)), + dims=["x", "y", "z"], + ), + } + ) + pet_ds = pet_persist.PetDataset(test_data) + res_ds = pet_ds.map_each_var(_dummy_sum_fn, 5) + + # _dummyvarname should be ignored for datasets by default + assert "_dummyvarname" not in pet_ds.ds.data_vars + assert res_ds["potato"] == 30 + assert res_ds["tomato"] == 4 + + # with dummy array naming + pet_ds = pet_persist.PetDataset(test_data, dummy_varname="new_dummy_name") + res_ds = pet_ds.map_each_var(_dummy_sum_fn, 5, z=2) + + # _dummyvarname should be ignored for datasets even when forced + assert "new_dummy_name" not in pet_ds.ds.data_vars + assert res_ds["potato"] == 33 + assert res_ds["tomato"] == 7 + + # with dimension naming + pet_ds = pet_persist.PetDataset(test_data, dimnames=["x", "time", "y"]) + res_ds = pet_ds.map_each_var(_dummy_sum_fn, y=-10, z=-15) + assert res_ds["potato"] == 35 + assert res_ds["tomato"] == 9 + + # dimnames should have no effect on dataarrays within the dataset + assert set(pet_ds.ds["potato"].dims) == set(["the", "last", "resort"]) + assert set(pet_ds.ds["tomato"].dims) == set(["x", "y", "z"]) From fb74a25ae0019a4d93ca1db4fb8f7778ebffff66 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Wed, 18 Feb 2026 10:57:29 +1100 Subject: [PATCH 10/28] [skip ci] WIP commit - persistence computation entrypoint in persistence_impl --- .../persistence/src/persistence/__init__.py | 4 +- .../persistence/src/persistence/_datatypes.py | 76 ++++++++++++++-- 
.../persistence/src/persistence/_interface.py | 36 ++++++-- .../src/persistence/persistence_impl.py | 90 ++++++++++++++++++- 4 files changed, 189 insertions(+), 17 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py index 8eb621e1..24083767 100644 --- a/packages/bundled_models/persistence/src/persistence/__init__.py +++ b/packages/bundled_models/persistence/src/persistence/__init__.py @@ -5,11 +5,13 @@ ) from persistence._impute import SimpleImpute -from persistence._datatypes import PetDataset +from persistence._datatypes import PetDataset, PetDataArrayLike __all__ = [ "PersistenceMethod", "PersistenceDataChunk", "PersistenceChunker", "SimpleImpute", + "PetDataset", + "PetDataArrayLike", ] diff --git a/packages/bundled_models/persistence/src/persistence/_datatypes.py b/packages/bundled_models/persistence/src/persistence/_datatypes.py index 8a10ea8a..ed6c5293 100644 --- a/packages/bundled_models/persistence/src/persistence/_datatypes.py +++ b/packages/bundled_models/persistence/src/persistence/_datatypes.py @@ -8,6 +8,7 @@ from typing import Union, Generic from collections.abc import Callable +from enum import StrEnum, auto import xarray as xr import numpy as np import numpy.typing as npt @@ -15,6 +16,12 @@ PetDataArrayLike = Union[xr.DataArray, xr.Dataset, npt.ArrayLike] +class PetInputDataType(StrEnum): + XR_DATAARRAY = "xr_dataarray" + XR_DATASET = "xr_dataset" + NP_ARRAY = "np_array" + UNKNOWN = auto() + class PetDataset: def __init__( self, @@ -29,20 +36,31 @@ def __init__( `dimnames` is only relevant for numpy - and only if using name-based indexing for retrieving e.g. 
time dimension """ + self.raw_type = PetInputDataType.UNKNOWN self.ds = PetDataset.from_arrlike(arraylike, dummy_varname, dimnames) + self.return_raw_result = False + + def with_return_raw_result(self, return_raw_result=bool): + """ + Optionally set this to return raw array from `map_each_var` + """ + self.return_raw_result = return_raw_result @staticmethod def from_np_array(arraylike: npt.ArrayLike, dummy_varname, dimnames) -> xr.Dataset: + self.raw_type = PetInputDataType.NP_ARRAY return PetDataset.from_xr_dataarray( - xr.DataArray(np.asarray(arraylike), dims=dimnames), dummy_varname + xr.DataArray(arraylike, dims=dimnames), dummy_varname ) @staticmethod def from_xr_dataarray(arraylike: xr.DataArray, dummy_varname) -> xr.Dataset: + self.raw_type = PetInputDataType.XR_DATAARRAY return xr.Dataset({dummy_varname: arraylike}) @staticmethod def from_xr_dataset(arraylike: xr.Dataset) -> xr.Dataset: + self.raw_type = PetInputDataType.XR_DATASET return arraylike @staticmethod @@ -63,14 +81,18 @@ def from_arrlike(arraylike, dummy_varname, dimnames) -> xr.Dataset: return PetDataset.from_xr_dataarray(arraylike, dummy_varname) if isinstance(arraylike, (np.ndarray, list, tuple)): + arraylike = np.asarray(arraylike) # force convert just in case return PetDataset.from_np_array(arraylike, dummy_varname, dimnames) # unsupported type raise TypeError(msg_type_error) def map_each_var( - self, _fn: Callable[[xr.DataArray, ...], xr.DataArray], *_fn_args, **_fn_kwargs - ) -> xr.Dataset: + self, + _fn: Callable[[xr.DataArray, ...], xr.DataArray], + *_fn_args, + **_fn_kwargs, + ) -> PetDataArrayLike """ Applies a function over each data array in the dataset. The return type will be dataset. @@ -81,10 +103,54 @@ def map_each_var( IMPORTANT: global attributes and special variables may not be preserved. This operation is destructive and for intermediate computation purposes only. 
+ + Args: + _fn: takes a DataArray as its first input arg and produces a DataArray as output + _fn_args: additional positional arguments to provide to _fn + _fn_kwargs: additional keyword arguments to provide to _fn """ dict_res = {} + invalid_ret_err_msg = ( + "PetDataset.map_each_var: Expect function to return a single xr.DataArray" + ) for k_var, v_da in self.ds.data_vars.items(): - dict_res[k_var] = _fn(v_da, *_fn_args, **_fn_kwargs) + # sense check + assert isinstance(v_da, xr.DataArray) + + da_res = _fn(v_da, *_fn_args, **_fn_kwargs) + + if not isinstance(da_res, xr.DataArray): + raise RuntimeError(invalid_ret_err_msg) + + dict_res[k_var] = da_res + + ds_res = xr.Dataset(dict_res) + + if self.return_raw_result: + return self._raw_result(ds_res) + + # return upgraded dataset by default + return ds_res + + def _raw_result(self, ds: xr.Dataset) -> PetDataArrayLike: + """ + Converts a result back into the original data structure. Down-converting is a lot safer and + so less checks required. + + NOTE: the returned datatype may have dummy names attached, as such these results are for + intermediate computation purposes only, not for operational outputs. + """ + if self.raw_type == PetDataArrayLike.UNKNOWN: + # this should not happen - _raw_result should not be called externally + raise RuntimeError("PetDataset._raw_result: Invalid raw type encountered") + elif self.raw_type == PetDataArrayLike.XR_DATASET: + # nothing to do + return ds + elif self.raw_type == PetDataArrayLike.XR_DATAARRAY: + # extract the dataarray + return ds[self._dummyvarname] + elif self.raw_type == PetDataArrayLike.NP_ARRAY: + # extract the numpy array - note this may force a memory load. 
+ return ds[self._dummyvarname].values - return xr.Dataset(dict_res) diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index 1f6e8971..ab0f488a 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -3,7 +3,7 @@ Persistence as a model. """ -from enum import Enum +from enum import StrEnum, auto from dataclasses import dataclass from collections.abc import Callable from contextlib import contextmanager @@ -29,7 +29,7 @@ """ -class PersistenceMethod(Enum): +class PersistenceMethod(StrEnum): """ Methods to use for persistence. @@ -57,10 +57,30 @@ class PersistenceMethod(Enum): multiple dimensions) would work better. But this is out of scope for now. """ - MOST_RECENT = 0 - MEDIAN_OF_THREE = 1 + MOST_RECENT = "most_recent" + MEDIAN_OF_THREE = "median_of_three" - def num_time_indices_required(self): + def flatten_non_temporal(self) -> bool: + """ + Whether or not to flatten non-temporal dimensions during computation. + For methods that do not need spatial dependence, this makes chunking a lot easier. + """ + _default = False + match self: + case PersistenceMethod.MOST_RECENT: + return True + case PersistenceMethod.MEDIAN_OF_THREE: + return True + case _: + raise NotImplementedError( + "PersistenceMethod: Invalid persistence method." 
+ ) + return _default + + def num_time_indices_required(self) -> int: + """ + number of time indices required for computing a particular method + """ match self: case PersistenceMethod.MOST_RECENT: return 1 @@ -137,12 +157,12 @@ class PersistenceChunker: # lazy loaded data array da_lazy: xr.DataArray - # number of chunks to use - num_chunks: int - # the method - determines how much data needs to be loaded method: PersistenceMethod + # number of chunks to use + num_chunks: int + # axis index for time idx_time: int = None diff --git a/packages/bundled_models/persistence/src/persistence/persistence_impl.py b/packages/bundled_models/persistence/src/persistence/persistence_impl.py index d45b8960..7edfee16 100644 --- a/packages/bundled_models/persistence/src/persistence/persistence_impl.py +++ b/packages/bundled_models/persistence/src/persistence/persistence_impl.py @@ -1,19 +1,103 @@ -def _compute_persistence_single(): +import persistence as pet_persist + + +# TODO: convert to builder pattern +def _compute_persistence( + arr: pet_persist.PetDataArrayLike, + idx_time: int, + idx_chunk: int = None, + num_workers: int = None, + num_chunks: int = None, + method: PersistenceMethod | str = PersistenceMethod.MOST_RECENT, + simple_impute: bool = True, + return_raw_result: bool = False, +) -> pet_persist.PetDataArrayLike: """ Calculate the persistence of observation - (C, M, D_(TxN), I) -> D_T + (C, M, D_(TxN), I) -> D_(T'xN) where: D = data provided - usually observations (must include time dimension, may have multiple dimensions) - C = chunk strategy + C = chunk strategy (index, number of chunks) (or none if doing it all in one go) M = persistence method (defaults to most recent observation) I = simple imputation of missing values (optional) + T = time dimension + T' = forecast time/lead time + N = other dimensions + D_(T'xN) = data collapsed to persistence output Use imputation only if data is sparse and predictable. 
+ + Args: + ds (array-like) - required: + ArrayLike - supports numpy and xarray + idx_time (int) - required : + the dimension for time index + idx_chunk (int): + the dimension used for chunking (ignored if the method flattens non-temporal + dimensions). Otherwise, if not specified, automatically chooses a chunking dimension. + num_workers (int): + number of workers to use for processing persistence, defaults to number of cpus. + num_chunks (int): + number of chunks to use, defaults to `min(num_cpu, len(chunk_dimension))` + method (string/enum): + see `PersistenceMethod`. Supports "most_recent" (default) and "median_of_three" + simple_impute (bool): + defaults to True. Set to False if nan needs to be preserved. + NOTE: methods that require multiple non-nan datapoints to function may be forced to nan. + return_raw_result (bool): + whether to return the result in the original data type. By default it returns a Dataset. + + Returns: + Original dataset with lead time filled with persistence values. + FUTUREWORK: + - for more complex modes (not yet implemented) the leadtimes are not constant. """ + # for a given leadtime: + # input data -> upgrade to PetDataset -> map_each_var -> _compute_persistence_single + # TODO: + # - lead time handling for more complex methods + method = pet_persist.PersistenceMethod(method) + pet_ds = pet_persist.PetDataset(arr) + ds_result = pet_ds.map_each_var( + _compute_persistence_single, + idx_time, + idx_chunk, + num_chunks, + method, + simple_impute, + return_raw_result, + ) + + return ds_result + +def _compute_persistence_single( + da: xr.DataArray, + idx_time: int, + idx_chunk: int = None, + num_chunks: int = None, + method: PersistenceMethod = PersistenceMethod.MOST_RECENT, + simple_impute: bool = True, +): + """ + Computes persistence for a single data array, has the same interface as _compute_persistence + except that the first argument is a data array. 
+ """ + # input dataarray -> chunk -> impute -> compute persistence -> merge chunks + chunker = PersistenceChunker( + da_lazy=da, + method=method, + num_chunks=num_chunks, + idx_time=idx_time, + idx_chunk=idx_chunk, + ) + + # TODO: worker pool + # TODO: work chain i.e. slice -> impute -> compute + # TODO: merge result raise NotImplementedError() From 422bc8860d246eb0e28eaa6b70a73613006f4448 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Wed, 18 Feb 2026 11:08:50 +1100 Subject: [PATCH 11/28] [skip ci] wrong interface for return_raw_type --- .../persistence/src/persistence/persistence_impl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/persistence_impl.py b/packages/bundled_models/persistence/src/persistence/persistence_impl.py index 7edfee16..1613afdc 100644 --- a/packages/bundled_models/persistence/src/persistence/persistence_impl.py +++ b/packages/bundled_models/persistence/src/persistence/persistence_impl.py @@ -63,7 +63,7 @@ def _compute_persistence( # TODO: # - lead time handling for more complex methods method = pet_persist.PersistenceMethod(method) - pet_ds = pet_persist.PetDataset(arr) + pet_ds = pet_persist.PetDataset(arr).with_return_raw_result(return_raw_result) ds_result = pet_ds.map_each_var( _compute_persistence_single, idx_time, @@ -71,7 +71,6 @@ def _compute_persistence( num_chunks, method, simple_impute, - return_raw_result, ) return ds_result From fa3c0567e6ab9ff870a8d995e660def87bead408 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Wed, 18 Feb 2026 11:12:05 +1100 Subject: [PATCH 12/28] [skip ci] WIP: ruff checks --- .../persistence/src/persistence/_datatypes.py | 6 +++--- .../persistence/src/persistence/_interface.py | 4 +++- .../persistence/src/persistence/persistence_impl.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/_datatypes.py 
b/packages/bundled_models/persistence/src/persistence/_datatypes.py index ed6c5293..24cf1c31 100644 --- a/packages/bundled_models/persistence/src/persistence/_datatypes.py +++ b/packages/bundled_models/persistence/src/persistence/_datatypes.py @@ -22,6 +22,7 @@ class PetInputDataType(StrEnum): NP_ARRAY = "np_array" UNKNOWN = auto() + class PetDataset: def __init__( self, @@ -88,11 +89,11 @@ def from_arrlike(arraylike, dummy_varname, dimnames) -> xr.Dataset: raise TypeError(msg_type_error) def map_each_var( - self, + self, _fn: Callable[[xr.DataArray, ...], xr.DataArray], *_fn_args, **_fn_kwargs, - ) -> PetDataArrayLike + ) -> PetDataArrayLike: """ Applies a function over each data array in the dataset. The return type will be dataset. @@ -153,4 +154,3 @@ def _raw_result(self, ds: xr.Dataset) -> PetDataArrayLike: elif self.raw_type == PetDataArrayLike.NP_ARRAY: # extract the numpy array - note this may force a memory load. return ds[self._dummyvarname].values - diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/_interface.py index ab0f488a..e06abbdc 100644 --- a/packages/bundled_models/persistence/src/persistence/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/_interface.py @@ -14,10 +14,12 @@ # 50% sparsity is reasonable, though some data like precipitation may be more sparse than this _DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 + # unlikely to have more than 1000 processes for persistence, due to diminishing returns - even on a # supercomputer _MAX_NUM_CHUNKS = 1000 +# subtype of np.mod that works only with ints. _mod_index: Callable[[int, int], np.uint] = np.mod _mod_index.__doc__ = """ Maps negative integer to a positive integer element in a ring. The ring has a domain of @@ -62,7 +64,7 @@ class PersistenceMethod(StrEnum): def flatten_non_temporal(self) -> bool: """ - Whether or not to flatten non-temporal dimensions during computation. 
+ Whether or not to flatten non-temporal dimensions during computation. For methods that do not need spatial dependence, this makes chunking a lot easier. """ _default = False diff --git a/packages/bundled_models/persistence/src/persistence/persistence_impl.py b/packages/bundled_models/persistence/src/persistence/persistence_impl.py index 1613afdc..e275cb37 100644 --- a/packages/bundled_models/persistence/src/persistence/persistence_impl.py +++ b/packages/bundled_models/persistence/src/persistence/persistence_impl.py @@ -48,7 +48,7 @@ def _compute_persistence( method (string/enum): see `PersistenceMethod`. Supports "most_recent" (default) and "median_of_three" simple_impute (bool): - defaults to True. Set to False if nan needs to be preserved. + defaults to True. Set to False if nan needs to be preserved. NOTE: methods that require multiple non-nan datapoints to function may be forced to nan. return_raw_result (bool): whether to return the result in the original data type. By default it returns a Dataset. @@ -65,7 +65,7 @@ def _compute_persistence( method = pet_persist.PersistenceMethod(method) pet_ds = pet_persist.PetDataset(arr).with_return_raw_result(return_raw_result) ds_result = pet_ds.map_each_var( - _compute_persistence_single, + _compute_persistence_single, idx_time, idx_chunk, num_chunks, @@ -75,6 +75,7 @@ def _compute_persistence( return ds_result + def _compute_persistence_single( da: xr.DataArray, idx_time: int, From e075b286d443b210cc6f99f6428558fa98b57942 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 23 Feb 2026 14:46:01 +1100 Subject: [PATCH 13/28] [skip ci] wip add median of three method using numpy. Needs to be refactored. 
--- .../persistence/src/persistence/_datatypes.py | 24 ++++---- .../persistence/src/persistence/_median.py | 58 +++++++++++++++---- .../persistence/tests/test__median.py | 43 ++++++++++++++ 3 files changed, 101 insertions(+), 24 deletions(-) create mode 100644 packages/bundled_models/persistence/tests/test__median.py diff --git a/packages/bundled_models/persistence/src/persistence/_datatypes.py b/packages/bundled_models/persistence/src/persistence/_datatypes.py index 24cf1c31..feca2f8d 100644 --- a/packages/bundled_models/persistence/src/persistence/_datatypes.py +++ b/packages/bundled_models/persistence/src/persistence/_datatypes.py @@ -38,7 +38,7 @@ def __init__( e.g. time dimension """ self.raw_type = PetInputDataType.UNKNOWN - self.ds = PetDataset.from_arrlike(arraylike, dummy_varname, dimnames) + self.ds = self.from_arrlike(arraylike, dummy_varname, dimnames) self.return_raw_result = False def with_return_raw_result(self, return_raw_result=bool): @@ -47,25 +47,23 @@ def with_return_raw_result(self, return_raw_result=bool): """ self.return_raw_result = return_raw_result - @staticmethod - def from_np_array(arraylike: npt.ArrayLike, dummy_varname, dimnames) -> xr.Dataset: + def from_np_array( + self, arraylike: npt.ArrayLike, dummy_varname, dimnames + ) -> xr.Dataset: self.raw_type = PetInputDataType.NP_ARRAY - return PetDataset.from_xr_dataarray( + return self.from_xr_dataarray( xr.DataArray(arraylike, dims=dimnames), dummy_varname ) - @staticmethod - def from_xr_dataarray(arraylike: xr.DataArray, dummy_varname) -> xr.Dataset: + def from_xr_dataarray(self, arraylike: xr.DataArray, dummy_varname) -> xr.Dataset: self.raw_type = PetInputDataType.XR_DATAARRAY return xr.Dataset({dummy_varname: arraylike}) - @staticmethod - def from_xr_dataset(arraylike: xr.Dataset) -> xr.Dataset: + def from_xr_dataset(self, arraylike: xr.Dataset) -> xr.Dataset: self.raw_type = PetInputDataType.XR_DATASET return arraylike - @staticmethod - def from_arrlike(arraylike, dummy_varname, 
dimnames) -> xr.Dataset: + def from_arrlike(self, arraylike, dummy_varname, dimnames) -> xr.Dataset: # Order is important here, For example: # xr.DataArray may be a npt.ArrayLike, but not the other way around. If we swap the order, # the xr.DataArray constructor will never be reached. @@ -76,14 +74,14 @@ def from_arrlike(arraylike, dummy_varname, dimnames) -> xr.Dataset: """ if isinstance(arraylike, xr.Dataset): - return PetDataset.from_xr_dataset(arraylike) + return self.from_xr_dataset(arraylike) if isinstance(arraylike, xr.DataArray): - return PetDataset.from_xr_dataarray(arraylike, dummy_varname) + return self.from_xr_dataarray(arraylike, dummy_varname) if isinstance(arraylike, (np.ndarray, list, tuple)): arraylike = np.asarray(arraylike) # force convert just in case - return PetDataset.from_np_array(arraylike, dummy_varname, dimnames) + return self.from_np_array(arraylike, dummy_varname, dimnames) # unsupported type raise TypeError(msg_type_error) diff --git a/packages/bundled_models/persistence/src/persistence/_median.py b/packages/bundled_models/persistence/src/persistence/_median.py index 3aae5f2d..015d4dfc 100644 --- a/packages/bundled_models/persistence/src/persistence/_median.py +++ b/packages/bundled_models/persistence/src/persistence/_median.py @@ -1,11 +1,47 @@ -# WIP: -# import -# -# def py_median_of_three( -# ds: xr.Dataset, -# time_dim: str, -# reference_time: datetime.datetime, -# ): -# """ -# Computes the median of three -# """ +import numpy as np +import warnings + +# TODO: get this from common definition - requires refactor +_LOOKBACK = 3 + + +def _median_of_three_numpy(arr: np.ndarray, idx_time: int) -> np.ndarray: + """ + Computes median of three along the time index, ignores nans; if a + particular coordinate is all nan for the required time indices, the + output is nan for that entry. 
+ """
+    # safety: this should have been handled at the top level
+    len_time = arr.shape[idx_time]
+    assert len_time >= _LOOKBACK
+
+    # ---
+    # select relevant array indices by time, based on lookback
+    #
+    # TODO: this should happen someplace higher up assumes latest obs is at the end, similar to
+    # _LOOKBACK.
+    idx_end = len_time
+    idx_start = idx_end - _LOOKBACK
+    idx_slice = slice(idx_start, idx_end, 1)  # start, end, step
+    # generator for nd-index slicing
+    idx_all = slice(None, None, None)
+    nd_slice = (idx_slice if i == idx_time else idx_all for i in range(len(arr.shape)))
+    # sliced array that only has the latest 3 values
+    arr_slice = arr[*tuple(nd_slice)]
+    # ---
+
+    # ---
+    # calculate the median along the time axis
+    #
+    # NOTE: ignore numpy warnings as allowing all `nan` is intentional
+    #
+    # NOTE: `keepdims=True` because we want to keep the dimensional structure of the variable
+    # being computed at a higher level.
+    #
+    # TODO: this should be replaced by a fast median of three algorithm using if/else statements
+    # or a ternary operator equivalent. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore") + arr_median = np.nanmedian(arr_slice, axis=idx_time, keepdims=True) + return arr_median + # --- diff --git a/packages/bundled_models/persistence/tests/test__median.py b/packages/bundled_models/persistence/tests/test__median.py new file mode 100644 index 00000000..8b99d740 --- /dev/null +++ b/packages/bundled_models/persistence/tests/test__median.py @@ -0,0 +1,43 @@ +import numpy as np +from persistence._median import _median_of_three_numpy + + +def test_median_of_three_numpy_basic(): + """ + Tests that the dimensions are preserved except the time dimension which is + reduced (but not squeezed) to one + """ + + # --- case 1 --- + # create a simple array and throw in an outlier for sense check + input_arr = np.array([[1, 2, 3], [5, 2, 6], [0, 191, 4]]) + expect_arr = np.array([[2], [5], [4]]) + idx_time = 1 # second dimension (idx=1) is time + result_arr = _median_of_three_numpy(input_arr, idx_time) + assert np.allclose(result_arr, expect_arr) + + # --- case 2 --- + # check dimensionality is preserved for >2 dimensions + # the values actually don't matter here. + input_arr = np.full((5, 4, 3, 4, 5), 1, dtype=np.float64) + idx_time = 3 # arbitrarily make fourth dimension time (idx_time = 3) + expect_shape = (5, 4, 3, 1, 5) + result_arr = _median_of_three_numpy(input_arr, idx_time) + result_shape = result_arr.shape + assert expect_shape == result_shape + + +def test_median_of_three_numpy_all_nans(): + """ + Test that all nans doesn't spit out a warning and that the associated + dimension is filled with a `nan` + """ + pass + + +def test_median_of_three_numpy_partial_nan(): + """ + Test that partial nans are still handled. i.e. median of two numbers will + just be their mean and median of one number will just be itself. 
+ """ + pass From 17cf489384938052bc3a2f37d73bc591bf05885f Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 23 Feb 2026 15:15:30 +1100 Subject: [PATCH 14/28] [skip ci] add remaining tests for median calc with numpy --- .../bundled_models/persistence/tests/test__median.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/bundled_models/persistence/tests/test__median.py b/packages/bundled_models/persistence/tests/test__median.py index 8b99d740..6e8f63a5 100644 --- a/packages/bundled_models/persistence/tests/test__median.py +++ b/packages/bundled_models/persistence/tests/test__median.py @@ -32,7 +32,11 @@ def test_median_of_three_numpy_all_nans(): Test that all nans doesn't spit out a warning and that the associated dimension is filled with a `nan` """ - pass + input_arr = np.array([[1, 2, 3], [5, 2, 6], [np.nan, np.nan, np.nan]]) + expect_arr = np.array([[2], [5], [np.nan]]) + idx_time = 1 # second dimension (idx=1) is time + result_arr = _median_of_three_numpy(input_arr, idx_time) + assert np.allclose(result_arr, expect_arr, equal_nan=True) def test_median_of_three_numpy_partial_nan(): @@ -40,4 +44,8 @@ def test_median_of_three_numpy_partial_nan(): Test that partial nans are still handled. i.e. median of two numbers will just be their mean and median of one number will just be itself. 
""" - pass + input_arr = np.array([[1, 2, 3], [5, 2, np.nan], [5, np.nan, np.nan]]) + expect_arr = np.array([[2], [3.5], [5]]) + idx_time = 1 # second dimension (idx=1) is time + result_arr = _median_of_three_numpy(input_arr, idx_time) + assert np.allclose(result_arr, expect_arr) From 643bd1d23c583add97974a0c7bc17c30d7933fb3 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Tue, 24 Feb 2026 10:56:33 +1100 Subject: [PATCH 15/28] [skip ci] refactor private modules --- .../bundled_models/persistence/src/persistence/__init__.py | 6 +++--- .../src/persistence/{_daskconfig.py => daskconfig.py} | 0 .../persistence/{_mostrecent.py => interface/__init__.py} | 0 .../persistence/src/persistence/interface/_compute.py | 0 .../src/persistence/{ => interface}/_interface.py | 0 .../persistence/src/persistence/methods/__init__.py | 0 .../persistence/src/persistence/{ => methods}/_impute.py | 0 .../persistence/src/persistence/{ => methods}/_median.py | 0 .../persistence/src/persistence/methods/_mostrecent.py | 0 .../persistence/src/persistence/{_datatypes.py => types.py} | 0 .../bundled_models/persistence/tests/test__daskconfig.py | 2 +- packages/bundled_models/persistence/tests/test__median.py | 2 +- 12 files changed, 5 insertions(+), 5 deletions(-) rename packages/bundled_models/persistence/src/persistence/{_daskconfig.py => daskconfig.py} (100%) rename packages/bundled_models/persistence/src/persistence/{_mostrecent.py => interface/__init__.py} (100%) create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_compute.py rename packages/bundled_models/persistence/src/persistence/{ => interface}/_interface.py (100%) create mode 100644 packages/bundled_models/persistence/src/persistence/methods/__init__.py rename packages/bundled_models/persistence/src/persistence/{ => methods}/_impute.py (100%) rename packages/bundled_models/persistence/src/persistence/{ => methods}/_median.py (100%) create mode 100644 
packages/bundled_models/persistence/src/persistence/methods/_mostrecent.py rename packages/bundled_models/persistence/src/persistence/{_datatypes.py => types.py} (100%) diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py index 24083767..e1534ca8 100644 --- a/packages/bundled_models/persistence/src/persistence/__init__.py +++ b/packages/bundled_models/persistence/src/persistence/__init__.py @@ -1,11 +1,11 @@ -from persistence._interface import ( +from persistence.interface._interface import ( PersistenceMethod, PersistenceDataChunk, PersistenceChunker, ) -from persistence._impute import SimpleImpute -from persistence._datatypes import PetDataset, PetDataArrayLike +from persistence.methods._impute import SimpleImpute +from persistence.types import PetDataset, PetDataArrayLike __all__ = [ "PersistenceMethod", diff --git a/packages/bundled_models/persistence/src/persistence/_daskconfig.py b/packages/bundled_models/persistence/src/persistence/daskconfig.py similarity index 100% rename from packages/bundled_models/persistence/src/persistence/_daskconfig.py rename to packages/bundled_models/persistence/src/persistence/daskconfig.py diff --git a/packages/bundled_models/persistence/src/persistence/_mostrecent.py b/packages/bundled_models/persistence/src/persistence/interface/__init__.py similarity index 100% rename from packages/bundled_models/persistence/src/persistence/_mostrecent.py rename to packages/bundled_models/persistence/src/persistence/interface/__init__.py diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/bundled_models/persistence/src/persistence/_interface.py b/packages/bundled_models/persistence/src/persistence/interface/_interface.py similarity index 100% rename from 
packages/bundled_models/persistence/src/persistence/_interface.py rename to packages/bundled_models/persistence/src/persistence/interface/_interface.py diff --git a/packages/bundled_models/persistence/src/persistence/methods/__init__.py b/packages/bundled_models/persistence/src/persistence/methods/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/bundled_models/persistence/src/persistence/_impute.py b/packages/bundled_models/persistence/src/persistence/methods/_impute.py similarity index 100% rename from packages/bundled_models/persistence/src/persistence/_impute.py rename to packages/bundled_models/persistence/src/persistence/methods/_impute.py diff --git a/packages/bundled_models/persistence/src/persistence/_median.py b/packages/bundled_models/persistence/src/persistence/methods/_median.py similarity index 100% rename from packages/bundled_models/persistence/src/persistence/_median.py rename to packages/bundled_models/persistence/src/persistence/methods/_median.py diff --git a/packages/bundled_models/persistence/src/persistence/methods/_mostrecent.py b/packages/bundled_models/persistence/src/persistence/methods/_mostrecent.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/bundled_models/persistence/src/persistence/_datatypes.py b/packages/bundled_models/persistence/src/persistence/types.py similarity index 100% rename from packages/bundled_models/persistence/src/persistence/_datatypes.py rename to packages/bundled_models/persistence/src/persistence/types.py diff --git a/packages/bundled_models/persistence/tests/test__daskconfig.py b/packages/bundled_models/persistence/tests/test__daskconfig.py index 74d43565..f7472e31 100644 --- a/packages/bundled_models/persistence/tests/test__daskconfig.py +++ b/packages/bundled_models/persistence/tests/test__daskconfig.py @@ -5,7 +5,7 @@ from dataclasses import dataclass import numpy as np import persistence as pet_persist -import persistence._daskconfig as pet_daskconfig 
+import persistence.daskconfig as pet_daskconfig @dataclass diff --git a/packages/bundled_models/persistence/tests/test__median.py b/packages/bundled_models/persistence/tests/test__median.py index 6e8f63a5..801f06a6 100644 --- a/packages/bundled_models/persistence/tests/test__median.py +++ b/packages/bundled_models/persistence/tests/test__median.py @@ -1,5 +1,5 @@ import numpy as np -from persistence._median import _median_of_three_numpy +from persistence.methods._median import _median_of_three_numpy def test_median_of_three_numpy_basic(): From a7da4dc421d8d80c3fafa6848d9fd1dc127032ed Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Thu, 26 Feb 2026 17:05:15 +1100 Subject: [PATCH 16/28] [skip ci] refactor interface - midway --- .../persistence/src/persistence/__init__.py | 17 - .../src/persistence/interface/_backends.py | 28 ++ .../src/persistence/interface/_chunker.py | 214 +++++++++++++ .../src/persistence/interface/_compute.py | 0 .../src/persistence/interface/_interface.py | 298 ++++++------------ .../src/persistence/interface/_metadata.py | 40 +++ .../src/persistence/interface/_method.py | 53 ++++ .../persistence/tests/test__interface.py | 137 ++++++-- 8 files changed, 529 insertions(+), 258 deletions(-) create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_backends.py create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_chunker.py delete mode 100644 packages/bundled_models/persistence/src/persistence/interface/_compute.py create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_metadata.py create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_method.py diff --git a/packages/bundled_models/persistence/src/persistence/__init__.py b/packages/bundled_models/persistence/src/persistence/__init__.py index e1534ca8..e69de29b 100644 --- a/packages/bundled_models/persistence/src/persistence/__init__.py +++ 
b/packages/bundled_models/persistence/src/persistence/__init__.py @@ -1,17 +0,0 @@ -from persistence.interface._interface import ( - PersistenceMethod, - PersistenceDataChunk, - PersistenceChunker, -) - -from persistence.methods._impute import SimpleImpute -from persistence.types import PetDataset, PetDataArrayLike - -__all__ = [ - "PersistenceMethod", - "PersistenceDataChunk", - "PersistenceChunker", - "SimpleImpute", - "PetDataset", - "PetDataArrayLike", -] diff --git a/packages/bundled_models/persistence/src/persistence/interface/_backends.py b/packages/bundled_models/persistence/src/persistence/interface/_backends.py new file mode 100644 index 00000000..58d8d0b7 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/_backends.py @@ -0,0 +1,28 @@ +from enum import StrEnum, auto + +class PersistenceBackendType(StrEnum): + """ + Supported compute backends. + + NOTE: only NUMPY is currently supported + """ + + NUMPY = "numpy" + NUMBA = "numba" + RUST = "rust" + UNKNOWN = auto() + + # overridable + def check_support(self): + """ + This check only guarentees that a particular computation mechanism or library is available. + Individual methods may still not support a particular backend and that check is handled at a + lower level. 
+ """ + match self: + case PersistenceBackendType.NUMPY: + return + case _: + raise NotImplementedError( + f"PersistenceBackendType: {self} is not supported" + ) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py new file mode 100644 index 00000000..5c95fc40 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -0,0 +1,214 @@ +from dataclasses import dataclass +import numpy as np +import xarray as xr +import functools + +from persistence.interface._metadata import PersistenceMetadata +from persistence.types import PetDataArrayLike + + +@dataclass +class PersistenceDataChunk: + """ + Container to hold a reference to the persistence metadata. + """ + metadata: PersistenceMetadata + arr_chunk: PetDataArrayLike + + +@dataclass +class PersistenceChunker: + """ + The persistence chunker chunks a xarray dataarray as part of a generator (lazy). + + + The chunking algorithm is as follows: + + Divide the total size (product of the data shape) by the desired number of chunks (rounded + up, min chunk size = 1). This is the desired chunk size. + + Working backwards from the fastest varying index/axis/dimension (len - 1), find if the + desired chunk size is greater tha the product of the cardinality any slower varying indices. + (natural element = 1) e.g. + + product[len - 1] = 1 + product[len - 2] = shape[len - 1] * product[len - 1] + product[len - 3] = shape[len - 2] * product[len - 2] + ... + + If the chunk size is smaller than the product, stop. Create a marker at this index - call it + the "stop" index (i.e. the most significant index used for the chunk size calculation). The + product at the given iteration is the _actual_ chunk size. + + Then, for all indices that are more significant the "stop" index, increment it as a multi + index ring to find the start and end indices of the hyperslab. 
@dataclass
class PersistenceChunker:
    """
    Lazily slices an xarray DataArray into hyperslab chunks via a generator.

    Greedy chunking algorithm:

    1. Divide the total size (product of the data shape) by the desired number
       of chunks (rounded up, minimum chunk size 1): the desired chunk size.
    2. Working backwards from the fastest-varying axis, accumulate the running
       product of axis cardinalities until it reaches the desired chunk size.
       The axis where accumulation stops is the "stop" index, and the running
       product at that point is the *actual* chunk size.
    3. Axes faster-varying than the "stop" index are always taken at full
       extent; slower-varying axes are incremented as a multi-index ring to
       enumerate the start/end corners of each hyperslab.

    Notes:
        * The time index is a special case and is never chunked over.
        * The most significant index is the slowest varying (x[i0, ...]) and
          the least significant index is the fastest varying (x[..., iN]).
        * Chunks are advanced over slow-varying indices (keeps chunks
          contiguous in C order), while computation within a chunk traverses
          fast-varying indices (cache friendliness).
        * dask is deliberately not used (or at least forced out) because its
          configuration in PET — and possibly in general — is hard to pin down.
        * numpy.nditer with an external loop was rejected to preserve array
          structure; with at most ~1000 chunks any benefit would be minimal.

    FUTUREWORK: loaders should expose direct mechanisms to load particular
    data types rather than going through xarray; for now this class has no
    control over the data loader.
    """

    # --- helper methods ---
    @staticmethod
    def _b10_to_mi(b10: int, mi_size: list[int]) -> list[int]:
        """
        Convert a base-10 (integer, row-major ordinal) representation of a
        multi-index back into the multi-index.

        Args:
            b10: base-10 representation of the multi-index.
            mi_size: cardinality (size) of each index.

        Returns:
            The multi-index corresponding to ``b10``.
        """
        assert b10 >= 0
        assert all(x is not None and x >= 0 for x in mi_size)

        rem = b10  # remainder starts as the original base-10 value

        # Incrementing the most significant index shifts the hyperslab by the
        # product of the sizes of every index after it.  This running product
        # is the "base" of a given multi-index position; the least significant
        # index has a base of 1.
        mi_sizeshift = mi_size[1:] + [1]
        prod = functools.reduce(lambda x, y: x * y, mi_sizeshift)

        num_idx = len(mi_size)
        mi: list = [None] * num_idx  # multi-index to return

        for i, s in enumerate(mi_sizeshift):
            quo, rem = divmod(rem, prod)
            # fill the multi-index forwards (most significant first)
            mi[i] = quo
            # revert the most recent size from the base; the minimum base is 1
            prod = max(prod // s, 1)

        assert all(x is not None and x >= 0 for x in mi)
        assert len(mi) == len(mi_size)
        return mi

    @staticmethod
    def _mi_to_b10(mi: list[int], mi_size: list[int]) -> int:
        """
        Convert a multi-index into its base-10 (integer, row-major ordinal)
        representation.

        Args:
            mi: one index per dimension.
            mi_size: cardinality (size) of each index.
        """
        assert len(mi) == len(mi_size)
        assert all(x is not None and x >= 0 for x in mi)
        assert all(x is not None and x >= 0 for x in mi_size)

        prodscan = 1  # running product of sizes (base of current position)
        b10 = 0

        # the least significant position must be accumulated first, hence the
        # reversed iteration
        for ix, s in zip(mi[::-1], mi_size[::-1]):
            b10 += ix * prodscan
            prodscan *= s

        assert b10 >= 0
        return b10

    @staticmethod
    def _inc_mi(mi: list[int], mi_size: list[int], inc: int = 1) -> list[int]:
        """
        Increment a multi-index by ``inc`` (default 1).

        The fastest-varying (last) index is incremented first, which minimizes
        cache misses.  Implementation: convert to base 10, add ``inc``, then
        convert back.  Not the most efficient approach, but it does not need
        to be — chunk counts are hard capped at 1000.

        Raises:
            OverflowError: if the increment runs past the maximum sizes.
        """
        assert inc > 0
        assert len(mi) == len(mi_size)
        assert all(x is not None and x >= 0 for x in mi)
        assert all(x is not None and x >= 0 for x in mi_size)

        fn_b10 = functools.partial(PersistenceChunker._mi_to_b10, mi_size=mi_size)
        fn_b10_inv = functools.partial(PersistenceChunker._b10_to_mi, mi_size=mi_size)
        mi_next = fn_b10_inv(fn_b10(mi) + inc)

        if mi_next[0] >= mi_size[0]:
            raise OverflowError(
                f"PersistenceChunker: increment multindex - overflow {mi} + {inc} goes past the"
                f" maximum sizes: {mi_size}."
            )

        assert all(x is not None and 0 <= x < s for x, s in zip(mi_next, mi_size))
        return mi_next

    @staticmethod
    def _compute_greedy_chunksize(
        desired_numchunks: int, mi_size: list[int]
    ) -> tuple:
        """
        Greedy chunk-size calculation: prefers whole dimensions in a chunk
        over partial extents within a dimension.  This is the only chunking
        strategy expected to be used for the foreseeable future.

        TODO: return a structured type instead of a bare tuple.

        Returns:
            A tuple of
            1. actual chunk size,
            2. actual chunk count,
            3. the position (least significant) of the first index to use for
               incrementing chunks via multi-indexing, or ``None`` when the
               whole array fits into a single chunk.
        """
        assert desired_numchunks > 1

        total_size = functools.reduce(lambda x, y: x * y, mi_size)
        desired_chunksize = int(max(1, total_size // desired_numchunks))

        num_idx = len(mi_size)
        prodsize = 1
        actual_chunksize = None
        first_chunkindex = None

        for i, s in enumerate(mi_size[::-1]):
            if prodsize >= desired_chunksize:
                first_chunkindex = num_idx - i - 1
                actual_chunksize = prodsize
                break
            prodsize *= s

        # BUGFIX: this condition previously tested the undefined name
        # `first_index`, raising NameError whenever the loop completed
        # without breaking (the single-chunk case).  After a full pass,
        # `prodsize` has accumulated the entire array size.
        if first_chunkindex is None or actual_chunksize is None:
            actual_chunksize = prodsize

        actual_numchunks = total_size // actual_chunksize

        assert actual_chunksize >= desired_chunksize
        assert actual_numchunks <= desired_numchunks

        return (actual_chunksize, actual_numchunks, first_chunkindex)

    # --- object-specific methods continue below ---
""" +import multiprocessing from enum import StrEnum, auto -from dataclasses import dataclass +from dataclasses import dataclass, field from collections.abc import Callable from contextlib import contextmanager from typing import Union, Generic @@ -12,247 +13,128 @@ import numpy as np import xarray as xr -# 50% sparsity is reasonable, though some data like precipitation may be more sparse than this -_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 +from persistence.types import PetDataArrayLike +from persistence.methods._impute import SimpleImpute +from persistence.interface._metadata import PersistenceMetadata +from persistence.interface._method import PersistenceMethod +from persistence.interface._chunker import PersistenceDataChunk # unlikely to have more than 1000 processes for persistence, due to diminishing returns - even on a # supercomputer _MAX_NUM_CHUNKS = 1000 -# subtype of np.mod that works only with ints. -_mod_index: Callable[[int, int], np.uint] = np.mod -_mod_index.__doc__ = """ -Maps negative integer to a positive integer element in a ring. The ring has a domain of -`[0, (cardinality - 1)]`. - -sample usage: get the positive equivilent index for an array. E.g. "-1" in python is the last -element, this will standardize it to len(.) - 1. The reason for doing this is to make sure index -comparisons are accurately represented. -""" -class PersistenceMethod(StrEnum): +# TODO: no implementation yet +@dataclass +class PersistenceComputePool: """ - Methods to use for persistence. + Generates a compute pool by creating a chunker and assigning chunks to various workers that + compute the specified persistence methods. - MEDIAN_OF_THREE: - computes the median of the three most recent observations. - fallback = MOST_RECENT + Takes an array, and persistence metadata. This is internal and built to be used with + PetDataset.map_each_var. - MOST_RECENT: - uses the most-recent value as persistence. + 1. make chunker as per PersistenceChunker + 2. 
retrieve chunks from PersistenceChunker.generate_chunks + 3. create a job_wrapper over PersistenceCompute to run each PersistenceDataChunk object against + the persistence method to generate the output chunk + """ - If there are nans, previous observations are used instead, up until the `max_lookback` threshold - as determined by the "sparsity_multiplier". + da: xr.DataArray + metadata: PersistenceMetadata - E.g. if the sparsity multiplier was "3" i.e. 66.67% of the data is `nan`, median of three - (which needs exactly 3 non-nan values) will look for non-nan values up to 9 indices prior in - order to fill any missing values. + @staticmethod + def _job_wrapper(chunk: PersistenceDataChunk) -> PetDataArrayLike: + pass - For now the imputation is kept simple and uses `mean` for speed reasons. e.g. for median of 3 - with a sparsity multiplier of 3, this would be the mean over a 9 by N slab - with N being the - cardinality of the remaining dimensions in a give data array or chunk. + def _make_chunker(self): + pass - FUTUREWORK: - Simpler imputations work better with complex learning models. Given that the persistence - models are not at all complex, a clustering algorithm like KNN (by using e.g. kd-trees, for - multiple dimensions) would work better. But this is out of scope for now. - """ + def compute_chunks(self) -> PetDataArrayLike: + pass - MOST_RECENT = "most_recent" - MEDIAN_OF_THREE = "median_of_three" - def flatten_non_temporal(self) -> bool: - """ - Whether or not to flatten non-temporal dimensions during computation. - For methods that do not need spatial dependence, this makes chunking a lot easier. 
- """ - _default = False - match self: - case PersistenceMethod.MOST_RECENT: - return True - case PersistenceMethod.MEDIAN_OF_THREE: - return True +# TODO: the variable references are not right - need to use self.metadata +@dataclass +class PersistenceCompute: + arr: PetDataArrayLike + metadata: PersistenceMetadata + + def _method_impl(self, arr_preprocessed: PetDataArrayLike) -> PetDataArrayLike: + match self.backend: + case PersistenceMethod.NUMPY: + return self._method_impl_numpy(arr_preprocessed) + case PersistenceMethod.NUMBA: + return self._method_impl_numba(arr_preprocessed) + case PersistenceMethod.RUST: + return self._method_impl_rust(arr_preprocessed) case _: - raise NotImplementedError( - "PersistenceMethod: Invalid persistence method." - ) - return _default + raise NotImplementedError("PersistenceCompute: Unknown backend") - def num_time_indices_required(self) -> int: - """ - number of time indices required for computing a particular method - """ - match self: - case PersistenceMethod.MOST_RECENT: - return 1 + def _method_impl_numpy(self, arr_preprocessed) -> PetDataArrayLike: + match self.method: case PersistenceMethod.MEDIAN_OF_THREE: - return 3 + return _median_of_three_numpy(arr_preprocessed, self.idx_time) + case PersistenceMethod.MOST_RECENT: + raise NotImplementedError("TODO") case _: raise NotImplementedError( - "PersistenceMethod: Invalid persistence method." + f"PersistenceCompute: compute method {self.method} has not been implemented" ) - def min_lookback( - self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER - ): - """ - The minimum amount of lookback required to compute the corresponding metric. - By default we assume a 50% sparsity and require at least double the number of values - required for the compuation. 
- """ - if sparsity_multiplier < 1: - raise ValueError("PersistenceMethod: Sparsity multiplier must be >= 1") - - return self.num_time_indices_required() * sparsity_multiplier - - -@dataclass -class PersistenceDataChunk: - """ - Sample usage pattern: - 1. split dataset into chunks - 2. represent chunk as 'PersistenceDataChunk' - 3. perform computation - 4. grab output chunk and insert into destination dataset - - IMPORTANT: data should not be chunked over time. - """ - - # ndarray with a mandatory time axis. Sorted ascending in time. The latest data point is assumed - # to be the "reference" time. - # NOTE: this API may change in the future depending on how temporal indexing is handled in the - # pipeline. - arr_chunk: np.ndarray + def _method_impl_numba( + self, arr_preprocessed: PetDataArrayLike + ) -> PetDataArrayLike: + raise NotImplementedError("numba backend is not supported") - # the time axis - this will be flattened - idx_time: int + def _method_impl_rust(self, arr_preprocessed) -> PetDataArrayLike: + raise NotImplementedError("rust backend is not supported") - # the method to use to calculate persistence - method: PersistenceMethod + def _slice(self) -> PetDataArrayLike: + # slice lookback data + len_time = arr.shape[idx_time] + if len_time < self.num_lookback: + raise ValueError( + "PersistenceCompute: input data does not have enough historical time indices to compute this method." + ) -@dataclass -class PersistenceChunker: - """ - Takes a lazy-loaded xarray dataarray (or similar) and slices it into chunks, processing them - either serially or in parallel. The result is merged into the full field. - - Chunking will happen over non-temporal dimensions as they are generally invariant to the - persistence computations (and even if they have spatial dependence this dependence is very - local in nature compared to the full field size). 
- - The process is: - - pivot on time dimension - -> chunk on non-time dimension - -> compute persistence for each chunk, aggregating over time - -> reshape into output form - - IMPORTANT: This method may not always speed things up or use less memory. It entirely depends on - the underlying data source being stored/chunked at rest, and how xarray decides to retrieve it. - - FUTUREWORK: usage of rust and/or parquet for dataloading and intermediate caching will be - explored in order to speed up this process. - """ - - # lazy loaded data array - da_lazy: xr.DataArray + idx_end = len_time + idx_start = idx_end - self.num_lookback + idx_slice = slice(idx_start, idx_end, 1) # start, end, step - # the method - determines how much data needs to be loaded - method: PersistenceMethod + # generator for nd-index slicing + idx_all = slice(None, None, None) + nd_slice = ( + idx_slice if i == idx_time else idx_all for i in range(len(arr.shape)) + ) - # number of chunks to use - num_chunks: int + # sliced array that only has the latest 3 values + arr_slice = arr[*tuple(nd_slice)] - # axis index for time - idx_time: int = None + return arr_slice - # axis index for chunk - idx_chunk: int = None + def _impute(self, arr_sliced) -> PetDataArrayLike: + # NOTE: only simple impute is currently supported + if self.do_impute: + imputer = SimpleImpute(arr_sliced) + return imputer.impute_mean() - # the time dimension name normally "time" - dimname_time: str = None + # default - do nothing + return arr_sliced - # the dimension name to chunk along, or default to a non-time - dimname_chunk: str = None + def compute(self) -> PetDataArrayLike: + # check backend support + self.metadata.backend.check_support() - def __post_init__(self): - # --- handle time dimension --- - if self.idx_time is None: - if self.dimname_time not in self.da_lazy.dims: - raise KeyError( - f"PersistenceChunker: time dimension {self.dimname_time} not found in input array" - ) - self.idx_time = 
self.da_lazy.dims.index(self.dimname_time) - - # --- handle chunk dimension --- - if self.idx_chunk is None: - if self.dimname_chunk is None: - # --- default chunk dimension --- - # attempt to choose the previous index as the chunk index, so that it doesn't - # overlap with the time index. modulo operation is used so that negative indices - # are cycled. - self.idx_chunk = _modidx(self.idx_time - 1, self.da_lazy.dims) - dimkeys = self.dims.keys() - self.dimname_chunk = dimkeys[self.idx_chunk] - else: - # --- check and update chunk dimension --- - if self.dimname_chunk not in self.da_lazy.dims: - raise KeyError( - f"PersistenceChunker: chunk dimension {self.dimname_chunk} not found in input array" - ) - self.idx_chunk = self.da_lazy.dims.index(self.dimname_chunk) - - # --- check chunk/time index compatibilty --- - if self.idx_time == self.idx_chunk: - raise ValueError("PersistenceChunker: cannot chunk over time dimension") - - # --- check chunk size --- - if self.num_chunks < 1: - raise ValueError( - "PersistenceChunker: number of chunks must be greater than or equal to 1" - ) + # slice: to num_lookback indices + arr_sliced: PetDataArrayLike = self._slice(self.arr) - if self.num_chunks > self.da_lazy.shape[self.idx_chunk]: - raise ValueError( - "PersistenceChunker: num_chunks must be less than the axis length" - ) + # preprocess: currently just (maybe) impute + arr_preprocessed: PetDataArrayLike = self._impute(arr_sliced) - if self.num_chunks > _MAX_NUM_CHUNKS: - raise ValueError( - f"PersistenceChunker: num_chunks is too large. 
Must be <{_MAX_NUM_CHUNKS}" - ) + # compute: using specified persistence method and preprocessed array + arr_persist: PetDataArrayLike = self._method_impl(arr_preprocessed) - # safety (tests only): check that indices are appropriately set - assert self.idx_chunk is not None - assert self.idx_time is not None - - # safety: these are usually handled by the underlying compute indexers, - # which will spit out an error if the user inputs the wrong index. - assert self.idx_time in self.da_lazy - assert self.idx_chunk in self.da_lazy - - def generate_chunks(self): - """ - Generator that extracts chunks from the underlying data array. - sample application: - - minimize memory bloat (lazy loading chunks) - - better utilization of CPU by splitting embarassingly parallel dimensions across processes - (multi processing) - """ - - # --- iterate and split chunks --- - chunk_counter = 0 - while chunk_counter < self.num_chunks: - slice_chunk = self._get_chunk_slice(chunk_counter) - slice_time = self._get_time_slice(chunk_counter) - # --- yield reference to array --- - # still lazy at this point, until it is loaded and moved/copied into the forked process. - yield self.da_lazy.isel( - { - self.dimname_time: slice_time, - self.dimname_chunk: slice_chunk, - } - ) - chunk_counter += 1 + return arr_persist diff --git a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py new file mode 100644 index 00000000..0086a0da --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass, field +from multiprocessing import cpu_count +from persistence.interface._backends import PersistenceBackendType +from persistence.interface._method import PersistenceMethod + +@dataclass +class PersistenceMetadata: + """ + Reference to common data that is passed around during persistence computations. 
+ """ + idx_time_dim: int # index of time dimension + method: PersistenceMethod # persistence method to use + + # --- (kw)args with defaults --- + num_workers: int = field(default_factory=cpu_count) + + # --- + # NOTE: + # + # A hyperslab/cube is bound by orthogonal hyperplanes, each with its surface parallel to + # a unique axis or dimension. In our case a hyperslab is a chunk. + # + # The above constraint simplifies retrieval of chunks, without needing to flatten or change + # the underlying data structure. On the other hand, the constraint makes it harder to + # accomodate every possible chunk size/count. + # + # Therefore, the number of chunks requested by the user is a desire, not a guarentee. + # The actual chunksize is computed at runtime, and depends on the data shape. + # + # The runtime algorithm must abide by the constraints of hyperslab selection while choosing a + # chunk size that is close to the desired chunk size. + desired_num_chunks: int = 1 + # --- + + do_impute: bool = True + backend: PersistenceBackendType = PersistenceBackendType.NUMPY + + # --- chunk count and size: these are mutable --- + num_chunks: int | None = None + chunk_size: int | None = None diff --git a/packages/bundled_models/persistence/src/persistence/interface/_method.py b/packages/bundled_models/persistence/src/persistence/interface/_method.py new file mode 100644 index 00000000..8845535f --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/_method.py @@ -0,0 +1,53 @@ +from enum import StrEnum, auto + +# 50% sparsity is reasonable, though some data like precipitation may be more sparse than this +_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 + +class PersistenceMethod(StrEnum): + """ + Methods to use for persistence. + + MEDIAN_OF_THREE: + computes the median of the three most recent observations. + + MOST_RECENT: + uses the most-recent value as persistence. 
+ + Additionally, num_lookback is used to determine how many indices in the past are required from a + dataslab in order to compute a persistence method. + + This is determined by the actual number of indices required multiplied by a sparsity factor to + account for missing values. Missing values will optionally be imputed. + """ + + MOST_RECENT = "most_recent" + MEDIAN_OF_THREE = "median_of_three" + UNKNOWN = auto() + + def num_time_indices_required(self) -> int: + """ + number of time indices required for computing a particular method + """ + match self: + case PersistenceMethod.MOST_RECENT: + return 1 + case PersistenceMethod.MEDIAN_OF_THREE: + return 3 + case _: + raise NotImplementedError( + "PersistenceMethod: Invalid persistence method." + ) + + def min_lookback( + self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER + ): + """ + The minimum amount of lookback required to compute the corresponding metric. + By default we assume a 50% sparsity and require at least double the number of values + required for the compuation. 
+ """ + if sparsity_multiplier < 1: + raise ValueError("PersistenceMethod: Sparsity multiplier must be >= 1") + + return self.num_time_indices_required() * sparsity_multiplier + diff --git a/packages/bundled_models/persistence/tests/test__interface.py b/packages/bundled_models/persistence/tests/test__interface.py index 8d58563b..e568bc0e 100644 --- a/packages/bundled_models/persistence/tests/test__interface.py +++ b/packages/bundled_models/persistence/tests/test__interface.py @@ -29,63 +29,134 @@ def test_persistence_data_chunk_obj(): arr_chunk = np.random.randint(0, 10, (2, 5, 8)) persistence_method = pet_persist.PersistenceMethod.MOST_RECENT idx_time: int = 1 # len = 5 + + metadata = pet_persist.PersistenceMetadata( + idx_time_dim=idx_time, + method=persistence_method, + ) + datachunk = pet_persist.PersistenceDataChunk( arr_chunk=arr_chunk, - idx_time=idx_time, - method=persistence_method, + metadata=metadata, ) - assert datachunk.arr_chunk.shape.index(5) == datachunk.idx_time - assert datachunk.method.min_lookback() == 2 + + assert datachunk.arr_chunk.shape.index(5) == datachunk.metadata.idx_time_dim + assert datachunk.metadata.method.min_lookback() == 2 def test_persistence_chunker_obj(): """ Basic test to check object creation: PersistenceChunker """ - # --- index variant --- da = xr.DataArray( np.random.randint(0, 10, (2, 5, 8)), dims=["x0", "time", "x2"], ) idx_time: int = 1 # len = 5 - idx_chunk: int = 2 # len = 8 num_chunks: int = 4 # each chunk is 2x5x2 persistence_method = pet_persist.PersistenceMethod.MOST_RECENT - - chunker = pet_persist.PersistenceChunker( - da_lazy=da, - idx_time=idx_time, - idx_chunk=idx_chunk, - num_chunks=num_chunks, + metadata = pet_persist.PersistenceMetadata( + idx_time_dim=idx_time, method=persistence_method, + num_chunks=num_chunks, + ) + chunker = pet_persist.PersistenceChunker( + da=da, + metadata=metadata, ) # sense checks - assert da.shape.index(5) == chunker.idx_time - assert da.shape.index(8) == chunker.idx_chunk - 
assert chunker.num_chunks == 4 - assert chunker.method.num_time_indices_required() == 1 + assert da.shape.index(5) == chunker.metadata.idx_time_dim + assert chunker.metadata.num_chunks == 4 + assert chunker.metadata.method.num_time_indices_required() == 1 - # --- name variant --- +def test_chunker_multi_index_increment(): + """ + Tests the scenario in the docstrings for mult index increment + + i.e. + shape = (2, 4, 10, 2) + chunk_size = 47 (or increment size) + + Also does a double increment and a manual isel on the dataarray to make sure the sizes are as + expected. + + For this particular purpose we shall include a dummy dimension - time and it should be ignored. + + (2, 4, 5*, 10, 2) + + * time dimension + + as per the doc string example we expect giving a start index of all zeros and a increment (chunk + size) of 47, the next index we should receive is: + + (0, 2, 5*, 3, 1) + """ da = xr.DataArray( - np.random.randint(0, 10, (2, 5, 8)), - dims=["x0", "time", "x2"], + np.random.randint(0, 10, (2, 4, 5, 10, 2)), + dims=["x0", "x1", "time", "x3", "x4"], ) - dimname_time: str = "time" # len = 5 - dimname_chunk: str = "x2" # len = 8 - num_chunks: int = 4 # each chunk is 2x5x2 - persistence_method = pet_persist.PersistenceMethod.MOST_RECENT + idx_time: int = 2 + chunk_size: int = 47 - chunker = pet_persist.PersistenceChunker( - da_lazy=da, - dimname_time=dimname_time, - dimname_chunk=dimname_chunk, - num_chunks=num_chunks, + # NOTE: num_chunks is a dummy and not used since we want to explicitly test "47" + # still we set it abnormally high here to check that it is clipped to the data cardinality + # appropriately. 
+ num_chunks: int = 999 + + persistence_method = pet_persist.PersistenceMethod.MOST_RECENT + metadata = pet_persist.PersistenceMetadata( + idx_time_dim=idx_time, method=persistence_method, + num_chunks=999, + ) + chunker = pet_persist.PersistenceChunker( + da=da, + metadata=metadata, ) - # sense checks - assert da.shape.index(5) == chunker.idx_time - assert da.shape.index(8) == chunker.idx_chunk - assert chunker.num_chunks == 4 - assert chunker.method.num_time_indices_required() == 1 + assert chunker.metadata.num_chunks == 2 * 4 * 10 * 2 + + start_index = (0, 0, 0, 0, 0) + end_index = chunker.increment_multi_index(start_index, chunk_size) + + assert end_index == [0, 2, 5, 3, 1] + + # check slicing + np_start_index = np.asarray(list(start_index)) + np_end_index = np.asarray(end_index) + 1 + + # assert xarray dataarray dims returns a tuple (since tuples are ordered sets) + assert isinstance(da.dims, tuple) + + dim_names = list(da.dims) + multi_slice = { + dim_names[i]: slice(v[0], v[1], 1) + for v, i in enumerate(zip(np_start_index, np_end_index)) + } + da_slice = da.isel(**multi_slice) + import pdb; pdb.set_trace() + da_slice.shape + + + + + + +def test_chunker_multi_index_increment_with_single_dim(): + """ + Tests multi index increment for the case where there is only a single dimension This should + return the entire array back as-is since there can only be one dimension in this case and that + dimension cannot be chunked - i.e. time + """ + pass + +def test_chunker_multi_index_increment_unit_cardinality(): + """ + Tests multi index increment for the case where there are multiple indices but the indices all + have a cardinality of 1 => we can only have one chunk, regardless of what we set num_chunks to. 
+ """ + # set num_chunks to 10 arbitrarily + + # chunks should be trimmed to min(10, np.prod(all_dims_except_time) => 1) = 1 + pass From 5e9a02963b6516f04a613a3721b17ccfad6a65c1 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Fri, 27 Feb 2026 13:54:35 +1100 Subject: [PATCH 17/28] [skip ci] work in progress implementation of chunker - partially tested --- .../src/persistence/interface/_backends.py | 1 + .../src/persistence/interface/_chunker.py | 124 +++++++++++++++--- .../src/persistence/interface/_interface.py | 1 - .../src/persistence/interface/_metadata.py | 8 +- .../src/persistence/interface/_method.py | 2 +- 5 files changed, 113 insertions(+), 23 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_backends.py b/packages/bundled_models/persistence/src/persistence/interface/_backends.py index 58d8d0b7..cdea879a 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_backends.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_backends.py @@ -1,5 +1,6 @@ from enum import StrEnum, auto + class PersistenceBackendType(StrEnum): """ Supported compute backends. 
diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index 5c95fc40..a92d7602 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -3,17 +3,33 @@ import xarray as xr import functools +from typing import Generator + from persistence.interface._metadata import PersistenceMetadata from persistence.types import PetDataArrayLike +@dataclass +class PersistenceChunkInfo: + num_chunks: int + size_chunk: int + + # --- + # least significant chunk index (fastest varying), most significant is 0, indices are + # incremented from least significant (fast) to most significant (slow) + lsi_chunk: int + # --- + + @dataclass class PersistenceDataChunk: """ Container to hold a reference to the persistence metadata. """ - metadata: PersistenceMetadata + arr_chunk: PetDataArrayLike + metadata: PersistenceMetadata + chunk_info: PersistenceChunkInfo @dataclass @@ -23,7 +39,7 @@ class PersistenceChunker: The chunking algorithm is as follows: - + Divide the total size (product of the data shape) by the desired number of chunks (rounded up, min chunk size = 1). This is the desired chunk size. @@ -47,7 +63,7 @@ class PersistenceChunker: than the "stop" index are always at their cardinality (max size), and slower varying indices are incremented and used for selection. Increments are over the fastest of the slowest varying index (i.e. fastest most significant index). - + Note: the time index is a special case and should be ignored. Note: the most significant index is the slowest varying index and the least significant @@ -62,7 +78,7 @@ class PersistenceChunker: computation. Which is why we chunk with slower varying indices and compute with faster varying indices (with whatever backend of choice). 
- Note: The reason why dask isn't used (or at least forced into synchronous mode), + Note: The reason why dask isn't used (or at least forced into synchronous mode), is because its configuration in PET (but possibly in general) is hard to pin down. Note: we could have used numpy.nditer with a external loop, but we would like to keep the @@ -73,7 +89,10 @@ class PersistenceChunker: types of data rather than xarray. For now this class has no control over the data loader. """ - # --- helper methods --- + da: xr.DataArray + metadata: PersistenceMetadata + chunk_info: PersistenceChunkInfo | None = None + @staticmethod def _b10_to_mi(b10: int, mi_size: list[int]) -> list[int]: """ @@ -82,9 +101,9 @@ def _b10_to_mi(b10: int, mi_size: list[int]) -> list[int]: 2. a list of the cardinality of each index (size of each index) convert the base10 representation of a multiindex back to a multiindex. """ - assert b10 >= 0 + assert b10 >= 0 assert all([x is not None and x >= 0 for x in mi_size]) - + rem = b10 # set remainder to the orignal base10 value # incrementing the most significant shifts the hyperslab by the product of the size of every @@ -133,7 +152,7 @@ def _mi_to_b10(mi: list[int], mi_size: list[int]) -> int: # update product with latest size prodscan *= s - assert b10 >= 0 + assert b10 >= 0 return b10 @staticmethod @@ -147,7 +166,7 @@ def _inc_mi(mi: list[int], mi_size: list[int], inc=1) -> list[int]: Convert multi index to base10, then add 1 to base10 value (or inc if specified) - trivial increment, then - convert back to multiindex + convert back to multiindex """ assert inc > 0 assert len(mi) == len(mi_size) @@ -164,11 +183,16 @@ def _inc_mi(mi: list[int], mi_size: list[int], inc=1) -> list[int]: f" maximum sizes: {mi_size}." 
) - assert all([x is not None and x >= 0 and x < s for x, s in zip(mi_next, mi_size)]) + assert all( + [x is not None and x >= 0 and x < s for x, s in zip(mi_next, mi_size)] + ) return mi_next @staticmethod - def _compute_greedy_chunksize(desired_numchunks: int, mi_size: list[int]) -> int: + def _compute_chunkinfo_greedy( + desired_numchunks: int, + mi_size: list[int], + ) -> PersistenceChunkInfo: """ This is a greedy chunksize calculation, because it prefers having entire dimensions as part of a chunk rather than partial extents in a dimension. Although this is the only chunking @@ -181,11 +205,14 @@ def _compute_greedy_chunksize(desired_numchunks: int, mi_size: list[int]) -> int 2. the position (least significant) of the first index that should be be used for incrementing chunks (using multi-indexing) """ - assert desired_numchunks > 1 + assert desired_numchunks >= 1 + + if isinstance(mi_size, tuple): + mi_size = list(mi_size) total_size = functools.reduce(lambda x, y: x * y, mi_size) desired_chunksize = int(max(1, total_size // desired_numchunks)) - + num_idx = len(mi_size) prodsize = 1 actual_chunksize = None @@ -199,16 +226,81 @@ def _compute_greedy_chunksize(desired_numchunks: int, mi_size: list[int]) -> int prodsize *= s # single chunk - if first_index is None or actual_chunksize is None: + if first_chunkindex is None or actual_chunksize is None: actual_chunksize = prodsize actual_numchunks = 1 + first_chunkindex = num_idx - 1 actual_numchunks = total_size // actual_chunksize assert actual_chunksize >= desired_chunksize assert actual_numchunks <= desired_numchunks - return (actual_chunksize, actual_numchunks, first_chunkindex) + return PersistenceChunkInfo( + num_chunks=actual_numchunks, + size_chunk=actual_chunksize, + lsi_chunk=first_chunkindex, + ) + + def __post_init__(self): + # --- + # suppress time index for calculations. 
+ # + # NOTE: + # + # Expanding an array by one dimension with a dimensionality 1 for example has no impact on + # the chunk sizes - this is equivilent to squeezing or removing the dimension. + # + # 0 is not right here, since that'd imply a empty slice. + _shape_notime = list(self.da.shape) + _shape_notime[self.metadata.idx_time_dim] = 1 + # --- + + self.chunk_info = self._compute_chunkinfo_greedy( + self.metadata.num_chunks_desired, + _shape_notime, + ) + + def generate_chunks(self) -> Generator[PersistenceDataChunk]: + """ + Evaluate chunks by loading each chunk into memory, the chunks are lazily loaded but eagerly + evaluated in memory in the backend. Chunks should ideally be contiguous in memory. (Except + for time). + This generator generally would be fed into a multiprocessing worker pool in conjunction with + a method to process each chunk. + """ + _shape_notime = list(self.da.shape) + _shape_notime[self.metadata.idx_time_dim] = 1 + mi_prev = [0 for _ in _shape_notime] + + for i in enumerate(_shape_notime): + mi_next = _inc_mi(_shape_notime) + + # safety: don't want assume sets or dict keys because they may be unordered (depending + # on the version of python). if this behaviour of xarray changes this will be caught in + # tests. + assert isinstance(da.dims, tuple) or isinstance(da.dims, list) + + # use hyperslab selection for dimensions other than time. The time dimension is never + # chunked and selected as a whole. 
+ arr_chunk = da.isel( + { + da.dims[i]: ( + slice(mi_prev[i], mi_next[i] + 1, 1) + if i != self.metadata.idx_time_dim + else slice(0, da.shape[self.metadata.idx_time_dim]) + ) + for i in range(len(_shape_notime)) + } + ).values + + # pass chunk + yield PersistenceDataChunk( + arr_chunk=arr_chunk, + metadata=self.metadata, + chunk_info=self.chunk_info, + ) - # --- object-specific methods --- + # explicit copy + mi_prev = [x for x in mi_next] diff --git a/packages/bundled_models/persistence/src/persistence/interface/_interface.py b/packages/bundled_models/persistence/src/persistence/interface/_interface.py index 5b27440c..aa293708 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_interface.py @@ -24,7 +24,6 @@ _MAX_NUM_CHUNKS = 1000 - # TODO: no implementation yet @dataclass class PersistenceComputePool: diff --git a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py index 0086a0da..adb3b2c3 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py @@ -3,11 +3,13 @@ from persistence.interface._backends import PersistenceBackendType from persistence.interface._method import PersistenceMethod + @dataclass class PersistenceMetadata: """ Reference to common data that is passed around during persistence computations. """ + idx_time_dim: int # index of time dimension method: PersistenceMethod # persistence method to use @@ -29,12 +31,8 @@ class PersistenceMetadata: # # The runtime algorithm must abide by the constraints of hyperslab selection while choosing a # chunk size that is close to the desired chunk size. 
- desired_num_chunks: int = 1 + num_chunks_desired: int = 1 # --- do_impute: bool = True backend: PersistenceBackendType = PersistenceBackendType.NUMPY - - # --- chunk count and size: these are mutable --- - num_chunks: int | None = None - chunk_size: int | None = None diff --git a/packages/bundled_models/persistence/src/persistence/interface/_method.py b/packages/bundled_models/persistence/src/persistence/interface/_method.py index 8845535f..02708377 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_method.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_method.py @@ -3,6 +3,7 @@ # 50% sparsity is reasonable, though some data like precipitation may be more sparse than this _DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER = 2 + class PersistenceMethod(StrEnum): """ Methods to use for persistence. @@ -50,4 +51,3 @@ def min_lookback( raise ValueError("PersistenceMethod: Sparsity multiplier must be >= 1") return self.num_time_indices_required() * sparsity_multiplier - From 5aaafce6723f4233b6f815808fed1f4d7119b99b Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Fri, 27 Feb 2026 21:40:32 +1100 Subject: [PATCH 18/28] [skip ci] wip test chunker works as expected for general scenarios --- .../src/persistence/interface/_chunker.py | 59 ++++---- .../tests/interface/test__chunker.py | 139 ++++++++++++++++++ .../persistence/tests/test__interface.py | 16 +- 3 files changed, 176 insertions(+), 38 deletions(-) create mode 100644 packages/bundled_models/persistence/tests/interface/test__chunker.py diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index a92d7602..baf38fad 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -198,8 +198,7 @@ def _compute_chunkinfo_greedy( of a chunk rather than partial 
extents in a dimension. Although this is the only chunking strategy that will be conceivably used in the near future. - TODO: make this a Structure - Returns a tuple containing + Returns a structure (PersistenceChunkInfo) containing 1. actual chunk size 3. actual chunk count 2. the position (least significant) of the first index that should be be used for @@ -229,7 +228,7 @@ def _compute_chunkinfo_greedy( if first_chunkindex is None or actual_chunksize is None: actual_chunksize = prodsize actual_numchunks = 1 - first_chunkindex = num_idx - 1 + first_chunkindex = 0 actual_numchunks = total_size // actual_chunksize @@ -243,22 +242,23 @@ def _compute_chunkinfo_greedy( ) def __post_init__(self): + # safety: don't want assume sets or dict keys because they may be unordered (depending + # on the version of python). if this behaviour of xarray changes this will be caught in + # tests. + assert isinstance(self.da.dims, tuple) or isinstance(self.da.dims, list) # --- # suppress time index for calculations. - # # NOTE: - # # Expanding an array by one dimension with a dimensionality 1 for example has no impact on # the chunk sizes - this is equivilent to squeezing or removing the dimension. - # - # 0 is not right here, since that'd imply a empty slice. - _shape_notime = list(self.da.shape) - _shape_notime[self.metadata.idx_time_dim] = 1 + # Forcing to 0 is not right here, since that'd imply a empty slice. + shape_notime = list(self.da.shape) + shape_notime[self.metadata.idx_time_dim] = 1 # --- self.chunk_info = self._compute_chunkinfo_greedy( self.metadata.num_chunks_desired, - _shape_notime, + shape_notime, ) def generate_chunks(self) -> Generator[PersistenceDataChunk]: @@ -270,28 +270,24 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: This generator generally would be fed into a multiprocessing worker pool in conjunction with a method to process each chunk. 
""" - _shape_notime = list(self.da.shape) - _shape_notime[self.metadata.idx_time_dim] = 1 - mi_prev = [0 for _ in _shape_notime] - - for i in enumerate(_shape_notime): - mi_next = _inc_mi(_shape_notime) - - # safety: don't want assume sets or dict keys because they may be unordered (depending - # on the version of python). if this behaviour of xarray changes this will be caught in - # tests. - assert isinstance(da.dims, tuple) or isinstance(da.dims, list) - + shape_notime = list(self.da.shape) + shape_notime[self.metadata.idx_time_dim] = 1 + shape_notime_trimmed = shape_notime[:(self.chunk_info.lsi_chunk + 1)] + mi_inc = [0 for _ in shape_notime_trimmed] + pcr = PersistenceChunker + num_chunks = self.chunk_info.num_chunks + + for _ in range(self.chunk_info.num_chunks): # use hyperslab selection for dimensions other than time. The time dimension is never # chunked and selected as a whole. - arr_chunk = da.isel( + arr_chunk = self.da.isel( { - da.dims[i]: ( - slice(mi_prev[i], mi_next[i] + 1, 1) - if i != self.metadata.idx_time_dim - else slice(0, da.shape[self.metadata.idx_time_dim]) + self.da.dims[i]: ( + slice(mi_inc[i], mi_inc[i] + 1, 1) + if i < self.chunk_info.lsi_chunk + 1 and i != self.metadata.idx_time_dim + else slice(0, self.da.shape[i]) ) - for i in range(len(_shape_notime)) + for i in range(len(shape_notime)) } ).values @@ -302,5 +298,8 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: chunk_info=self.chunk_info, ) - # explicit copy - mi_prev = [x for x in mi_next] + # increment index and break if overflow is detected. 
+ try: + mi_inc = pcr._inc_mi(mi_inc, mi_size=shape_notime_trimmed) + except OverflowError: + break diff --git a/packages/bundled_models/persistence/tests/interface/test__chunker.py b/packages/bundled_models/persistence/tests/interface/test__chunker.py new file mode 100644 index 00000000..b12ba555 --- /dev/null +++ b/packages/bundled_models/persistence/tests/interface/test__chunker.py @@ -0,0 +1,139 @@ +import functools +import xarray as xr +import numpy as np + +import persistence.interface._chunker as _chunker +import persistence.interface._metadata as _metadata +import persistence.interface._method as _method + +_pcr = _chunker.PersistenceChunker +_pci = _chunker.PersistenceChunkInfo +_pdc = _chunker.PersistenceDataChunk +_pma = _metadata.PersistenceMetadata +_pmd = _method.PersistenceMethod + + +def test_generate_chunks_default(): + """ + default chunk count is 1, i.e. no chunks or the entire dataset is a single chunk, this should + give the same result as ..._single_large_chunk. + + This is a separate test because the default may change, but we still want to retain the test + below for a single large chunk + """ + + +def test_generate_chunks_common_usecases(): + """ + common usecases for chunking + + Assume we have a large data slab of dimensions + (3, 8, 10*, 5, 4) + + 10* => is time and should be ignored + + total size = 3 * 8 * 5 * 4 = 480 + + we test the following chunk sizes: + 2 => chunk start index = 3, chunk size = 4, chunk_shape = (1, 1, 10, 1, 4) + 15 => chunk start index = 1, chunk size = 20, chunk_shape = (1, 1, 10, 5, 4) + 21 => chunk start index = 0, chunk size = 160, chunk_shape = (1, 8, 10, 5, 4) + """ + arr_shape = [3, 8, 10, 5, 4] + arr_shape_notime = [v if i != 2 else 1 for i, v in enumerate(arr_shape)] + size_total = functools.reduce(lambda x, y: x * y, arr_shape_notime) + num_chunks = [size_total // 2, size_total // 15, size_total // 21] + exp_result = [ + (3, 4, [1, 1, 10, 1, 4]), + (2, 20, [1, 1, 10, 5, 4]), + (0, 160, [1, 8, 10, 5, 4]), 
+ ] + idx_time_dim = 2 + test_data = xr.DataArray(np.ones(arr_shape), dims=["x0", "x1", "t", "x2", "x3"]) + method = _pmd.MOST_RECENT # dummy + + for i, nchk in enumerate(num_chunks): + metadata = _pma(idx_time_dim=idx_time_dim, num_chunks_desired=nchk, method=method) + chunker = _pcr(da=test_data, metadata=metadata) + for data_chunk in chunker.generate_chunks(): + assert data_chunk.chunk_info.lsi_chunk == exp_result[i][0] + assert data_chunk.chunk_info.size_chunk == exp_result[i][1] + assert data_chunk.chunk_info.num_chunks == size_total // exp_result[i][1] + assert list(data_chunk.arr_chunk.shape) == exp_result[i][2] + + +def test_generate_chunks_single_large_chunk(): + """ + explicitly set chunk sizes = 1 + """ + pass + + +def test_generate_chunks_each_element_is_a_chunk(): + """ + exlicitly set num_chunks = total size + """ + pass + + +def test_generate_chunks_edge_cases(): + """ + - desired num chunks is less than 1 + - desired num chunks is greater than the max supported chunk size + """ + pass + + +def test_chunk_caculation_single_worker(): + """ + basic test of multiprocessing pool processing the generated chunks, but with a single worker. + This should work in most setups. + + TODO: copy the notes below to the compute pool - this is a temporary location + + NOTE: chunking only saves memory if num_chunks > num_workers. And that too only during + processing since we only load a fraction of the input array at a given time. + + NOTE: regardless, the final array will be joined in-memory, this is unavoidable unless each + worker writes straight to disk - which is out of scope. So the minimum memory usage will always + be greater than the size of the entire hypercube for a single time instance (persistence returns + 1 time point) + + + Alg: + + 1. get dimension order + 2. retrieve chunks (numpy arrays) + 3. perform compute + 4. join numy arrays -> will be of the form + + arr[x0, x1, x2, ..., t, ...] = slab + + OR + + arr[x0, x1, t, x2, ...] 
= slab + + here, x0, x1, x2 are the multi-indices that are incremented when filling in the slabs, + because the cardinality of t in the slabe is guarenteed to equal 1, both scenarios are + just as efficient, since it will not affect memory striding. (Usually if t is processed + as a whole, the first scenario is more efficient) + """ + + +# TODO: +# --- optional tests that are run only if the system can handle it --- +# @pytest.mark.skipif( +# mem < "1GiB", reason="system memory is not large enough to run test" +# ) +# def test_chunking_large_data_large_chunks(): +# """ +# skip if system does not have enough memory +# """ +# pass +# +# +# def test_multiprocessing_pool_ingest(): +# """ +# skip if system only has a single worker +# """ +# pass diff --git a/packages/bundled_models/persistence/tests/test__interface.py b/packages/bundled_models/persistence/tests/test__interface.py index e568bc0e..20626c89 100644 --- a/packages/bundled_models/persistence/tests/test__interface.py +++ b/packages/bundled_models/persistence/tests/test__interface.py @@ -30,7 +30,7 @@ def test_persistence_data_chunk_obj(): persistence_method = pet_persist.PersistenceMethod.MOST_RECENT idx_time: int = 1 # len = 5 - metadata = pet_persist.PersistenceMetadata( + metadata = pet_persist.PersistenceMetadata( idx_time_dim=idx_time, method=persistence_method, ) @@ -55,7 +55,7 @@ def test_persistence_chunker_obj(): idx_time: int = 1 # len = 5 num_chunks: int = 4 # each chunk is 2x5x2 persistence_method = pet_persist.PersistenceMethod.MOST_RECENT - metadata = pet_persist.PersistenceMetadata( + metadata = pet_persist.PersistenceMetadata( idx_time_dim=idx_time, method=persistence_method, num_chunks=num_chunks, @@ -70,6 +70,7 @@ def test_persistence_chunker_obj(): assert chunker.metadata.num_chunks == 4 assert chunker.metadata.method.num_time_indices_required() == 1 + def test_chunker_multi_index_increment(): """ Tests the scenario in the docstrings for mult index increment @@ -105,7 +106,7 @@ def 
test_chunker_multi_index_increment(): num_chunks: int = 999 persistence_method = pet_persist.PersistenceMethod.MOST_RECENT - metadata = pet_persist.PersistenceMetadata( + metadata = pet_persist.PersistenceMetadata( idx_time_dim=idx_time, method=persistence_method, num_chunks=999, @@ -135,12 +136,10 @@ def test_chunker_multi_index_increment(): for v, i in enumerate(zip(np_start_index, np_end_index)) } da_slice = da.isel(**multi_slice) - import pdb; pdb.set_trace() - da_slice.shape - - - + import pdb + pdb.set_trace() + da_slice.shape def test_chunker_multi_index_increment_with_single_dim(): @@ -151,6 +150,7 @@ def test_chunker_multi_index_increment_with_single_dim(): """ pass + def test_chunker_multi_index_increment_unit_cardinality(): """ Tests multi index increment for the case where there are multiple indices but the indices all From 3a76fc9a78e17d5a450335e60f09f293c5d3b8e1 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Sat, 28 Feb 2026 20:13:21 +1100 Subject: [PATCH 19/28] [skip ci] wip test chunker - minor bug fix --- .../src/persistence/interface/_chunker.py | 10 +++++--- .../tests/interface/test__chunker.py | 25 +++++++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index baf38fad..6303c795 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +import math import numpy as np import xarray as xr import functools @@ -210,7 +211,7 @@ def _compute_chunkinfo_greedy( mi_size = list(mi_size) total_size = functools.reduce(lambda x, y: x * y, mi_size) - desired_chunksize = int(max(1, total_size // desired_numchunks)) + desired_chunksize = int(max(1, math.ceil(total_size / desired_numchunks))) num_idx = len(mi_size) prodsize 
= 1 @@ -218,7 +219,7 @@ def _compute_chunkinfo_greedy( first_chunkindex = None for i, s in enumerate(mi_size[::-1]): - if prodsize >= desired_chunksize: + if prodsize >= desired_chunksize and s != 1: first_chunkindex = num_idx - i - 1 actual_chunksize = prodsize break @@ -272,7 +273,7 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: """ shape_notime = list(self.da.shape) shape_notime[self.metadata.idx_time_dim] = 1 - shape_notime_trimmed = shape_notime[:(self.chunk_info.lsi_chunk + 1)] + shape_notime_trimmed = shape_notime[: (self.chunk_info.lsi_chunk + 1)] mi_inc = [0 for _ in shape_notime_trimmed] pcr = PersistenceChunker num_chunks = self.chunk_info.num_chunks @@ -284,7 +285,8 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: { self.da.dims[i]: ( slice(mi_inc[i], mi_inc[i] + 1, 1) - if i < self.chunk_info.lsi_chunk + 1 and i != self.metadata.idx_time_dim + if i < self.chunk_info.lsi_chunk + 1 + and i != self.metadata.idx_time_dim else slice(0, self.da.shape[i]) ) for i in range(len(shape_notime)) diff --git a/packages/bundled_models/persistence/tests/interface/test__chunker.py b/packages/bundled_models/persistence/tests/interface/test__chunker.py index b12ba555..bcb1c11e 100644 --- a/packages/bundled_models/persistence/tests/interface/test__chunker.py +++ b/packages/bundled_models/persistence/tests/interface/test__chunker.py @@ -35,17 +35,24 @@ def test_generate_chunks_common_usecases(): total size = 3 * 8 * 5 * 4 = 480 we test the following chunk sizes: - 2 => chunk start index = 3, chunk size = 4, chunk_shape = (1, 1, 10, 1, 4) - 15 => chunk start index = 1, chunk size = 20, chunk_shape = (1, 1, 10, 5, 4) - 21 => chunk start index = 0, chunk size = 160, chunk_shape = (1, 8, 10, 5, 4) + - chunk start index = 3, chunksize = 4, chunkshape = (1, 1, 10, 1, 4) + - chunk start index = 1, chunksize = 20, chunkshape = (1, 1, 10, 5, 4) + - chunk start index = 0, chunksize = 160, chunkshape = (1, 8, 10, 5, 4) + + the desired chunks that can 
result in the above results are: + - 4 >= chunksize > 1, 120 <= numchunks < 480, choose 479 arbitrarily + - 20 >= chunksize > 4, 24 <= numchunks < 120, choose 24 arbitrarily + - 160 >= chunksize > 20, 3 <= numchunks < 24, choose 11 arbitrarily + + """ arr_shape = [3, 8, 10, 5, 4] arr_shape_notime = [v if i != 2 else 1 for i, v in enumerate(arr_shape)] size_total = functools.reduce(lambda x, y: x * y, arr_shape_notime) - num_chunks = [size_total // 2, size_total // 15, size_total // 21] + num_chunks = [479, 24, 11] exp_result = [ (3, 4, [1, 1, 10, 1, 4]), - (2, 20, [1, 1, 10, 5, 4]), + (1, 20, [1, 1, 10, 5, 4]), (0, 160, [1, 8, 10, 5, 4]), ] idx_time_dim = 2 @@ -53,7 +60,9 @@ def test_generate_chunks_common_usecases(): method = _pmd.MOST_RECENT # dummy for i, nchk in enumerate(num_chunks): - metadata = _pma(idx_time_dim=idx_time_dim, num_chunks_desired=nchk, method=method) + metadata = _pma( + idx_time_dim=idx_time_dim, num_chunks_desired=nchk, method=method + ) chunker = _pcr(da=test_data, metadata=metadata) for data_chunk in chunker.generate_chunks(): assert data_chunk.chunk_info.lsi_chunk == exp_result[i][0] @@ -130,8 +139,8 @@ def test_chunk_caculation_single_worker(): # skip if system does not have enough memory # """ # pass -# -# +# +# # def test_multiprocessing_pool_ingest(): # """ # skip if system only has a single worker From 0f6fb0b3e4500af23df068ae2a75439e849362ff Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Sat, 28 Feb 2026 23:28:24 +1100 Subject: [PATCH 20/28] [skip ci] wip abstract out computation pool and add some TODOs/documentation --- .../src/persistence/interface/_backend.py | 65 ++++++ .../src/persistence/interface/_backends.py | 29 --- .../src/persistence/interface/_chunker.py | 79 ++++++- .../src/persistence/interface/_compute.py | 210 ++++++++++++++++++ .../src/persistence/interface/_interface.py | 137 +----------- .../src/persistence/interface/_metadata.py | 2 +- .../tests/interface/test__chunker.py | 31 +-- 7 files changed, 356 
insertions(+), 197 deletions(-) create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_backend.py delete mode 100644 packages/bundled_models/persistence/src/persistence/interface/_backends.py create mode 100644 packages/bundled_models/persistence/src/persistence/interface/_compute.py diff --git a/packages/bundled_models/persistence/src/persistence/interface/_backend.py b/packages/bundled_models/persistence/src/persistence/interface/_backend.py new file mode 100644 index 00000000..997c6149 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/_backend.py @@ -0,0 +1,65 @@ +from enum import StrEnum, auto + + +class PersistenceBackendType(StrEnum): + """ + Enumeration of supported compute backends for persistence computations. + + --- + + SUPPORTED BACKENDS (as of 2026-02-28): + - NUMPY (20260228) + - others are WIP + + Note: "supported" implies that the backend is supported by the build system, it does not imply + that the particular persistence method itself is supported for that backend. + + --- + + Backends are configured at the "build" level in pyproject.toml, e.g. for rust this may be + maturin/pyO3, which usually handles most of the heavy lifting. + + numba might require certain system dependencies - e.g. llvm, to function since it requires + building on the fly. + + For C/zig this would involve using: + a. ziglang/zig-pypi to build the zig packages into wheels and running them on the fly using + sys.execute to execute the wheel as a module, building/running zig on-the-fly. Avoids + having to distribute the pre-built dependencies, but may not work well with specific + interfaces like `numpy`. + b. using setuptools-zig to build them into a "integrated" library and packaging the build + into the wheel/distribution + c. using cffi or ctypes. + + Methods a. and b. would require extending Python.h directly, and hence are preferrable, since + they don't involve foreign calls. Unlike numba, method a. 
exists for zig where jit compilation + can happen without dependency on additional system libraries. + + All of the above methods generally avoid (or at least have the ability to avoid) the need for + conda environments and are pretty light weight. + """ + + C = "c" + C_ZIG = "zig" + NUMBA = "numba" + NUMPY = "numpy" + RUST = "rust" + UNKNOWN = auto() + + def check_support(self): + """ + As per the module documentation, this method only tells you if a particular backend is + supported by the *build system*, it doesn't imply that the backend is useable for any given + method. + + Therefore, this check can and should be done as early as possible. Whereas method + compatiblilty will be checked later into the runtime but still early enough point in the + code, before attempting the computation. (see `PersistenceCompute` for more details) + """ + match self: + case PersistenceBackendType.NUMPY: + return + case _: + raise NotImplementedError( + f"PersistenceBackendType: {self} is not supported" + ) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_backends.py b/packages/bundled_models/persistence/src/persistence/interface/_backends.py deleted file mode 100644 index cdea879a..00000000 --- a/packages/bundled_models/persistence/src/persistence/interface/_backends.py +++ /dev/null @@ -1,29 +0,0 @@ -from enum import StrEnum, auto - - -class PersistenceBackendType(StrEnum): - """ - Supported compute backends. - - NOTE: only NUMPY is currently supported - """ - - NUMPY = "numpy" - NUMBA = "numba" - RUST = "rust" - UNKNOWN = auto() - - # overridable - def check_support(self): - """ - This check only guarentees that a particular computation mechanism or library is available. - Individual methods may still not support a particular backend and that check is handled at a - lower level. 
- """ - match self: - case PersistenceBackendType.NUMPY: - return - case _: - raise NotImplementedError( - f"PersistenceBackendType: {self} is not supported" - ) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index 6303c795..1a6960ea 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -9,35 +9,70 @@ from persistence.interface._metadata import PersistenceMetadata from persistence.types import PetDataArrayLike +# --- +# 1000 chunks is more than enough for most usecases. Persistence methods should not be using large +# amounts of historical data, and therefore should not need heavy chunking for data to fit in +# memory. +# +# If memory is an issue, this needs to be solved at a higher level where properties of the chunk +# strategy at the storage level are known and data can be optimally bounded (spatially or otherwise) +# before reaching the persistence chunker. +# +# Further the minimum memory usage is lower bounded by an entire single time slice of the of the +# data being processed since that is the output, and also is affected by the number of parallel +# workers used. +# +# Increasing chunk counts past a certain certain amount is therefore counter-productive. +_MAX_NUM_CHUNKS = 1000 +# --- + @dataclass class PersistenceChunkInfo: - num_chunks: int - size_chunk: int - # --- # least significant chunk index (fastest varying), most significant is 0, indices are # incremented from least significant (fast) to most significant (slow) lsi_chunk: int # --- + num_chunks: int + size_chunk: int + dim_names: list[str] @dataclass class PersistenceDataChunk: """ - Container to hold a reference to the persistence metadata. 
+ The reason this is a class is that, there could be more useful info here in the future such as + start/end slices, and the chunk identifier, but for now its just a shallow wrapper and + effectively a type alias. """ - arr_chunk: PetDataArrayLike - metadata: PersistenceMetadata - chunk_info: PersistenceChunkInfo + arr_chunk: np.ndarray @dataclass class PersistenceChunker: """ - The persistence chunker chunks a xarray dataarray as part of a generator (lazy). + The persistence chunker chunks a xarray dataarray and relays them using a generator (lazy). + + Important: + + This is not a general purpose chunker. It is tailored for persistence and has a critical + assumption that the time dimension will not be chunked during the computation (it may still + be chunked in storage - this is fine). The chunking strategy is also intentionally + simplistic and greedy. + Depending on the method we could require 1 historical entry or 200. Therefore, there is no + "optimal" choice of chunks and workers here, since the data is not guaranteed to be stored + optimally for every choice of persistence method. The reason why persistence is so much + different to other models, is because we aren't storing any weights everything is done + on-the-fly. + + Hence, if there are issues with memory, the solution should be at a higher level where the + chunking strategy of the stored data is known, and appropriately bounded or alternatively + prepared offline with a storage strategy conducive to persistence calculations, BEFORE being + passed into this chunker. This may introduce a storage burden, but is imperative for any + sort of baseline model that cannot rely on stored weights, to function. 
The chunking algorithm is as follows: @@ -193,6 +228,7 @@ def _inc_mi(mi: list[int], mi_size: list[int], inc=1) -> list[int]: def _compute_chunkinfo_greedy( desired_numchunks: int, mi_size: list[int], + dim_names: list[str], ) -> PersistenceChunkInfo: """ This is a greedy chunksize calculation, because it prefers having entire dimensions as part @@ -240,6 +276,7 @@ def _compute_chunkinfo_greedy( num_chunks=actual_numchunks, size_chunk=actual_chunksize, lsi_chunk=first_chunkindex, + dim_names=dim_names, ) def __post_init__(self): @@ -247,12 +284,31 @@ def __post_init__(self): # on the version of python). if this behaviour of xarray changes this will be caught in # tests. assert isinstance(self.da.dims, tuple) or isinstance(self.da.dims, list) + + # check for chunks + if ( + self.metadata.desired_numchunks < 1 + or self.metadata.desired_numchunks > _MAX_NUM_CHUNKS + ): + err_msg = f"specified num chunks is invalid, valid range: 0 < num chunks <= {_MAX_NUM_CHUNKS}" + raise ValueError(err_msg) + # --- # suppress time index for calculations. # NOTE: - # Expanding an array by one dimension with a dimensionality 1 for example has no impact on - # the chunk sizes - this is equivilent to squeezing or removing the dimension. - # Forcing to 0 is not right here, since that'd imply a empty slice. + # Expanding an array by one dimension with a dimensionality 1, for example, has no impact + # on the chunk size, since the retraction operation of squeezing out the dimension, of + # size 1, also does not affect chunk size. Therefore, to suppress a dimension we set its + # size to 1 or drop it. Forcing to 0 is not right here, since that'd result in a empty array. + # + # Since we want to preserve structure, we can't drop it so our only remaining option is to + # force the size to 1. 
+ # + # TODO: add a fast return for the special case below: + # The special case of this approach is if there is only one dimension (1-d array), + # however, this case is easily handled since the dimension has to be time. And because it + # cannot be chunked by definition of this class, the chunker should just spit out the + # entire array as a single chunk. shape_notime = list(self.da.shape) shape_notime[self.metadata.idx_time_dim] = 1 # --- @@ -260,6 +316,7 @@ def __post_init__(self): self.chunk_info = self._compute_chunkinfo_greedy( self.metadata.num_chunks_desired, shape_notime, + self.da.shape, ) def generate_chunks(self) -> Generator[PersistenceDataChunk]: diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py new file mode 100644 index 00000000..e72b4451 --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/_compute.py @@ -0,0 +1,210 @@ +import multiprocessing +from enum import StrEnum, auto +from dataclasses import dataclass, field +from collections.abc import Callable +from contextlib import contextmanager +from typing import Union, Generator + +import numpy as np +import xarray as xr + +from persistence.types import PetDataArrayLike +from persistence.methods._impute import SimpleImpute +from persistence.interface._metadata import PersistenceMetadata +from persistence.interface._method import PersistenceMethod +from persistence.interface._chunker import PersistenceDataChunk, PersistenceChunker +from persistence.interface._backend import PersistenceBackendType + + +# TODO: no implementation yet +@dataclass +class PersistenceComputePool: + """ + Generates a compute pool and uses the given chunk genarator along with the configured method to + perform the computations. + + Joins the chunks back together at completion according to the FIFO order. 
+ + Computation here happens at a lower structural level (numpy or chosen system backend). + + --- + + Algorithm (see compute_chunks): + + 1. retrieve chunks (numpy arrays) + 2. perform compute on each chunk depending on the persistence method + 3. join numy arrays -> will be of the form + + for i in nd-index: + + arr[x0, x1, x2, ..., t, ...] + = arr[x0, x1, x2, ...] + = slab + + OR + + arr[x0, x1, t, x2, ...] + = arr[x0, x1, 1, x2, ...] + = slab + + here, x0, x1, x2 are the multi-indices that are incremented when filling in the slabs. + + Because the persistence methods all reduce the time index to a cardinality of 1, both of + these scenarios are equally efficient. + 4. use the stored data-array information (shapes/dimnames) + + --- + + Important: + + - As per the rest of the persistence structures, the time dimension existing is crucial, and the + time dimension is what is aggregated over, and therfore not chunked. It is instead simpler to + act on, and chunk the embarassingly parallel independent dimensions (e.g. spatial dimensions). + + - Persistence computation is single-variate, it may in the future infer something from the + dimensionality, but it may not infer information from other variables. + + - In other words, coordinate information may be considered, but not other variables in a + provided dataset. Therefore, the absolute highest level structure returnable by this + computation a DataArray. + + - The reason for this is that multi-variate persistence models are an anti-pattern, since + persistence models inherently shouldn't do any inference, physics, or _parametric_ statistical + learning. Unparamaterized methods, i.e. methods that do NOT use knowledge of what the + coordinates or other variables represent - other than the trivial inference that they are + different dimensions and have a certain shape, are okay. 
+ + Future considerations: + + - There could be methods in the future that aggregate based on neighbouring dimensions, in such + a scenario, the computation is still parameterless, but the methods could derive additional + statistical patterns and "state" parameters that could improve performance. This may cause + some non-determinism based on how chunks are chosen. + + - However, as long as these filters are semi bounded - e.g. "9 parameter savitzky golay filter", + then there is a guarantee that despte how large the chunks are the maximum number of + neighbouring parameters used in any "smarts" is 9 - spatially this could be a convolutional + 3x3 grid for example doing some smoothing or noise inference. And therefore, maintain some + level of determinism as long as the chunk sizes don't fall below this criteria. + + - Regardless, `PersistenceMetadata` and `PersistenceChunkInfo` are easily serialisable + structures that can be logged as part as experiments. + + - For now the only independent parameter that is known by the algorithms, is the time dimension. + """ + + chunk_generator: Generator[PersistenceDataChunk] # the chunks used for computation + chunk_info: PersistenceChunkInfo + metadata: PersistenceMetadata + ordered_dimnames: list[str] # used for re-creating the dataarray + + @staticmethod + def _job_wrapper(chunk: PersistenceDataChunk) -> np.ndarray: + # TODO: implementation required + # essentially takes in a chunk and returns a numpy ndarray + # 1. get PersistenceCompute "unit" or obj from the chunk generator (and metadata reference) + # - this will embed additional info such as the time index, and compute options such as + # the backend and whether or not to impute - required by the compute methods + # 2. run obj.compute(chunk) + # 3. return the resulting numpy array + pass + + def compute_chunks(self) -> xr.DataArray: + # 1. prepare a empty numpy container in memory with the appropriate data shape + # 2. 
create a multiprocessing worker pool based on metadata + # 3. use a combination of chunk_info and the infrastructure to attempt to tweak the worker + # size if needed. (e.g. if OOM is predicted). Or otherwise spit out a warning. + # 4. run the _job_wrapper against the provided chunk_generator and fill the container + # prepared in 1. as per the description in the module docs. + # 5. re-wrap the dataarray meta information (mainly dimension names) onto the numpy + # structure. + # 6. return the dataarray + # + # NOTE: there will be a higher level structure that recombines the variables of the data + # array. + pass + + +# TODO: the variable references are not right - need to use self.metadata +@dataclass +class PersistenceCompute: + arr: PetDataArrayLike + metadata: PersistenceMetadata + + def _method_impl(self, arr: np.ndarray) -> np.ndarray: + match self.backend: + case PersistenceMethod.NUMPY: + return self._method_impl_numpy(arr) + case PersistenceMethod.NUMBA: + return self._method_impl_numba(arr) + case PersistenceMethod.RUST: + return self._method_impl_rust(arr) + case _: + raise NotImplementedError("PersistenceCompute: Unknown backend") + + def _method_impl_numpy(self, arr: np.ndarray) -> np.ndarray: + match self.method: + case PersistenceMethod.MEDIAN_OF_THREE: + return _median_of_three_numpy(arr, self.idx_time) + case PersistenceMethod.MOST_RECENT: + raise NotImplementedError("TODO") + case _: + raise NotImplementedError( + f"PersistenceCompute: compute method {self.method} has not been implemented" + ) + + def _method_impl_numba(self, arr: np.ndarray) -> np.ndarray: + raise NotImplementedError("numba backend is not supported") + + def _method_impl_rust(self, arr: np.ndarray) -> np.ndarray: + raise NotImplementedError("rust backend is not supported") + + # TODO: This slicer should go into the chunker, since we want to slice out data as early as + # possible. 
+ def _slice(self) -> PetDataArrayLike: + # slice lookback data + len_time = arr.shape[idx_time] + + if len_time < self.num_lookback: + raise ValueError( + "PersistenceCompute: input data does not have enough historical time indices to compute this method." + ) + + idx_end = len_time + idx_start = idx_end - self.num_lookback + idx_slice = slice(idx_start, idx_end, 1) # start, end, step + + # generator for nd-index slicing + idx_all = slice(None, None, None) + nd_slice = ( + idx_slice if i == idx_time else idx_all for i in range(len(arr.shape)) + ) + + # sliced array that only has the latest 3 values + arr_slice = arr[*tuple(nd_slice)] + + return arr_slice + + def _impute(self, arr_sliced) -> PetDataArrayLike: + # NOTE: only simple impute is currently supported + if self.do_impute: + imputer = SimpleImpute(arr_sliced) + return imputer.impute_mean() + + # default - do nothing + return arr_sliced + + def compute(self) -> PetDataArrayLike: + # check backend support + self.metadata.backend.check_support() + + # slice: to num_lookback indices + arr_sliced: PetDataArrayLike = self._slice(self.arr) + + # preprocess: currently just (maybe) impute + arr_preprocessed: PetDataArrayLike = self._impute(arr_sliced) + + # compute: using specified persistence method and preprocessed array + arr_persist: PetDataArrayLike = self._method_impl(arr_preprocessed) + + return arr_persist diff --git a/packages/bundled_models/persistence/src/persistence/interface/_interface.py b/packages/bundled_models/persistence/src/persistence/interface/_interface.py index aa293708..e60b9208 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_interface.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_interface.py @@ -2,138 +2,5 @@ Module that contains the interface required to "hook" into other pipeline methods in order to run Persistence as a model. 
""" - -import multiprocessing -from enum import StrEnum, auto -from dataclasses import dataclass, field -from collections.abc import Callable -from contextlib import contextmanager -from typing import Union, Generic - -import numpy as np -import xarray as xr - -from persistence.types import PetDataArrayLike -from persistence.methods._impute import SimpleImpute -from persistence.interface._metadata import PersistenceMetadata -from persistence.interface._method import PersistenceMethod -from persistence.interface._chunker import PersistenceDataChunk - -# unlikely to have more than 1000 processes for persistence, due to diminishing returns - even on a -# supercomputer -_MAX_NUM_CHUNKS = 1000 - - -# TODO: no implementation yet -@dataclass -class PersistenceComputePool: - """ - Generates a compute pool by creating a chunker and assigning chunks to various workers that - compute the specified persistence methods. - - Takes an array, and persistence metadata. This is internal and built to be used with - PetDataset.map_each_var. - - 1. make chunker as per PersistenceChunker - 2. retrieve chunks from PersistenceChunker.generate_chunks - 3. 
create a job_wrapper over PersistenceCompute to run each PersistenceDataChunk object against - the persistence method to generate the output chunk - """ - - da: xr.DataArray - metadata: PersistenceMetadata - - @staticmethod - def _job_wrapper(chunk: PersistenceDataChunk) -> PetDataArrayLike: - pass - - def _make_chunker(self): - pass - - def compute_chunks(self) -> PetDataArrayLike: - pass - - -# TODO: the variable references are not right - need to use self.metadata -@dataclass -class PersistenceCompute: - arr: PetDataArrayLike - metadata: PersistenceMetadata - - def _method_impl(self, arr_preprocessed: PetDataArrayLike) -> PetDataArrayLike: - match self.backend: - case PersistenceMethod.NUMPY: - return self._method_impl_numpy(arr_preprocessed) - case PersistenceMethod.NUMBA: - return self._method_impl_numba(arr_preprocessed) - case PersistenceMethod.RUST: - return self._method_impl_rust(arr_preprocessed) - case _: - raise NotImplementedError("PersistenceCompute: Unknown backend") - - def _method_impl_numpy(self, arr_preprocessed) -> PetDataArrayLike: - match self.method: - case PersistenceMethod.MEDIAN_OF_THREE: - return _median_of_three_numpy(arr_preprocessed, self.idx_time) - case PersistenceMethod.MOST_RECENT: - raise NotImplementedError("TODO") - case _: - raise NotImplementedError( - f"PersistenceCompute: compute method {self.method} has not been implemented" - ) - - def _method_impl_numba( - self, arr_preprocessed: PetDataArrayLike - ) -> PetDataArrayLike: - raise NotImplementedError("numba backend is not supported") - - def _method_impl_rust(self, arr_preprocessed) -> PetDataArrayLike: - raise NotImplementedError("rust backend is not supported") - - def _slice(self) -> PetDataArrayLike: - # slice lookback data - len_time = arr.shape[idx_time] - - if len_time < self.num_lookback: - raise ValueError( - "PersistenceCompute: input data does not have enough historical time indices to compute this method." 
- ) - - idx_end = len_time - idx_start = idx_end - self.num_lookback - idx_slice = slice(idx_start, idx_end, 1) # start, end, step - - # generator for nd-index slicing - idx_all = slice(None, None, None) - nd_slice = ( - idx_slice if i == idx_time else idx_all for i in range(len(arr.shape)) - ) - - # sliced array that only has the latest 3 values - arr_slice = arr[*tuple(nd_slice)] - - return arr_slice - - def _impute(self, arr_sliced) -> PetDataArrayLike: - # NOTE: only simple impute is currently supported - if self.do_impute: - imputer = SimpleImpute(arr_sliced) - return imputer.impute_mean() - - # default - do nothing - return arr_sliced - - def compute(self) -> PetDataArrayLike: - # check backend support - self.metadata.backend.check_support() - - # slice: to num_lookback indices - arr_sliced: PetDataArrayLike = self._slice(self.arr) - - # preprocess: currently just (maybe) impute - arr_preprocessed: PetDataArrayLike = self._impute(arr_sliced) - - # compute: using specified persistence method and preprocessed array - arr_persist: PetDataArrayLike = self._method_impl(arr_preprocessed) - - return arr_persist +# TODO: this is no longer required, as it has been disected into separate modules. +# "persistence_impl.py" will instead be the actual interface into the computation. 
diff --git a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py index adb3b2c3..7231ca51 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from multiprocessing import cpu_count -from persistence.interface._backends import PersistenceBackendType +from persistence.interface._backend import PersistenceBackendType from persistence.interface._method import PersistenceMethod diff --git a/packages/bundled_models/persistence/tests/interface/test__chunker.py b/packages/bundled_models/persistence/tests/interface/test__chunker.py index bcb1c11e..8ce8f6ea 100644 --- a/packages/bundled_models/persistence/tests/interface/test__chunker.py +++ b/packages/bundled_models/persistence/tests/interface/test__chunker.py @@ -27,10 +27,10 @@ def test_generate_chunks_common_usecases(): """ common usecases for chunking - Assume we have a large data slab of dimensions + Assume a reasonable number of dimensions for this test. (3, 8, 10*, 5, 4) - 10* => is time and should be ignored + 10* => is the time dimension and should be ignored by the chunking strategy. total size = 3 * 8 * 5 * 4 = 480 @@ -44,7 +44,14 @@ def test_generate_chunks_common_usecases(): - 20 >= chunksize > 4, 24 <= numchunks < 120, choose 24 arbitrarily - 160 >= chunksize > 20, 3 <= numchunks < 24, choose 11 arbitrarily - + NOTE: + The first two cases above are intentionally edge cases and sit at the boundaries. + More edge cases such as: + - intentionally bad settings of chunks, + - impact of chunking along the first/last index, + - the position of the time index, + - testing defaults, + are covered in other tests. 
""" arr_shape = [3, 8, 10, 5, 4] arr_shape_notime = [v if i != 2 else 1 for i, v in enumerate(arr_shape)] @@ -108,24 +115,6 @@ def test_chunk_caculation_single_worker(): be greater than the size of the entire hypercube for a single time instance (persistence returns 1 time point) - - Alg: - - 1. get dimension order - 2. retrieve chunks (numpy arrays) - 3. perform compute - 4. join numy arrays -> will be of the form - - arr[x0, x1, x2, ..., t, ...] = slab - - OR - - arr[x0, x1, t, x2, ...] = slab - - here, x0, x1, x2 are the multi-indices that are incremented when filling in the slabs, - because the cardinality of t in the slabe is guarenteed to equal 1, both scenarios are - just as efficient, since it will not affect memory striding. (Usually if t is processed - as a whole, the first scenario is more efficient) """ From 1b9d3f90df24ce1ac9e14184c2256d75cac9d3bc Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Sun, 1 Mar 2026 12:59:23 +1100 Subject: [PATCH 21/28] [skip ci] refactor chunker logic to chunk time based on required historical indices; add more documentation for compute logic --- .../src/persistence/interface/_chunker.py | 111 ++++++++++++------ .../src/persistence/interface/_compute.py | 28 ++++- .../src/persistence/interface/_metadata.py | 49 +++++++- .../src/persistence/interface/_method.py | 4 +- .../tests/interface/test__chunker.py | 15 +-- 5 files changed, 160 insertions(+), 47 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index 1a6960ea..9689316a 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -129,6 +129,10 @@ class PersistenceChunker: metadata: PersistenceMetadata chunk_info: PersistenceChunkInfo | None = None + # TODO: + # add data shape as an explicit input, as even da.shape may trigger a 
computation depending on + # the underlying storage type. + @staticmethod def _b10_to_mi(b10: int, mi_size: list[int]) -> list[int]: """ @@ -237,9 +241,10 @@ def _compute_chunkinfo_greedy( Returns a structure (PersistenceChunkInfo) containing 1. actual chunk size - 3. actual chunk count - 2. the position (least significant) of the first index that should be be used for + 2. actual chunk count + 3. the position (least significant) of the first index that should be be used for incrementing chunks (using multi-indexing) + 4. dimension names (passed through) """ assert desired_numchunks >= 1 @@ -287,15 +292,17 @@ def __post_init__(self): # check for chunks if ( - self.metadata.desired_numchunks < 1 - or self.metadata.desired_numchunks > _MAX_NUM_CHUNKS + self.metadata.num_chunks_desired < 1 + or self.metadata.num_chunks_desired > _MAX_NUM_CHUNKS ): err_msg = f"specified num chunks is invalid, valid range: 0 < num chunks <= {_MAX_NUM_CHUNKS}" raise ValueError(err_msg) # --- - # suppress time index for calculations. + # Suppress time index for calculations. + # # NOTE: + # # Expanding an array by one dimension with a dimensionality 1, for example, has no impact # on the chunk size, since the retraction operation of squeezing out the dimension, of # size 1, also does not affect chunk size. Therefore, to suppress a dimension we set its @@ -303,12 +310,6 @@ def __post_init__(self): # # Since we want to preserve structure, we can't drop it so our only remaining option is to # force the size to 1. - # - # TODO: add a fast return for the special case below: - # The special case of this approach is if there is only one dimension (1-d array), - # however, this case is easily handled since the dimension has to be time. And because it - # cannot be chunked by definition of this class, the chunker should just spit out the - # entire array as a single chunk. 
shape_notime = list(self.da.shape) shape_notime[self.metadata.idx_time_dim] = 1 # --- @@ -316,9 +317,65 @@ def __post_init__(self): self.chunk_info = self._compute_chunkinfo_greedy( self.metadata.num_chunks_desired, shape_notime, - self.da.shape, + self.da.dims, ) + # check that the input data shape has enough time indices to support the persistence + # calculation. + len_time_max = self.da.shape[self.metadata.idx_time_dim] + len_time_prp = self.metadata.len_time_preprocess() + + if len_time_prp > len_time_max: + raise ValueError( + "PersistenceChunker: critical failure, data source does not have enough time" + " indices for this persistence method." + ) + + def _get_dim_slices(self, mi: list[int]) -> dict[str, slice]: + """ + maps slices to dimension names. + + 1. slices time based on required number of historical data for imputation/persistence + calculations. + + NOTE: + + This is an added safety, since it is expected that something higher level would have + sliced this by now. But, in case the data-array points (lazily) to the entire history + (for example), this slicing makes certain that the data that is loaded into memory is + still reasonably bounded. + + 2. 
slices other indices based on required chunk sizes + """ + assert self.chunk_info is not None and self.chunk_info.lsi_chunk is not None + assert all([x is not None and x >= 0 for x in mi]) + + dict_slice_dims = {} + len_time_max = self.da.shape[self.metadata.idx_time_dim] + len_time_prp = self.metadata.len_time_preprocess() + # this is static for all chunks + slice_time = slice(len_time_max - len_time_prp, len_time_max) + + for idx, name in enumerate(self.da.dims): + dim_size = self.da.shape[idx] + + # time dimension => use special time slicing + if idx == self.metadata.idx_time_dim: + # assert time dimension name is stored correctly - random safety check + assert name == self.chunk_info.dim_names[self.metadata.idx_time_dim] + dict_slice_dims[name] = slice_time + + # multi-indexer dimension => 1^m slice => incremental chunk of size 1 + elif idx < self.chunk_info.lsi_chunk + 1: + dict_slice_dims[name] = slice(mi[idx], mi[idx] + 1) + + # chunk dimension => N_i^(n-m) slice => use the entire dimension as a chunk (N_i) + else: + dict_slice_dims[name] = slice(0, dim_size) + + assert all(n in dict_slice_dims for n in self.chunk_info.dim_names) + return dict_slice_dims + def generate_chunks(self) -> Generator[PersistenceDataChunk]: """ Evaluate chunks by loading each chunk into memory, the chunks are lazily loaded but eagerly @@ -328,37 +385,21 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: This generator generally would be fed into a multiprocessing worker pool in conjunction with a method to process each chunk. """ + # TODO: add a fast return for the special case when time is the only dimension. 
shape_notime = list(self.da.shape) shape_notime[self.metadata.idx_time_dim] = 1 shape_notime_trimmed = shape_notime[: (self.chunk_info.lsi_chunk + 1)] mi_inc = [0 for _ in shape_notime_trimmed] - pcr = PersistenceChunker - num_chunks = self.chunk_info.num_chunks for _ in range(self.chunk_info.num_chunks): - # use hyperslab selection for dimensions other than time. The time dimension is never - # chunked and selected as a whole. - arr_chunk = self.da.isel( - { - self.da.dims[i]: ( - slice(mi_inc[i], mi_inc[i] + 1, 1) - if i < self.chunk_info.lsi_chunk + 1 - and i != self.metadata.idx_time_dim - else slice(0, self.da.shape[i]) - ) - for i in range(len(shape_notime)) - } - ).values - - # pass chunk - yield PersistenceDataChunk( - arr_chunk=arr_chunk, - metadata=self.metadata, - chunk_info=self.chunk_info, - ) + dict_slice_dims = self._get_dim_slices(mi_inc) + arr_chunk = self.da.isel(dict_slice_dims) + + # pass chunk to caller + yield PersistenceDataChunk(arr_chunk) # increment index and break if overflow is detected. try: - mi_inc = pcr._inc_mi(mi_inc, mi_size=shape_notime_trimmed) + mi_inc = self._inc_mi(mi_inc, mi_size=shape_notime_trimmed) except OverflowError: break diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py index e72b4451..964228ed 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_compute.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_compute.py @@ -16,7 +16,6 @@ from persistence.interface._backend import PersistenceBackendType -# TODO: no implementation yet @dataclass class PersistenceComputePool: """ @@ -29,7 +28,7 @@ class PersistenceComputePool: --- - Algorithm (see compute_chunks): + Algorithm (see `compute_chunks`): 1. retrieve chunks (numpy arrays) 2. 
perform compute on each chunk depending on the persistence method @@ -55,6 +54,29 @@ class PersistenceComputePool: --- + Further, to reiterate the assumption, in persistence methods chunks are loaded lazily, but + evaluated eagerly, in otherwords the computation itself should not use `dask`. And loading is + forced to be synchronous e.g. + + load chunk 1 ---> compute [worker 1] + | finish compute + *>>> load chunk 2 ---> compute [worker 2] + | + *>>> load chunk 3 ---> compute [worker 3] + |---> at this point, we should only have: + - two chunks in memory with multiple time indices + - one "result" chunk with the reduced time dimension + + *>>> the time taken to load a chunk into memory + + Keep the above in mind when running this program, as it may help to debug issues. + Any scheduling/wait time implementation is out of scope here, and in fact is an anti-pattern. + + (This does not mean scheduling cannot be used - it just needs to be used at a higher level and + at a distributed compute level - NOT at a single node compute level) + + --- + Important: - As per the rest of the persistence structures, the time dimension existing is crucial, and the @@ -74,6 +96,8 @@ class PersistenceComputePool: coordinates or other variables represent - other than the trivial inference that they are different dimensions and have a certain shape, are okay. 
+ --- + Future considerations: - There could be methods in the future that aggregate based on neighbouring dimensions, in such diff --git a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py index 7231ca51..50b19cf2 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_metadata.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_metadata.py @@ -1,7 +1,10 @@ from dataclasses import dataclass, field from multiprocessing import cpu_count from persistence.interface._backend import PersistenceBackendType -from persistence.interface._method import PersistenceMethod +from persistence.interface._method import ( + PersistenceMethod, + _DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER, +) @dataclass @@ -14,6 +17,9 @@ class PersistenceMetadata: method: PersistenceMethod # persistence method to use # --- (kw)args with defaults --- + # IMPORTANT: These are essentially tuning parameters that affect performance. The defaults are + # usually okay, but they need to be considered carefully for certain systems with limited + # computational power. num_workers: int = field(default_factory=cpu_count) # --- @@ -36,3 +42,44 @@ class PersistenceMetadata: do_impute: bool = True backend: PersistenceBackendType = PersistenceBackendType.NUMPY + + # --- + # multiplier to determine how much data to load, essentially + # + # S * N, where, + # N = Minimum amount of data required for computing a method + # S = this multiplier. + # + # The default is conservatively set at 2 so that it is capable of treating missing values, while + # not overzealously loading things into memory. + # + # If a dataset does not have missing values this can be set to 1, to minimize the load on memory. 
+ # + # On the other hand some datasets may need a much larger sparsity multiplier as they are mostly + # sparse - this can be useful when values from historical observations quite far into the past + # can still be useful for persistence. + sparsity_multiplier: int = _DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER + # --- + + def len_time_preprocess(self) -> int: + """ + number of historical time indices required for preprocessing, e.g. imputation to fill + missing values. + + This is used during the chunking and pre-processing phase. + """ + _len = int(self.method.min_lookback(self.sparsity_multiplier)) + assert _len >= 1 + return _len + + def len_time_compute(self) -> int: + """ + number of historical time indices required for the persistence computation. + + This is used during the compute phase. + """ + _len = int(self.method.num_time_indices_required()) + # safety: this must always be smaller than or equal to the pre-processing length + assert _len <= self.len_time_preprocess() + assert _len >= 1 + return _len diff --git a/packages/bundled_models/persistence/src/persistence/interface/_method.py b/packages/bundled_models/persistence/src/persistence/interface/_method.py index 02708377..94b47b6d 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_method.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_method.py @@ -41,7 +41,7 @@ def num_time_indices_required(self) -> int: def min_lookback( self, sparsity_multiplier=_DEFAULT_PERSISTENCE_SPARSITY_MULTIPLIER - ): + ) -> int: """ The minimum amount of lookback required to compute the corresponding metric. 
By default we assume a 50% sparsity and require at least double the number of values @@ -50,4 +50,4 @@ def min_lookback( if sparsity_multiplier < 1: raise ValueError("PersistenceMethod: Sparsity multiplier must be >= 1") - return self.num_time_indices_required() * sparsity_multiplier + return int(self.num_time_indices_required() * sparsity_multiplier) diff --git a/packages/bundled_models/persistence/tests/interface/test__chunker.py b/packages/bundled_models/persistence/tests/interface/test__chunker.py index 8ce8f6ea..90164b06 100644 --- a/packages/bundled_models/persistence/tests/interface/test__chunker.py +++ b/packages/bundled_models/persistence/tests/interface/test__chunker.py @@ -57,24 +57,25 @@ def test_generate_chunks_common_usecases(): arr_shape_notime = [v if i != 2 else 1 for i, v in enumerate(arr_shape)] size_total = functools.reduce(lambda x, y: x * y, arr_shape_notime) num_chunks = [479, 24, 11] + # with MEDIAN_OF_THREE we expect 2 * 3 = 6 indices for time + method = _pmd.MEDIAN_OF_THREE exp_result = [ - (3, 4, [1, 1, 10, 1, 4]), - (1, 20, [1, 1, 10, 5, 4]), - (0, 160, [1, 8, 10, 5, 4]), + (3, 4, [1, 1, 6, 1, 4]), + (1, 20, [1, 1, 6, 5, 4]), + (0, 160, [1, 8, 6, 5, 4]), ] idx_time_dim = 2 test_data = xr.DataArray(np.ones(arr_shape), dims=["x0", "x1", "t", "x2", "x3"]) - method = _pmd.MOST_RECENT # dummy for i, nchk in enumerate(num_chunks): metadata = _pma( idx_time_dim=idx_time_dim, num_chunks_desired=nchk, method=method ) chunker = _pcr(da=test_data, metadata=metadata) + assert chunker.chunk_info.lsi_chunk == exp_result[i][0] + assert chunker.chunk_info.size_chunk == exp_result[i][1] + assert chunker.chunk_info.num_chunks == size_total // exp_result[i][1] for data_chunk in chunker.generate_chunks(): - assert data_chunk.chunk_info.lsi_chunk == exp_result[i][0] - assert data_chunk.chunk_info.size_chunk == exp_result[i][1] - assert data_chunk.chunk_info.num_chunks == size_total // exp_result[i][1] assert list(data_chunk.arr_chunk.shape) == 
exp_result[i][2]

From e5534a1cdc7a39d2e6dad1118164cbfb9e231c52 Mon Sep 17 00:00:00 2001
From: Nikeeth Ramanathan
Date: Sun, 1 Mar 2026 19:21:18 +1100
Subject: [PATCH 22/28] [skip ci] basic tests for persistence compute base
 class

---
 .../src/persistence/interface/_chunker.py     |  7 +-
 .../src/persistence/interface/_compute.py     | 78 +++++++++----------
 .../tests/interface/test__compute.py          | 28 +++++++
 3 files changed, 68 insertions(+), 45 deletions(-)
 create mode 100644 packages/bundled_models/persistence/tests/interface/test__compute.py

diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py
index 9689316a..469df6ed 100644
--- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py
+++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py
@@ -321,14 +321,13 @@ def __post_init__(self):
         )
 
         # check that the input data shape has enough time indices to support the persistence
-        # calculation.
+        # calculation (including preprocessing).
         len_time_max = self.da.shape[self.metadata.idx_time_dim]
         len_time_prp = self.metadata.len_time_preprocess()
-
         if len_time_prp > len_time_max:
             raise ValueError(
-                "PersistenceChunker: critical failure, data source does not have enough time"
-                " indices for this persistence method."
+                "PersistenceChunker: input DataArray does not have enough time indices for this"
+                " persistence method."
) def _get_dim_slices(self, mi: list[int]) -> dict[str, slice]: diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py index 964228ed..a31be87e 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_compute.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_compute.py @@ -10,9 +10,14 @@ from persistence.types import PetDataArrayLike from persistence.methods._impute import SimpleImpute +from persistence.methods._median import _median_of_three_numpy from persistence.interface._metadata import PersistenceMetadata from persistence.interface._method import PersistenceMethod -from persistence.interface._chunker import PersistenceDataChunk, PersistenceChunker +from persistence.interface._chunker import ( + PersistenceDataChunk, + PersistenceChunker, + PersistenceChunkInfo, +) from persistence.interface._backend import PersistenceBackendType @@ -156,20 +161,20 @@ class PersistenceCompute: metadata: PersistenceMetadata def _method_impl(self, arr: np.ndarray) -> np.ndarray: - match self.backend: - case PersistenceMethod.NUMPY: + match self.metadata.backend: + case PersistenceBackendType.NUMPY: return self._method_impl_numpy(arr) - case PersistenceMethod.NUMBA: + case PersistenceBackendType.NUMBA: return self._method_impl_numba(arr) - case PersistenceMethod.RUST: + case PersistenceBackendType.RUST: return self._method_impl_rust(arr) case _: raise NotImplementedError("PersistenceCompute: Unknown backend") def _method_impl_numpy(self, arr: np.ndarray) -> np.ndarray: - match self.method: + match self.metadata.method: case PersistenceMethod.MEDIAN_OF_THREE: - return _median_of_three_numpy(arr, self.idx_time) + return _median_of_three_numpy(arr, self.metadata.idx_time_dim) case PersistenceMethod.MOST_RECENT: raise NotImplementedError("TODO") case _: @@ -183,52 +188,43 @@ def _method_impl_numba(self, arr: np.ndarray) -> np.ndarray: 
def _method_impl_rust(self, arr: np.ndarray) -> np.ndarray: raise NotImplementedError("rust backend is not supported") - # TODO: This slicer should go into the chunker, since we want to slice out data as early as - # possible. - def _slice(self) -> PetDataArrayLike: - # slice lookback data - len_time = arr.shape[idx_time] - - if len_time < self.num_lookback: - raise ValueError( - "PersistenceCompute: input data does not have enough historical time indices to compute this method." - ) - - idx_end = len_time - idx_start = idx_end - self.num_lookback - idx_slice = slice(idx_start, idx_end, 1) # start, end, step - - # generator for nd-index slicing - idx_all = slice(None, None, None) - nd_slice = ( - idx_slice if i == idx_time else idx_all for i in range(len(arr.shape)) + def _slice_time(self, arr: np.ndarray) -> np.ndarray: + """ + Further slices the data chunk into a smaller chunk required for the computation (usually + after imputation. + """ + # slice out data required for the computation + len_time_max = arr.shape[self.metadata.idx_time_dim] + len_time_cmp = self.metadata.len_time_compute() + arr_sliced = np.take( + arr, + range(len_time_max - len_time_cmp, len_time_max), + axis=self.metadata.idx_time_dim, ) - # sliced array that only has the latest 3 values - arr_slice = arr[*tuple(nd_slice)] + return arr_sliced - return arr_slice + def _impute(self, arr: np.ndarray) -> np.ndarray: + # default to pass-through + arr_imputed = arr - def _impute(self, arr_sliced) -> PetDataArrayLike: - # NOTE: only simple impute is currently supported - if self.do_impute: - imputer = SimpleImpute(arr_sliced) - return imputer.impute_mean() + if self.metadata.do_impute: + imputer = SimpleImpute(arr) + arr_imputed = imputer.impute_mean() - # default - do nothing - return arr_sliced + return arr_imputed - def compute(self) -> PetDataArrayLike: + def compute(self) -> np.ndarray: # check backend support self.metadata.backend.check_support() # slice: to num_lookback indices - arr_sliced: 
PetDataArrayLike = self._slice(self.arr) + arr_sliced: np.ndarray = self._slice_time(self.arr) - # preprocess: currently just (maybe) impute - arr_preprocessed: PetDataArrayLike = self._impute(arr_sliced) + # impute: fill missing values + arr_imputed: np.ndarray = self._impute(arr_sliced) # compute: using specified persistence method and preprocessed array - arr_persist: PetDataArrayLike = self._method_impl(arr_preprocessed) + arr_persist: np.ndarray = self._method_impl(arr_imputed) return arr_persist diff --git a/packages/bundled_models/persistence/tests/interface/test__compute.py b/packages/bundled_models/persistence/tests/interface/test__compute.py new file mode 100644 index 00000000..4c77328e --- /dev/null +++ b/packages/bundled_models/persistence/tests/interface/test__compute.py @@ -0,0 +1,28 @@ +from persistence.interface._compute import PersistenceCompute, PersistenceComputePool +from persistence.interface._method import PersistenceMethod +from persistence.interface._metadata import PersistenceMetadata +from persistence.interface._backend import PersistenceBackendType + +import numpy as np + + +def test_persistence_compute_generic(): + """ + Perform a basic computation using the persistence compute method + """ + input_shape = [4, 5, 2, 6, 10] # 6* is the time index + import functools + + time_dim = 3 + total_size = functools.reduce(lambda x, y: x * y, input_shape) + arr = np.arange(total_size).reshape(input_shape) + metadata = PersistenceMetadata( + idx_time_dim=time_dim, + method=PersistenceMethod.MEDIAN_OF_THREE, + num_chunks_desired=21, + do_impute=True, + backend=PersistenceBackendType.NUMPY, + ) + pc = PersistenceCompute(arr=arr, metadata=metadata) + arr_out = pc.compute() + assert list(arr_out.shape) == [4, 5, 2, 1, 10] From 2960569daaf838ac6a5b733fdb4144021209bea5 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 2 Mar 2026 12:43:08 +1100 Subject: [PATCH 23/28] [skip ci] wip process pool and some additional considerations required to merge 
numpy arrays --- .../src/persistence/interface/_chunker.py | 14 +- .../src/persistence/interface/_compute.py | 90 +++++--- .../tests/interface/test__compute.py | 204 ++++++++++++++++-- 3 files changed, 262 insertions(+), 46 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index 469df6ed..dceda520 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -49,6 +49,10 @@ class PersistenceDataChunk: arr_chunk: np.ndarray + # chunks are calculated independently and in different workers so a reference + # to the metadata is convenient. This is a small over-head. + metadata: PersistenceMetadata + @dataclass class PersistenceChunker: @@ -285,9 +289,9 @@ def _compute_chunkinfo_greedy( ) def __post_init__(self): - # safety: don't want assume sets or dict keys because they may be unordered (depending - # on the version of python). if this behaviour of xarray changes this will be caught in - # tests. + # safety: don't want assume sets or dict keys because they may be unordered (depending on + # the version of python). However, most likely, dict is okay as long as we don't support + # python<=3.7 assert isinstance(self.da.dims, tuple) or isinstance(self.da.dims, list) # check for chunks @@ -395,10 +399,10 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: arr_chunk = self.da.isel(dict_slice_dims) # pass chunk to caller - yield PersistenceDataChunk(arr_chunk) + yield PersistenceDataChunk(arr_chunk, self.metadata) # increment index and break if overflow is detected. 
try: mi_inc = self._inc_mi(mi_inc, mi_size=shape_notime_trimmed) except OverflowError: - break + return diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py index a31be87e..a7033ff9 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_compute.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_compute.py @@ -1,3 +1,4 @@ +import concurrent.futures import multiprocessing from enum import StrEnum, auto from dataclasses import dataclass, field @@ -125,33 +126,74 @@ class PersistenceComputePool: chunk_generator: Generator[PersistenceDataChunk] # the chunks used for computation chunk_info: PersistenceChunkInfo metadata: PersistenceMetadata - ordered_dimnames: list[str] # used for re-creating the dataarray @staticmethod def _job_wrapper(chunk: PersistenceDataChunk) -> np.ndarray: - # TODO: implementation required - # essentially takes in a chunk and returns a numpy ndarray - # 1. get PersistenceCompute "unit" or obj from the chunk generator (and metadata reference) - # - this will embed additional info such as the time index, and compute options such as - # the backend and whether or not to impute - required by the compute methods - # 2. run obj.compute(chunk) - # 3. return the resulting numpy array - pass - - def compute_chunks(self) -> xr.DataArray: - # 1. prepare a empty numpy container in memory with the appropriate data shape - # 2. create a multiprocessing worker pool based on metadata - # 3. use a combination of chunk_info and the infrastructure to attempt to tweak the worker - # size if needed. (e.g. if OOM is predicted). Or otherwise spit out a warning. - # 4. run the _job_wrapper against the provided chunk_generator and fill the container - # prepared in 1. as per the description in the module docs. - # 5. re-wrap the dataarray meta information (mainly dimension names) onto the numpy - # structure. - # 6. 
return the dataarray - # - # NOTE: there will be a higher level structure that recombines the variables of the data - # array. - pass + """ + This wrapper needs to be static, as we may not want the state info of + this class to propagate. + """ + return PersistenceCompute(chunk.arr_chunk, chunk.metadata).compute() + + def map_and_join_chunks(self) -> xr.DataArray: + """ + 1. Send chunks to workers + 2. Each worker runs the jobwrapper which invokes the configured persistence method + 3. Join the resulting list of numpy results along the time dimension + 4. Re-insert dimension names from chunk_info + + TODO: this should only be called via a main guard or entrypoint + + Calling forkserver preload and early inheriting any modules that may be forked is a + desirable way to call this, if multi-platform compatiblity is needed: + + e.g. + + if __name__ == "__main__": + ctx = multiprocessing.get_context("fork_server") + ctx.set_forkserver_preload(["module_name", "__main__"]) + args = parse_args(...) + generator = build_generator(args) + + with concurrent.futures.ProcessPoolExecutor(..., mp_context=ctx) as exec: + res = exec.map(fn, iter(generator)) + # do stuff with result + """ + # dispatch chunks to workers + # TODO: forkserver does not work with windows/mac + with concurrent.futures.ProcessPoolExecutor( + self.metadata.num_workers, + mp_context=multiprocessing.get_context("forkserver"), + ) as pp_exec: + arr_res_chunks = pp_exec.map( + PersistenceComputePool._job_wrapper, iter(self.chunk_generator) + ) + + raise NotImplementedError(""" + TODO: need to preallocate numpy array before writing back, to avoid slow downs, also + easier indexing - this is the retraction of what we did for the chunker. + + We need to: + 1. recreate the multi index slices as per chunker (either explicitly or using + PersistenceChunker_get_dim_slices, noting that this returns a dict, and we need make + sure to preserve order, using chunk_info.dim_names) + 2. 
Then it's a matter of iterating over the chunks and filling them with + + arr[s0, s1, s2, ...] = result_slab + + s0, s1, s2 are incrementable indices, "..." just means select everything for all + other indices. + """) + # join results + # TODO: + arr_res = np.empty + arr_res = np.block(np.array(arr_res_chunks)) + + # re-instate naming and structure + # NOTE: metadata is not preserved as this is a compute-only operation + da_res = xr.DataArray(arr_res, dims=self.chunk_info.dim_names) + + return da_res # TODO: the variable references are not right - need to use self.metadata diff --git a/packages/bundled_models/persistence/tests/interface/test__compute.py b/packages/bundled_models/persistence/tests/interface/test__compute.py index 4c77328e..3daf05fa 100644 --- a/packages/bundled_models/persistence/tests/interface/test__compute.py +++ b/packages/bundled_models/persistence/tests/interface/test__compute.py @@ -1,28 +1,198 @@ -from persistence.interface._compute import PersistenceCompute, PersistenceComputePool -from persistence.interface._method import PersistenceMethod -from persistence.interface._metadata import PersistenceMetadata -from persistence.interface._backend import PersistenceBackendType +""" +Tests various compute methods and backends at a high level. The focus is on structural preservation +of the various computations that are dispatched into multiprocessing workers. Also ensuring correct +mapping to the method/backend given the user input. + +NOTE: this only does a very basic test of the method itself. Actual implementation and computational +accuracy of the method, and any edge cases are tested elsewhere. 
+""" import numpy as np +import xarray as xr +import functools + +from persistence.interface._backend import PersistenceBackendType +from persistence.interface._chunker import PersistenceChunker +from persistence.interface._compute import PersistenceCompute, PersistenceComputePool +from persistence.interface._metadata import PersistenceMetadata +from persistence.interface._method import PersistenceMethod -def test_persistence_compute_generic(): +def _compute_single( + method: PersistenceMethod, + backend: PersistenceBackendType, + random=False, # defaults to "arange" i.e. value = 1-d index reshaped into nd-array + shape_input=(4, 5, 2, 6, 10), + numchunks=21, + time_index=3, +) -> (PersistenceMetadata, np.ndarray, np.ndarray): """ - Perform a basic computation using the persistence compute method + Helper function to create example data for a single computation. + + Useful for comparison of single workers vs pools, for various persistence methods and backends + + Returns references to: + - metadata + - input array (np.ndarray) + - output array (np.ndarray) """ - input_shape = [4, 5, 2, 6, 10] # 6* is the time index - import functools + # repeatability - re-seed rng state and bind it to `rng` variable + rng = np.random.default_rng(seed=42) - time_dim = 3 - total_size = functools.reduce(lambda x, y: x * y, input_shape) - arr = np.arange(total_size).reshape(input_shape) + # derive array shape + shape_input = list(shape_input) + total_size = functools.reduce(lambda x, y: x * y, shape_input) + + # choose whether to use linear increments (essentially the equivilent 1d index as the value or a + # random number as the value + arr_in = None + if random: + arr_in = np.arange(total_size).reshape(shape_input) + else: + arr_in = rng.random(shape_input) + + # specify metadata (mocked user input) metadata = PersistenceMetadata( - idx_time_dim=time_dim, - method=PersistenceMethod.MEDIAN_OF_THREE, - num_chunks_desired=21, + idx_time_dim=time_index, + method=method, + 
num_chunks_desired=numchunks, do_impute=True, - backend=PersistenceBackendType.NUMPY, + backend=backend, ) - pc = PersistenceCompute(arr=arr, metadata=metadata) + + # compute output + pc = PersistenceCompute(arr=arr_in, metadata=metadata) arr_out = pc.compute() - assert list(arr_out.shape) == [4, 5, 2, 1, 10] + + # expect the array shape to be the same except for time dimension which should be reduced to 1 + expect_shape = [ + s if i != metadata.idx_time_dim else 1 for i, s in enumerate(arr_in.shape) + ] + + # simple shape assert + assert expect_shape == list(arr_out.shape) + # return meta information for further tests in caller + return metadata, arr_in, arr_out + + +def _compute_pool( + method: PersistenceMethod, + backend: PersistenceBackendType, + _fn_compute_single=_compute_single, + *_fn_extra_args, + **_fn_extra_kwargs, +) -> (PersistenceMetadata, xr.DataArray, xr.DataArray): + """ + Same as _compute_single but for xarrays and using chunked pools. + + Cheats a bit by using _compute_single as a default to avoid repetition for basic tests. 
+ + Returns references to: + - metadata + - input array (xr.DataArray) + - output array (xr.DataArray) + """ + metadata, arr_in, arr_out = _fn_compute_single( + method, backend, *_fn_extra_args, **_fn_extra_kwargs + ) + + # upgrade to data arrays with dummy names, except for the time index which will be 't' + dim_names = [ + "x" + str(i) if i != metadata.idx_time_dim else "t" + for i in range(len(arr_in.shape)) + ] + + # upgrade to dataarray + da_in = xr.DataArray(arr_in, dims=dim_names) + + # chunk generator + chunker = PersistenceChunker(da=da_in, metadata=metadata) + + # propagate information to compute pool + pcp = PersistenceComputePool( + chunk_generator=chunker.generate_chunks(), + chunk_info=chunker.chunk_info, + metadata=metadata, + ) + + # compute and retrieve chunks (joined back into data array) + da_out = pcp.map_and_join_chunks() + + # expect the array shape to be the same except for time dimension which should be reduced to 1 + expect_shape = [ + s if i != metadata.idx_time_dim else 1 for i, s in enumerate(arr_in.shape) + ] + + # simple shape assert + assert list(da_out.shape) == expect_shape + # dimnames should not have changed - NOTE: this may regress if xarray decides to deprecate dims + # in favour of sizes, in which case we should be extracting the "keys" as an ordered tuple. 
+ assert dim_names == list(da_out.dims) + # single worker and pool should have the same values + assert np.allclose(da_out.values, arr_out) + # return meta information for further tests in caller + return metadata, da_in, da_out + + +def test_compute_medianofthree_workerpool_numpy(): + """ + method: median of three + backend: numpy + + expect lookback of 6 used for imputation (default) + expect lookback of 3 used for median of three computation (definition) + expect dimension shape to be preserved and only the time dimension to be reduced to 1 + expect dimension names to be mapped to the right shape + expected array can be easily constructed using a manual equivilent numpy operation e.g.: + 1. create a range of numbers + 2. compute median the trivial way over the axis + 3. sense check a few cherrypicked numbers + 4. compare the output against the output of the worker pool + 5. repeat the above, but for a random array (in which case 3. is not necessary - and in fact + cannot be done deterministically) + + Most of the same above strategy can be repeated for most of the other tests. + + ([numpy array], metadata) -> xarray dataarray + """ + # values = 1-d index + _, da_in, da_out = _compute_pool( + PersistenceMethod.MEDIAN_OF_THREE, + PersistenceBackendType.NUMPY, + ) + + # cherry picked tests (TODO) + + # values = random (TODO) + + +def test_compute_mostrecent_workerpool_numpy(): + """ + Sense check for most recent computation method + """ + pass + + +def test_no_impute_workerpool_numpy(): + """ + Check when imputation is disabled - should preserve nans + """ + pass + + +def test_compute_backend_supported(): + """ + Sense check for supported backends - should succeed + + NOTE: individual backend support themselves are done in tests of form _ + e.g. 
test_compute_medianofthree_workerpool_numpy tests the median of three computation on the + `numpy` backend pool + """ + pass + + +def test_compute_backend_unsupported(): + """ + Sense check for unsupported backends - should error out + """ + pass From eae9cb2b77f58a1fd3a0a58ce2d660430cd51d73 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Mon, 2 Mar 2026 16:19:01 +1100 Subject: [PATCH 24/28] [skip ci] force compute context to fork server --- .../src/persistence/interface/_chunker.py | 7 +- .../src/persistence/interface/_compute.py | 67 +++++++++---------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index dceda520..ad559361 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -37,6 +37,7 @@ class PersistenceChunkInfo: num_chunks: int size_chunk: int dim_names: list[str] + shape_full: list[int] @dataclass @@ -53,6 +54,9 @@ class PersistenceDataChunk: # to the metadata is convenient. This is a small over-head. metadata: PersistenceMetadata + # list containing slices of each dimension that make up the chunk + slice_dims: list[slice] + @dataclass class PersistenceChunker: @@ -286,6 +290,7 @@ def _compute_chunkinfo_greedy( size_chunk=actual_chunksize, lsi_chunk=first_chunkindex, dim_names=dim_names, + shape_full=mi_size, ) def __post_init__(self): @@ -399,7 +404,7 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: arr_chunk = self.da.isel(dict_slice_dims) # pass chunk to caller - yield PersistenceDataChunk(arr_chunk, self.metadata) + yield PersistenceDataChunk(arr_chunk, self.metadata, list(dict_slice_dims.values())) # increment index and break if overflow is detected. 
try: diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py index a7033ff9..df708e75 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_compute.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_compute.py @@ -5,6 +5,7 @@ from collections.abc import Callable from contextlib import contextmanager from typing import Union, Generator +from collections import namedtuple import numpy as np import xarray as xr @@ -22,6 +23,8 @@ from persistence.interface._backend import PersistenceBackendType +ChunkResult = namedtuple('ChunkResult', ['array', 'slice_dims']) + @dataclass class PersistenceComputePool: """ @@ -128,12 +131,15 @@ class PersistenceComputePool: metadata: PersistenceMetadata @staticmethod - def _job_wrapper(chunk: PersistenceDataChunk) -> np.ndarray: + def _job_wrapper(chunk: PersistenceDataChunk) -> ChunkResult: """ This wrapper needs to be static, as we may not want the state info of this class to propagate. """ - return PersistenceCompute(chunk.arr_chunk, chunk.metadata).compute() + return ChunkResult( + array=PersistenceCompute(chunk.arr_chunk, chunk.metadata).compute(), + slice_dims=chunk.slice_dims, + ) def map_and_join_chunks(self) -> xr.DataArray: """ @@ -159,38 +165,31 @@ def map_and_join_chunks(self) -> xr.DataArray: res = exec.map(fn, iter(generator)) # do stuff with result """ - # dispatch chunks to workers - # TODO: forkserver does not work with windows/mac - with concurrent.futures.ProcessPoolExecutor( - self.metadata.num_workers, - mp_context=multiprocessing.get_context("forkserver"), - ) as pp_exec: - arr_res_chunks = pp_exec.map( - PersistenceComputePool._job_wrapper, iter(self.chunk_generator) - ) - - raise NotImplementedError(""" - TODO: need to preallocate numpy array before writing back, to avoid slow downs, also - easier indexing - this is the retraction of what we did for the chunker. 
- - We need to: - 1. recreate the multi index slices as per chunker (either explicitly or using - PersistenceChunker_get_dim_slices, noting that this returns a dict, and we need make - sure to preserve order, using chunk_info.dim_names) - 2. Then it's a matter of iterating over the chunks and filling them with - - arr[s0, s1, s2, ...] = result_slab - - s0, s1, s2 are incrementable indices, "..." just means select everything for all - other indices. - """) - # join results - # TODO: - arr_res = np.empty - arr_res = np.block(np.array(arr_res_chunks)) - - # re-instate naming and structure - # NOTE: metadata is not preserved as this is a compute-only operation + # compute result shape by suppressing the time dimension + shape_res = [ + v if i != self.metadata.idx_time_dim else 1 + for i, v in enumerate(self.chunk_info.shape_full) + ] + arr_res = np.empty(shape_res) + + if self.metadata.num_workers <= 1: + # loop through instead + for chunk in iter(self.chunk_generator): + arr_res_chunk = PersistenceComputePool._job_wrapper(chunk) + arr_res[chunk.slice_dims] = arr_res_chunk + else: + # dispatch chunks to workers + # TODO: forkserver does/may not work with windows/mac, unless main-guarded + with concurrent.futures.ProcessPoolExecutor( + self.metadata.num_workers, + mp_context=multiprocessing.get_context("forkserver"), + ) as pp_exec: + results = pp_exec.map( + PersistenceComputePool._job_wrapper, iter(self.chunk_generator) + ) + for res_chunk in iter(results): + arr_res[*res_chunk.slice_dims] = res_chunk.array + da_res = xr.DataArray(arr_res, dims=self.chunk_info.dim_names) return da_res From e496905cd9f2a57f7a0705d54c25b4307176e883 Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Wed, 4 Mar 2026 12:36:28 +1100 Subject: [PATCH 25/28] [skip ci] some documentation and some initial stubs to work on integrating with pipelines --- packages/bundled_models/persistence/pixi.lock | 20 +-- .../bundled_models/persistence/pyproject.toml | 7 +- 
.../src/persistence/interface/_chunker.py | 4 +- .../src/persistence/interface/_compute.py | 3 +- .../src/persistence/persistence_impl.py | 154 +++++++++++++++--- .../src/persistence/registered_model.py | 59 +++++++ .../persistence/tests/test__interface.py | 3 - 7 files changed, 212 insertions(+), 38 deletions(-) create mode 100644 packages/bundled_models/persistence/src/persistence/registered_model.py diff --git a/packages/bundled_models/persistence/pixi.lock b/packages/bundled_models/persistence/pixi.lock index 808e8d15..6fad0157 100644 --- a/packages/bundled_models/persistence/pixi.lock +++ b/packages/bundled_models/persistence/pixi.lock @@ -2556,16 +2556,6 @@ packages: - pkg:pypi/pyarrow?source=hash-mapping size: 4776275 timestamp: 1770672664641 -- pypi: ./ - name: pyearthtools-bundled-persistence - version: 0.6.0 - sha256: 9e80485d242be8dba6bef64a468c827221089a186635053b1b09cd445c386b8d - requires_dist: - - pyearthtools-zoo>=0.5.0 - - pyearthtools-data>=0.5.0 - - pyearthtools-pipeline>=0.5.0 - - hydra-core - requires_python: '>=3.11,<3.14' - pypi: https://files.pythonhosted.org/packages/b3/f8/f47b90fbeaf36e112b1a93fc313d5f0bc9f0051ae8be734173787a00271a/pyearthtools_data-0.5.1-py3-none-any.whl name: pyearthtools-data version: 0.5.1 @@ -2595,6 +2585,16 @@ packages: - intake ; extra == 'intake' - intake-esm ; extra == 'intake' requires_python: '>=3.11' +- pypi: ./ + name: pyearthtools-persistence + version: 0.6.0 + sha256: b1a739e368b0b6b224e7bd805870c2d5c9e66183a749d55c2f4ae7739e268bf6 + requires_dist: + - pyearthtools-zoo>=0.5.0 + - pyearthtools-data>=0.5.0 + - pyearthtools-pipeline>=0.5.0 + - hydra-core + requires_python: '>=3.11,<3.14' - pypi: https://files.pythonhosted.org/packages/f2/f8/beda8582d430075031ac8835aced207d7bc639469451c932fdf1c0b2ed5c/pyearthtools_pipeline-0.5.1-py3-none-any.whl name: pyearthtools-pipeline version: 0.5.1 diff --git a/packages/bundled_models/persistence/pyproject.toml b/packages/bundled_models/persistence/pyproject.toml index 
56c79cff..0e487cea 100644 --- a/packages/bundled_models/persistence/pyproject.toml +++ b/packages/bundled_models/persistence/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] -name = "pyearthtools-bundled-persistence" +name = "pyearthtools-persistence" version = "0.6.0" description = "Persistence Bundled Model" readme = "README.md" @@ -56,8 +56,7 @@ warn_unused_configs = true ignore_missing_imports = true [tool.hatch.version] -# TODO: is this the right path? -path = "src/pyearthtools/pipeline/__init__.py" +path = "src/persistence/__init__.py" [tool.hatch.build.targets.wheel] packages = ["src/pyearthtools/"] @@ -67,7 +66,7 @@ channels = ["conda-forge"] platforms = ["linux-64"] [tool.pixi.pypi-dependencies] -pyearthtools-bundled-persistence = { path = ".", editable = true } +pyearthtools-persistence = { path = ".", editable = true } [tool.pixi.tasks] diff --git a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py index ad559361..c1a5979d 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_chunker.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_chunker.py @@ -404,7 +404,9 @@ def generate_chunks(self) -> Generator[PersistenceDataChunk]: arr_chunk = self.da.isel(dict_slice_dims) # pass chunk to caller - yield PersistenceDataChunk(arr_chunk, self.metadata, list(dict_slice_dims.values())) + yield PersistenceDataChunk( + arr_chunk, self.metadata, list(dict_slice_dims.values()) + ) # increment index and break if overflow is detected. 
try: diff --git a/packages/bundled_models/persistence/src/persistence/interface/_compute.py b/packages/bundled_models/persistence/src/persistence/interface/_compute.py index df708e75..f1ba2602 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/_compute.py +++ b/packages/bundled_models/persistence/src/persistence/interface/_compute.py @@ -23,7 +23,8 @@ from persistence.interface._backend import PersistenceBackendType -ChunkResult = namedtuple('ChunkResult', ['array', 'slice_dims']) +ChunkResult = namedtuple("ChunkResult", ["array", "slice_dims"]) + @dataclass class PersistenceComputePool: diff --git a/packages/bundled_models/persistence/src/persistence/persistence_impl.py b/packages/bundled_models/persistence/src/persistence/persistence_impl.py index e275cb37..e014eda7 100644 --- a/packages/bundled_models/persistence/src/persistence/persistence_impl.py +++ b/packages/bundled_models/persistence/src/persistence/persistence_impl.py @@ -1,19 +1,132 @@ -import persistence as pet_persist +""" +Runs persistence model on the data loaded from the pipeline. Chunks the input data from the pipeline +and uses multiprocessing (if specified to do so). +Persistence potentially needs to be computed on the fly. Depending on the persistence method, and +model it is being compared against, the computation may require ingestion of a reasonable amount of +historical data. -# TODO: convert to builder pattern -def _compute_persistence( +The common use-case is to offload the data loading to something at a higher level (pet-pipeline). + +This module can't control the loading process, instead what it controls is the way in which the +chunks are indexed, so that they can be _processed_ (CPU not IO) efficiently. + +Examples of what can be done: + - choice of backend (e.g. numba/rust etc., defaulting to numpy) - wip currently only numpy is + supported + + - choice of number of chunks and workers for python to slice data into multiple workers. 
+ (embarrassingly parallel)
+
+ - choice of persistence method
+
+ - flexibility in how the input array/slab is provided, currently supports:
+ - numpy array (<--- almost any hypercube datastructure can be converted to this)
+ - xarray dataset
+ - xarray dataarray
+
+CAUTION:
+
+ Due to the way data is stored and loaded, multiprocessing may sometimes be necessary but should
+ be used with caution. Some tips: when in doubt, just set the workers to 1, but you may still
+ chunk the data if required due to memory issues.
+
+ Again, the chunking here is not to do with loading, it's to do with efficient processing.
+ Presumably the data is already chunked as it is loaded via some other framework. The chunking
+ applied here is on top of that to further sub-slice things to take into account the need to
+ ingest a large amount of data for aggregation computations.
+
+ANTIPATTERNS (for developers):
+
+ - do not chunk over time (except for specific exceptions)
+
+ - do not use external multiprocessing/threading like dask
+
+ - do not use multiprocessing IF the compute backend already does it efficiently, UNLESS we are
+ IO bound.
+
+ - do not use threading. IO bound issues should be resolved at a higher level because persistence
+ methods (currently) have no control over how the data is loaded - actually this is the same
+ for everything in PET that delegates data loading to the pipeline.
+
+ - do not implement methods with heavy parametric statistical inference or methods that are aware
+ of the "meaning" of orthogonal dimensions in the hypercube other than "time".
+
+ - do not do any overly clever chunk/worker optimization - this is the user's responsibility
+
+ - do not assume this will be called as a library (but can be if the OS allows it and it's been
+ tested sufficiently).
+
+IMPORTANT:
+
+ The "proper" way to run this module is a standalone process/script. But it _may_ work as part of
+ a script/pipeline _if_ the underlying OS supports it. 
See the executor pool defined in + `interface._compute` and the main guard at the bottom. + +FUTUREWORK: + + - Add the ability to bypass python completely for data loading. + + - Current architecture expects data to be lazily loaded from python but eagerly computed by + the backend, which may still be python or could be something like rust or C. + + - The target alternative or toggle is for this to be inverted in a way that the data loading + itself is done by the backend, allowing for even better control over the processing. + + - Persistence computation is relatively isolated enough from "frameworks" to be a perfect + candidate to do this. +""" + + +def predict( arr: pet_persist.PetDataArrayLike, - idx_time: int, - idx_chunk: int = None, + idx_time_dim: int, num_workers: int = None, num_chunks: int = None, - method: PersistenceMethod | str = PersistenceMethod.MOST_RECENT, + method: PersistenceMethod | str = PersistenceMethod.MEDIAN_OF_THREE, simple_impute: bool = True, - return_raw_result: bool = False, ) -> pet_persist.PetDataArrayLike: """ - Calculate the persistence of observation + Calculate the persistence of historical observations, to be used as a baseline for other models. + + Persistence methods essentially compute either: + + a. reduce an array with multiple time indices into 1 time index, given the input with + multiple time indices (the number of time indices required, depends on the perisistence + method). ---> single time index + + b. A stochastic signal that has the maximum likelihood (depending on method) of representing + the data at the leadtime given the short amount of contextual history. E.g. this could be + the starting context using a. followed by some behaviour inferred from day cycles inferred + from the historical data. ---> multi-time indices, maybe autoregressive + + Only a. is currently supported. 
+
+ What persistence tries to answer is the following:
+
+ Given some trivial, human comprehensible methods, am "I" - this program - able to apply the
+ method(s) according to the user configuration on some limited amount of historical data to
+ produce output that is competitive (speed, memory usage, accuracy, skill etc.) to the model
+ that I'm compared against.
+
+ Because if the answer is "yes, I can match this complex algorithm", then that invalidates the
+ need for the complex algorithm, especially since persistence is explainable and bounded to
+ the observations by definition.
+
+ If the answer is "no" then the follow up is, how does this compare with other competitive
+ models, which essentially paves grounds for verification and ranking models.
+
+
+ The general idea is that we are transforming a set of user requirements and an nd-dataarray into
+ a time reduced (single time index) nd-dataarray if n > 1 (otherwise we'd just get back a single
+ scalar). In this process we would also be doing chunking, multiprocessing, and offloading to
+ a different compute backend, if requested. By default no data splicing occurs and the backend is
+ chosen to be numpy.
+
+ The above is repeated for each "variable" in the input data structure independently, where the
+ concept of a "variable" only applies in the case that the input is a `xr.Dataset` _or_ if the
+ underlying `xr.DataArray` has a "name". The results are recomposed back into the original data
+ structure with/or without variables - depending.

 (C, M, D_(TxN), I) -> D_(T'xN)
@@ -34,13 +147,10 @@ def _compute_persistence(
 Use imputation only if data is sparse and predictable.

 Args:
- ds (array-like) - required:
+ arr (array-like) - required:
 ArrayLike - supports numpy and xarray
- idx_time (int) - required :
+ idx_time (int) - required:
 the dimension for time index
- idx_chunk (int):
- the dimension used for chunking (ignored if the method flattens non-temporal
- dimensions). 
Otherwise, if not specified, automatically chooses a chunking dimension. num_workers (int): number of workers to use for processing persistence, defaults to number of cpus. num_chunks (int): @@ -50,13 +160,16 @@ def _compute_persistence( simple_impute (bool): defaults to True. Set to False if nan needs to be preserved. NOTE: methods that require multiple non-nan datapoints to function may be forced to nan. - return_raw_result (bool): - whether to return the result in the original data type. By default it returns a Dataset. Returns: - Original dataset with lead time filled with persistence values. + data in the original structure, with time dimension reduced to 1 + FUTUREWORK: - - for more complex modes (not yet implemented) the leadtimes are not constant. + + an optional stochastic signal (autoregressive function) that can be applied to derive + future values. I.e. the persistence is cached and the function is used to derive + lead-times if we want non-constant behaviour or things like confidence intervals shown + (or attempted to be). """ # for a given leadtime: # input data -> upgrade to PetDataset -> map_each_var -> _compute_persistence_single @@ -66,8 +179,7 @@ def _compute_persistence( pet_ds = pet_persist.PetDataset(arr).with_return_raw_result(return_raw_result) ds_result = pet_ds.map_each_var( _compute_persistence_single, - idx_time, - idx_chunk, + idx_time_dim, num_chunks, method, simple_impute, @@ -101,3 +213,7 @@ def _compute_persistence_single( # TODO: work chain i.e. 
slice -> impute -> compute
 # TODO: merge result
 raise NotImplementedError()
+
+
+if __name__ == "__main__":
+ raise NotImplementedError("TODO - standalone call")
diff --git a/packages/bundled_models/persistence/src/persistence/registered_model.py b/packages/bundled_models/persistence/src/persistence/registered_model.py
new file mode 100644
index 00000000..1eeebc0b
--- /dev/null
+++ b/packages/bundled_models/persistence/src/persistence/registered_model.py
@@ -0,0 +1,59 @@
+"""
+Register persistence model in zoo

+NOTE:
+
+- this is temporary compatibility with pipeline ingest to fit in with the paradigm similar to
+ FourCastNeXT.
+
+- zoo may get deprecated in favour of direct implementations in bundled models, so any interfacing
+ is intentionally lightweight, with some shortcuts.
+"""
+
+
+@pyearthtools.zoo.register("Development/Persistence", exists="ignore")
+class PersistenceRM(pyearthtools.zoo.BaseForecastModel):
+ _name = "Development/Persistence"
+
+ def __init__(
+ self,
+ *,
+ pipeline_name: str = None,
+ output: Optional[os.PathLike] = None,
+ pipeline=None,
+ lead_time: int | str,
+ **kwargs,
+ ) -> None:
+ """
+ TODO initialize persistence class with appropriate arguments
+ """
+ raise NotImplementedError("TODO")
+ super().__init__(
+ pipeline_name=pipeline_name, pipeline=pipeline, output=output, **kwargs
+ )
+
+ def load(self, **kwargs) -> tuple[Any, dict[str, Any]]:
+ """
+ TODO
+
+ - check pipeline was constructed with a TemporalWindow or equivalent Temporal* index
+ extraction methods.
+ - pass the merged indices into the persistence algorithm
+ - the return type should be a "Predictor" that accepts some kwargs
+ - for a simplistic persistence model we don't want the recurrent predictor, as the
+ internal methods already handle any splitting and stacking. 
+ - instead use the TimeWindow directly + - I'm not sure how this handles data sets + + The easiest way to do this is to: + + - look at a sample pipleline with a TemporalWindow method + - determine how to translate the variables into an output + - standardise the output to look like the original example + + FUTUREWORK + + while predictors in other cases e.g. fourcastnext have caching implemented. The strategy + needs to be considered carefully. So it will be bypassed for the initial implementation + """ + raise NotImplementedError("TODO") diff --git a/packages/bundled_models/persistence/tests/test__interface.py b/packages/bundled_models/persistence/tests/test__interface.py index 20626c89..8763f4a0 100644 --- a/packages/bundled_models/persistence/tests/test__interface.py +++ b/packages/bundled_models/persistence/tests/test__interface.py @@ -136,9 +136,6 @@ def test_chunker_multi_index_increment(): for v, i in enumerate(zip(np_start_index, np_end_index)) } da_slice = da.isel(**multi_slice) - import pdb - - pdb.set_trace() da_slice.shape From 4af0517266661a6c7c43312b1b0c852149ef052b Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Thu, 5 Mar 2026 12:46:03 +1100 Subject: [PATCH 26/28] [skip ci] slight reorganisation of package structure; update persistence_impl interface - still work in progress --- .../{daskconfig.py => config/dask.py} | 0 .../src/persistence/interface/types.py | 154 ++++++++++++++++++ .../src/persistence/persistence_impl.py | 103 ++++++++---- .../persistence/src/persistence/types.py | 14 +- 4 files changed, 240 insertions(+), 31 deletions(-) rename packages/bundled_models/persistence/src/persistence/{daskconfig.py => config/dask.py} (100%) create mode 100644 packages/bundled_models/persistence/src/persistence/interface/types.py diff --git a/packages/bundled_models/persistence/src/persistence/daskconfig.py b/packages/bundled_models/persistence/src/persistence/config/dask.py similarity index 100% rename from 
packages/bundled_models/persistence/src/persistence/daskconfig.py rename to packages/bundled_models/persistence/src/persistence/config/dask.py diff --git a/packages/bundled_models/persistence/src/persistence/interface/types.py b/packages/bundled_models/persistence/src/persistence/interface/types.py new file mode 100644 index 00000000..feca2f8d --- /dev/null +++ b/packages/bundled_models/persistence/src/persistence/interface/types.py @@ -0,0 +1,154 @@ +""" +Common data array/set transformations supported by the persistence model, the main usecase is to map +a function to each data variable independently. This is a common pattern as more often than not we +wouldn't be intermixing variables in basic pre-processing steps. + +TODO: this should be somewhere more common +""" + +from typing import Union, Generic +from collections.abc import Callable +from enum import StrEnum, auto +import xarray as xr +import numpy as np +import numpy.typing as npt + +PetDataArrayLike = Union[xr.DataArray, xr.Dataset, npt.ArrayLike] + + +class PetInputDataType(StrEnum): + XR_DATAARRAY = "xr_dataarray" + XR_DATASET = "xr_dataset" + NP_ARRAY = "np_array" + UNKNOWN = auto() + + +class PetDataset: + def __init__( + self, + arraylike: PetDataArrayLike, + dummy_varname="_dummyvarname", # used for xarray dataarrays and numpy arrays + dimnames: list[str] = None, # used only for numpy arrays + ): + """ + Takes a PetDataArrayLike and converts it to a PetDataset which is compatible with the + `map_each_var` computation. + + `dimnames` is only relevant for numpy - and only if using name-based indexing for retrieving + e.g. 
time dimension + """ + self.raw_type = PetInputDataType.UNKNOWN + self.ds = self.from_arrlike(arraylike, dummy_varname, dimnames) + self.return_raw_result = False + + def with_return_raw_result(self, return_raw_result=bool): + """ + Optionally set this to return raw array from `map_each_var` + """ + self.return_raw_result = return_raw_result + + def from_np_array( + self, arraylike: npt.ArrayLike, dummy_varname, dimnames + ) -> xr.Dataset: + self.raw_type = PetInputDataType.NP_ARRAY + return self.from_xr_dataarray( + xr.DataArray(arraylike, dims=dimnames), dummy_varname + ) + + def from_xr_dataarray(self, arraylike: xr.DataArray, dummy_varname) -> xr.Dataset: + self.raw_type = PetInputDataType.XR_DATAARRAY + return xr.Dataset({dummy_varname: arraylike}) + + def from_xr_dataset(self, arraylike: xr.Dataset) -> xr.Dataset: + self.raw_type = PetInputDataType.XR_DATASET + return arraylike + + def from_arrlike(self, arraylike, dummy_varname, dimnames) -> xr.Dataset: + # Order is important here, For example: + # xr.DataArray may be a npt.ArrayLike, but not the other way around. If we swap the order, + # the xr.DataArray constructor will never be reached. + + msg_type_error = """ + The provided data does not have a supported array type, supported array types are: + xr.DataArray, xr.Dataset and np.ndarray. + """ + + if isinstance(arraylike, xr.Dataset): + return self.from_xr_dataset(arraylike) + + if isinstance(arraylike, xr.DataArray): + return self.from_xr_dataarray(arraylike, dummy_varname) + + if isinstance(arraylike, (np.ndarray, list, tuple)): + arraylike = np.asarray(arraylike) # force convert just in case + return self.from_np_array(arraylike, dummy_varname, dimnames) + + # unsupported type + raise TypeError(msg_type_error) + + def map_each_var( + self, + _fn: Callable[[xr.DataArray, ...], xr.DataArray], + *_fn_args, + **_fn_kwargs, + ) -> PetDataArrayLike: + """ + Applies a function over each data array in the dataset. The return type will be dataset. 
+ + The return type of each function operation itself will be per variable (dataarray). + + Only functions that have common structure associated to the variables in the Dataset will + work properly. + + IMPORTANT: global attributes and special variables may not be preserved. This operation is + destructive and for intermediate computation purposes only. + + Args: + _fn: takes a DataArray as its first input arg and produces a DataArray as output + _fn_args: additional positional arguments to provide to _fn + _fn_kwargs: additional keyword arguments to provide to _fn + """ + dict_res = {} + invalid_ret_err_msg = ( + "PetDataset.map_each_var: Expect function to return a single xr.DataArray" + ) + + for k_var, v_da in self.ds.data_vars.items(): + # sense check + assert isinstance(v_da, xr.DataArray) + + da_res = _fn(v_da, *_fn_args, **_fn_kwargs) + + if not isinstance(da_res, xr.DataArray): + raise RuntimeError(invalid_ret_err_msg) + + dict_res[k_var] = da_res + + ds_res = xr.Dataset(dict_res) + + if self.return_raw_result: + return self._raw_result(ds_res) + + # return upgraded dataset by default + return ds_res + + def _raw_result(self, ds: xr.Dataset) -> PetDataArrayLike: + """ + Converts a result back into the original data structure. Down-converting is a lot safer and + so less checks required. + + NOTE: the returned datatype may have dummy names attached, as such these results are for + intermediate computation purposes only, not for operational outputs. + """ + if self.raw_type == PetDataArrayLike.UNKNOWN: + # this should not happen - _raw_result should not be called externally + raise RuntimeError("PetDataset._raw_result: Invalid raw type encountered") + elif self.raw_type == PetDataArrayLike.XR_DATASET: + # nothing to do + return ds + elif self.raw_type == PetDataArrayLike.XR_DATAARRAY: + # extract the dataarray + return ds[self._dummyvarname] + elif self.raw_type == PetDataArrayLike.NP_ARRAY: + # extract the numpy array - note this may force a memory load. 
+ return ds[self._dummyvarname].values diff --git a/packages/bundled_models/persistence/src/persistence/persistence_impl.py b/packages/bundled_models/persistence/src/persistence/persistence_impl.py index e014eda7..3efd7010 100644 --- a/packages/bundled_models/persistence/src/persistence/persistence_impl.py +++ b/packages/bundled_models/persistence/src/persistence/persistence_impl.py @@ -77,14 +77,23 @@ candidate to do this. """ +from persistence.interface import ( + PetDataArrayLike, + PersistenceComputePool, + PersistenceBackend, + PersistenceMethod, + PetDataset, +) + def predict( - arr: pet_persist.PetDataArrayLike, + arr: PetDataArrayLike, idx_time_dim: int, num_workers: int = None, num_chunks: int = None, method: PersistenceMethod | str = PersistenceMethod.MEDIAN_OF_THREE, simple_impute: bool = True, + backend_type: PersistenceBackendType = PersistenceBackendType.NUMPY, ) -> pet_persist.PetDataArrayLike: """ Calculate the persistence of historical observations, to be used as a baseline for other models. @@ -147,72 +156,110 @@ def predict( Use imputation only if data is sparse and predictable. Args: + arr (array-like) - required: ArrayLike - supports numpy and xarray + idx_time (int) - required: the dimension for time index + num_workers (int): number of workers to use for processing persistence, defaults to number of cpus. + num_chunks (int): number of chunks to use, defaults to `min(num_cpu, len(chunk_dimension))` - method (string/enum): - see `PersistenceMethod`. Supports "most_recent" (default) and "median_of_three" + + method (str | StrEnum): + The method to use to compute persistence. see `PersistenceMethod`. + Supports: + - "median_of_three" + - "most_recent" + simple_impute (bool): defaults to True. Set to False if nan needs to be preserved. NOTE: methods that require multiple non-nan datapoints to function may be forced to nan. + backend_type (str | StrEnum): + see `PersistenceBackendType`. The backend compute engine to use. 
+ Supports: + - "numpy" + Returns: - data in the original structure, with time dimension reduced to 1 + + an array (PetDataArrayLike) matched to the same specific input type in + (PetDataArrayLike), i.e. output is guaranteed to have the same type as + the input array. FUTUREWORK: - an optional stochastic signal (autoregressive function) that can be applied to derive - future values. I.e. the persistence is cached and the function is used to derive - lead-times if we want non-constant behaviour or things like confidence intervals shown - (or attempted to be). + Optionally also return and/or cache a stochastic signal (autoregressive function) that + can be applied onto the persistence output (if the given method supports it). This + allows for persistence guided by some simple derived trend (like day cycles). + + Again, its important that this stochastic trend isn't derived using complicated methods, + and hence the user cannot provide this signal - it has to be pre-derived and cached by + one of the persistence methods dynamically. """ - # for a given leadtime: - # input data -> upgrade to PetDataset -> map_each_var -> _compute_persistence_single - # TODO: - # - lead time handling for more complex methods - method = pet_persist.PersistenceMethod(method) - pet_ds = pet_persist.PetDataset(arr).with_return_raw_result(return_raw_result) - ds_result = pet_ds.map_each_var( - _compute_persistence_single, - idx_time_dim, - num_chunks, - method, - simple_impute, + if isinstance(method, str): + # force it to EnumStr - auto raises error if not compatible. + method = PersistenceMethod(method) + + # --- DEPRECATED --- + # TODO: remove with_return_raw_result from PetDataset, there's no reason to + # keep the lifted structure when the caller likely only requires the + # original structure back. 
+ # pet_ds = pet_persist.PetDataset(arr).with_return_raw_result(return_raw_result) + # --- + + # lift structure to dataset representation (higher order) + # structural order (highest to lowest) + # - xr.Dataset + # - xr.DataArray + # - np.ndarray + pet_ds = PetDataset(arr) + + raise NotImplementedError("TODO: map to persistence metadata") + + metadta = PersistenceMetadata(...) + + # apply function (ALWAYS) and destruct result (ONLYIF original array was lower order) + arr_result = pet_ds.map_each_var( + _predict_single_var, + metadata, ) - return ds_result + # safety capture for dev/test + assert type(arr) == type(arr_result) + return arr_result -def _compute_persistence_single( + +# TODO: make this ingest PersistenceMetadata instead... +def _predict_single_var( da: xr.DataArray, - idx_time: int, - idx_chunk: int = None, + idx_time_dim: int, num_chunks: int = None, - method: PersistenceMethod = PersistenceMethod.MOST_RECENT, + method: PersistenceMethod = PersistenceMethod.MEDIAN_OF_THREE, simple_impute: bool = True, ): """ Computes persistence for a single data array, has the same interface as _compute_persistence except that the first argument is a data array. """ + # create metadata + # input dataarray -> chunk -> impute -> compute persistence -> merge chunks chunker = PersistenceChunker( da_lazy=da, method=method, num_chunks=num_chunks, - idx_time=idx_time, - idx_chunk=idx_chunk, + idx_time_dim=idx_time, ) # TODO: worker pool # TODO: work chain i.e. 
slice -> impute -> compute # TODO: merge result - raise NotImplementedError() + raise NotImplementedError("TODO - some missing parts") if __name__ == "__main__": diff --git a/packages/bundled_models/persistence/src/persistence/types.py b/packages/bundled_models/persistence/src/persistence/types.py index feca2f8d..c3173326 100644 --- a/packages/bundled_models/persistence/src/persistence/types.py +++ b/packages/bundled_models/persistence/src/persistence/types.py @@ -44,6 +44,9 @@ def __init__( def with_return_raw_result(self, return_raw_result=bool): """ Optionally set this to return raw array from `map_each_var` + + NOTE: this is a special purpose function. It is currently UNUSED, but + may be useful in some rare circumstances where """ self.return_raw_result = return_raw_result @@ -108,11 +111,16 @@ def map_each_var( _fn_args: additional positional arguments to provide to _fn _fn_kwargs: additional keyword arguments to provide to _fn """ - dict_res = {} - invalid_ret_err_msg = ( + errmsg_badinputtype = "PetDataset.map_each_var: invalid input type detected" + errmsg_singlearrayret = ( "PetDataset.map_each_var: Expect function to return a single xr.DataArray" ) + if self.raw_type == PetInputDataType.UNKNOWN: + raise RuntimeError(errmsg_badinputtype) + + dict_res = {} + for k_var, v_da in self.ds.data_vars.items(): # sense check assert isinstance(v_da, xr.DataArray) @@ -120,7 +128,7 @@ def map_each_var( da_res = _fn(v_da, *_fn_args, **_fn_kwargs) if not isinstance(da_res, xr.DataArray): - raise RuntimeError(invalid_ret_err_msg) + raise RuntimeError(errmsg_singlearrayret) dict_res[k_var] = da_res From 015cb20ca31a83142a6e1decc2098348ea2a5cca Mon Sep 17 00:00:00 2001 From: Nikeeth Ramanathan Date: Thu, 5 Mar 2026 12:49:13 +1100 Subject: [PATCH 27/28] [skip ci] missed move of types.py --- .../src/persistence/interface/types.py | 14 +- .../persistence/src/persistence/types.py | 162 ------------------ 2 files changed, 11 insertions(+), 165 deletions(-) delete mode 100644 
packages/bundled_models/persistence/src/persistence/types.py diff --git a/packages/bundled_models/persistence/src/persistence/interface/types.py b/packages/bundled_models/persistence/src/persistence/interface/types.py index feca2f8d..c3173326 100644 --- a/packages/bundled_models/persistence/src/persistence/interface/types.py +++ b/packages/bundled_models/persistence/src/persistence/interface/types.py @@ -44,6 +44,9 @@ def __init__( def with_return_raw_result(self, return_raw_result=bool): """ Optionally set this to return raw array from `map_each_var` + + NOTE: this is a special purpose function. It is currently UNUSED, but + may be useful in some rare circumstances where """ self.return_raw_result = return_raw_result @@ -108,11 +111,16 @@ def map_each_var( _fn_args: additional positional arguments to provide to _fn _fn_kwargs: additional keyword arguments to provide to _fn """ - dict_res = {} - invalid_ret_err_msg = ( + errmsg_badinputtype = "PetDataset.map_each_var: invalid input type detected" + errmsg_singlearrayret = ( "PetDataset.map_each_var: Expect function to return a single xr.DataArray" ) + if self.raw_type == PetInputDataType.UNKNOWN: + raise RuntimeError(errmsg_badinputtype) + + dict_res = {} + for k_var, v_da in self.ds.data_vars.items(): # sense check assert isinstance(v_da, xr.DataArray) @@ -120,7 +128,7 @@ def map_each_var( da_res = _fn(v_da, *_fn_args, **_fn_kwargs) if not isinstance(da_res, xr.DataArray): - raise RuntimeError(invalid_ret_err_msg) + raise RuntimeError(errmsg_singlearrayret) dict_res[k_var] = da_res diff --git a/packages/bundled_models/persistence/src/persistence/types.py b/packages/bundled_models/persistence/src/persistence/types.py deleted file mode 100644 index c3173326..00000000 --- a/packages/bundled_models/persistence/src/persistence/types.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Common data array/set transformations supported by the persistence model, the main usecase is to map -a function to each data variable 
independently. This is a common pattern as more often than not we -wouldn't be intermixing variables in basic pre-processing steps. - -TODO: this should be somewhere more common -""" - -from typing import Union, Generic -from collections.abc import Callable -from enum import StrEnum, auto -import xarray as xr -import numpy as np -import numpy.typing as npt - -PetDataArrayLike = Union[xr.DataArray, xr.Dataset, npt.ArrayLike] - - -class PetInputDataType(StrEnum): - XR_DATAARRAY = "xr_dataarray" - XR_DATASET = "xr_dataset" - NP_ARRAY = "np_array" - UNKNOWN = auto() - - -class PetDataset: - def __init__( - self, - arraylike: PetDataArrayLike, - dummy_varname="_dummyvarname", # used for xarray dataarrays and numpy arrays - dimnames: list[str] = None, # used only for numpy arrays - ): - """ - Takes a PetDataArrayLike and converts it to a PetDataset which is compatible with the - `map_each_var` computation. - - `dimnames` is only relevant for numpy - and only if using name-based indexing for retrieving - e.g. time dimension - """ - self.raw_type = PetInputDataType.UNKNOWN - self.ds = self.from_arrlike(arraylike, dummy_varname, dimnames) - self.return_raw_result = False - - def with_return_raw_result(self, return_raw_result=bool): - """ - Optionally set this to return raw array from `map_each_var` - - NOTE: this is a special purpose function. 
It is currently UNUSED, but - may be useful in some rare circumstances where - """ - self.return_raw_result = return_raw_result - - def from_np_array( - self, arraylike: npt.ArrayLike, dummy_varname, dimnames - ) -> xr.Dataset: - self.raw_type = PetInputDataType.NP_ARRAY - return self.from_xr_dataarray( - xr.DataArray(arraylike, dims=dimnames), dummy_varname - ) - - def from_xr_dataarray(self, arraylike: xr.DataArray, dummy_varname) -> xr.Dataset: - self.raw_type = PetInputDataType.XR_DATAARRAY - return xr.Dataset({dummy_varname: arraylike}) - - def from_xr_dataset(self, arraylike: xr.Dataset) -> xr.Dataset: - self.raw_type = PetInputDataType.XR_DATASET - return arraylike - - def from_arrlike(self, arraylike, dummy_varname, dimnames) -> xr.Dataset: - # Order is important here, For example: - # xr.DataArray may be a npt.ArrayLike, but not the other way around. If we swap the order, - # the xr.DataArray constructor will never be reached. - - msg_type_error = """ - The provided data does not have a supported array type, supported array types are: - xr.DataArray, xr.Dataset and np.ndarray. - """ - - if isinstance(arraylike, xr.Dataset): - return self.from_xr_dataset(arraylike) - - if isinstance(arraylike, xr.DataArray): - return self.from_xr_dataarray(arraylike, dummy_varname) - - if isinstance(arraylike, (np.ndarray, list, tuple)): - arraylike = np.asarray(arraylike) # force convert just in case - return self.from_np_array(arraylike, dummy_varname, dimnames) - - # unsupported type - raise TypeError(msg_type_error) - - def map_each_var( - self, - _fn: Callable[[xr.DataArray, ...], xr.DataArray], - *_fn_args, - **_fn_kwargs, - ) -> PetDataArrayLike: - """ - Applies a function over each data array in the dataset. The return type will be dataset. - - The return type of each function operation itself will be per variable (dataarray). - - Only functions that have common structure associated to the variables in the Dataset will - work properly. 
- - IMPORTANT: global attributes and special variables may not be preserved. This operation is - destructive and for intermediate computation purposes only. - - Args: - _fn: takes a DataArray as its first input arg and produces a DataArray as output - _fn_args: additional positional arguments to provide to _fn - _fn_kwargs: additional keyword arguments to provide to _fn - """ - errmsg_badinputtype = "PetDataset.map_each_var: invalid input type detected" - errmsg_singlearrayret = ( - "PetDataset.map_each_var: Expect function to return a single xr.DataArray" - ) - - if self.raw_type == PetInputDataType.UNKNOWN: - raise RuntimeError(errmsg_badinputtype) - - dict_res = {} - - for k_var, v_da in self.ds.data_vars.items(): - # sense check - assert isinstance(v_da, xr.DataArray) - - da_res = _fn(v_da, *_fn_args, **_fn_kwargs) - - if not isinstance(da_res, xr.DataArray): - raise RuntimeError(errmsg_singlearrayret) - - dict_res[k_var] = da_res - - ds_res = xr.Dataset(dict_res) - - if self.return_raw_result: - return self._raw_result(ds_res) - - # return upgraded dataset by default - return ds_res - - def _raw_result(self, ds: xr.Dataset) -> PetDataArrayLike: - """ - Converts a result back into the original data structure. Down-converting is a lot safer and - so less checks required. - - NOTE: the returned datatype may have dummy names attached, as such these results are for - intermediate computation purposes only, not for operational outputs. - """ - if self.raw_type == PetDataArrayLike.UNKNOWN: - # this should not happen - _raw_result should not be called externally - raise RuntimeError("PetDataset._raw_result: Invalid raw type encountered") - elif self.raw_type == PetDataArrayLike.XR_DATASET: - # nothing to do - return ds - elif self.raw_type == PetDataArrayLike.XR_DATAARRAY: - # extract the dataarray - return ds[self._dummyvarname] - elif self.raw_type == PetDataArrayLike.NP_ARRAY: - # extract the numpy array - note this may force a memory load. 
-        return ds[self._dummyvarname].values

From 16477b41428622fc3c0c9f6b68e50feb2796e6d6 Mon Sep 17 00:00:00 2001
From: Nikeeth Ramanathan
Date: Thu, 5 Mar 2026 12:57:15 +1100
Subject: [PATCH 28/28] [skip ci] improved explanation of
 `with_return_raw_result`

---
 .../src/persistence/interface/types.py        | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/packages/bundled_models/persistence/src/persistence/interface/types.py b/packages/bundled_models/persistence/src/persistence/interface/types.py
index c3173326..e91ba438 100644
--- a/packages/bundled_models/persistence/src/persistence/interface/types.py
+++ b/packages/bundled_models/persistence/src/persistence/interface/types.py
@@ -39,14 +39,46 @@ def __init__(
         """
         self.raw_type = PetInputDataType.UNKNOWN
         self.ds = self.from_arrlike(arraylike, dummy_varname, dimnames)
-        self.return_raw_result = False
+        self.return_raw_result = True
 
-    def with_return_raw_result(self, return_raw_result=bool):
+    def with_return_raw_result(self, return_raw_result: bool = True):
         """
         Optionally set this to return raw array from `map_each_var`
 
-        NOTE: this is a special purpose function. It is currently UNUSED, but
-        may be useful in some rare circumstances where
+        NOTE: this is a special purpose function. It is useful when multiple operations that take in
+        PetDataArrayLike are chained, in which case self.return_raw_result = False will have some
+        slight performance benefit; otherwise you'd have to do:
+
+        ```
+        pd1 = PetDataset(arr)
+        res1 = pd1.map_each_var(fn1)
+        pd2 = PetDataset(res1) # each of these calls incurs overhead.
+        res2 = pd2.map_each_var(fn2)
+        ```
+
+        Instead, by setting `with_return_raw_result(False)` we can chain methods:
+
+        ```
+        pet_ds = PetDataset(arr)
+        # no overhead since the return type of each method is already a PetDataset
+        result = pet_ds.map_each_var(fn1).map_each_var(fn2)...
+        ```
+
+        Finally we can set:
+
+        ```
+        raw_result =
+            pet_ds.map_each_var(fn1)
+            .map_each_var(fn2)
+            ...
+ .with_return_raw_result() + .map_each_var(final_fn) + ``` + + if we explicitly need the raw result at the end. + + The default (True) is always to return the original array type. This would be the case for + most one-off computations. """ self.return_raw_result = return_raw_result