Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ jobs:
test:
strategy:
matrix:
os: [ubuntu-latest, windows-latest] # 'macos-latest' failing on github-ci with memory error
os: [ubuntu-latest, windows-latest, macos-latest] # failing on github-ci with memory error
backend: ['torch','jax'] # 'tensorflow' is excluded since on github-ci tests get stuck for a yet unknown reason
python-version: ['3.10','3.11','3.12']
# macos-latest:
# - misses support for ratarmount > index_zstd
# - for >=3.13 indexed_zstd/libzstd-seek/zstd-seek.h:20:10: fatal error: zstd.h: No such file or directory
python-version: ['3.10','3.11','3.12','3.13','3.14']
runs-on: ${{ matrix.os }}
steps:
- name: Remove unnecessary tools (Linux)
Expand All @@ -34,11 +37,19 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies ${{ matrix.os }}
if: runner.os == 'Linux'
run: |
# ensure zstd.h is available
sudo apt update && sudo apt install -y libzstd-dev

- name: Install pip
run: python -m pip install -U pip

- name: Install package
run: python -m pip install -e .[ml,dev,test]
run: |
python -m pip install -e .[dev,test]
python -m pip install ${{ matrix.backend }}

# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/accessing-contextual-information-about-workflow-runs#runner-context
- name: Set Env (Windows)
Expand All @@ -57,6 +68,13 @@ jobs:
run: |
echo "KERAS_BACKEND=${{ matrix.backend }}" >> $GITHUB_ENV
echo "PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0" >> $GITHUB_ENV
if [ -d "/opt/homebrew/bin" ]; then
echo "/opt/homebrew/bin" >> $GITHUB_PATH
elif [ -d "/usr/local/bin" ]; then
echo "/usr/local/bin" >> $GITHUB_PATH
fi
brew install zstd


# Run keras backend specific tests (for keras>=3)
# Due to space issues avoid creating too many venvs (via tox)
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ repos:

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.14.3
rev: v0.15.1
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

# https://pycqa.github.io/isort/docs/configuration/black_compatibility.html
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 8.0.1
hooks:
- id: isort
args: ["--profile", "black", "--filter-files"]
Expand Down
2 changes: 2 additions & 0 deletions docs/examples/damast-process-pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

from damast.core import DataProcessingPipeline
from damast.data_handling.transformers import AddDeltaTime
from damast.domains.maritime.transformers.features import DeltaDistance, Speed
Expand Down
8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"License :: OSI Approved :: BSD License",
]

Expand All @@ -44,12 +46,14 @@ dependencies = [
"networkx",
"numba",
"numpy>=2",
"pandas",
"polars>=1.36.1",
"psutil",
"pyais",
"pyarrow",
"pydantic>=2.0",
"ratarmount>=1.1",
"pydot",
"ratarmount>=1.2.1; platform_system != 'Darwin'",
"scikit-learn",
"tables",
"tqdm",
Expand Down Expand Up @@ -81,7 +85,7 @@ ml = [
# BEGIN keras backends
"torch",
"jax[cpu]",
"tensorflow"
"tensorflow; python_version <= '3.13'"
# END backends
]

Expand Down
4 changes: 2 additions & 2 deletions src/damast/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
describe,
input,
output,
)
)
from .metadata import (
ArtifactSpecification,
DataSpecification,
History,
MetaData,
ValidationMode,
)
)

__all__ = [
"AnnotatedDataFrame",
Expand Down
2 changes: 1 addition & 1 deletion src/damast/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
DAMAST_CSV_DEFAULT_ARGS,
DAMAST_SPEC_SUFFIX,
DAMAST_SUPPORTED_FILE_FORMATS,
)
)
from .data_description import ListOfValues, MinMax
from .metadata import DataSpecification, MetaData, ValidationMode
from .types import DataFrame, XDataFrame
Expand Down
2 changes: 1 addition & 1 deletion src/damast/core/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
DECORATED_DESCRIPTION,
DECORATED_INPUT_SPECS,
DECORATED_OUTPUT_SPECS,
)
)
from .metadata import ArtifactSpecification, DataSpecification
from .transformations import PipelineElement

Expand Down
6 changes: 3 additions & 3 deletions src/damast/core/polars_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def open(cls, path: str | Path, sep = ',') -> polars.LazyFrame:
raise ValueError(f"{cls.__name__}.load_data: Unsupported input file format {path.suffix}")

@classmethod
def from_vaex_hdf5(cls, path: str | Path) -> tuple[polars.LazyFrame, 'MetaData']:
def from_vaex_hdf5(cls, path: str | Path) -> tuple[polars.LazyFrame, 'MetaData']: # noqa
"""
Load hdf5 file and (damast) metadata if found in the file.
"""
Expand Down Expand Up @@ -337,7 +337,7 @@ def from_vaex_hdf5(cls, path: str | Path) -> tuple[polars.LazyFrame, 'MetaData']
return polars.LazyFrame(data), metadata

@classmethod
def import_netcdf(cls, path: list[str|Path]) -> tuple[polars.LazyFrame, dict[str, 'MetaData']]:
def import_netcdf(cls, path: list[str|Path]) -> tuple[polars.LazyFrame, dict[str, 'MetaData']]: #noqa

ensure_packages(pkgs=["dask", "xarray", "pandas"],
required_for="Loading netcdf files",
Expand All @@ -356,7 +356,7 @@ def import_netcdf(cls, path: list[str|Path]) -> tuple[polars.LazyFrame, dict[str
return df.lazy(), {}

@classmethod
def import_hdf5(cls, files: str | Path | list[str|Path]) -> tuple[polars.LazyFrame, dict[str, 'MetaData']]:
def import_hdf5(cls, files: str | Path | list[str|Path]) -> tuple[polars.LazyFrame, dict[str, 'MetaData']]: # noqa
"""
Import a dataframe stored as HDF5.

Expand Down
6 changes: 3 additions & 3 deletions src/damast/core/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
DECORATED_DESCRIPTION,
DECORATED_INPUT_SPECS,
DECORATED_OUTPUT_SPECS,
)
)
from .formatting import DEFAULT_INDENT


Expand All @@ -43,13 +43,13 @@ def fit_transform(self, df: AnnotatedDataFrame, other: AnnotatedDataFrame | None

class PipelineElement(Transformer):
#: Pipeline in which context this processor will be run
parent_pipeline: 'DataProcessingPipeline'
parent_pipeline: 'DataProcessingPipeline' #noqa

#: Map names of input and outputs for a particular pipeline
_name_mappings: dict[str, dict[str, str]]

#: Map names of datasource (arguments) to a specific (extra) transformer arguments
def set_parent(self, pipeline: 'DataProcessingPipeline'):
def set_parent(self, pipeline: 'DataProcessingPipeline'): #noqa
"""
Sets the parent pipeline for this pipeline element

Expand Down
26 changes: 19 additions & 7 deletions src/damast/data_handling/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
import random
import sys
import time
from typing import Any, List, Optional, Union

Expand All @@ -13,6 +14,7 @@
import polars as pl

from damast.core.types import DataFrame, XDataFrame
from damast.ml import keras

__all__ = [
"GroupSequenceAccessor",
Expand All @@ -21,6 +23,16 @@
logger = logging.getLogger("damast")


if sys.platform == "darwin":
# Handle "Cannot convert a MPS Tensor to float64 dtype as the MPS framework doesn't support float64. Please use float32 instead"
def _mps_precision(data):
if data.dtype == np.float64:
return data.astype(np.float32)
return data
else:
def _mps_precision(data):
return data

# https://www.tensorflow.org/tutorials/structured_data/time_series
class GroupSequenceAccessor:
"""
Expand Down Expand Up @@ -60,7 +72,7 @@ def __init__(self,
self.groups = df.unique(group_column)

if sort_columns is not None:
self.sort_columns = sort_columns if type(sort_columns) == list else [sort_columns]
self.sort_columns = sort_columns if type(sort_columns) is list else [sort_columns]
else:
self.sort_columns = sort_columns

Expand Down Expand Up @@ -178,7 +190,7 @@ def to_keras_generator(self, features: List[str],
f" got {datatypes}")

if use_target:
target = target if type(target) == list else [target]
target = target if type(target) is list else [target]

if sequence_forecast < 0:
raise ValueError(f"{self.__class__.__name__}: Sequence forecast cannot be negative")
Expand Down Expand Up @@ -219,7 +231,7 @@ def _generator(features: List[str], target: Optional[List[str]],
all_columns += self.sort_columns
use_target = target is not None
if use_target:
target = target if type(target) == list else target
target = target if type(target) is list else target
all_columns += target
else:
# If no target, we are not forecasting
Expand Down Expand Up @@ -288,12 +300,12 @@ def _generator(features: List[str], target: Optional[List[str]],
# target it the last step in the timeline, so the last
target_chunk.append(target_window.to_numpy())

X = np.array(chunk)
X = _mps_precision(np.array(chunk))
if use_target:
if np.lib.NumpyVersion(np.__version__) >= '2.0.0':
y = np.array(target_chunk)
y = _mps_precision(np.array(target_chunk))
else:
y = np.array(target_chunk, copy=False)
y = _mps_precision(np.array(target_chunk, copy=False))
yield (X, y)
else:
yield (X,)
Expand Down Expand Up @@ -408,7 +420,7 @@ def to_keras_generator(self, features,
raise ValueError(f"{self.__class__.__name__}: Sequence forecast cannot be negative")

if use_target:
target = target if type(target) == list else target
target = target if type(target) is list else target
if sequence_forecast == 0:
raise ValueError(f"{self.__class__.__name__}: Cannot do extract targets with no sequence forecast")
else:
Expand Down
2 changes: 1 addition & 1 deletion src/damast/data_handling/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
ChangeTypeColumn,
JoinDataFrameByColumn,
MultiplyValue,
)
)
from .filters import DropMissingOrNan, FilterWithin, RemoveValueRows
from .normalizers import normalize

Expand Down
2 changes: 1 addition & 1 deletion src/damast/data_handling/transformers/augmenters.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ def __init__(self, new_type: Any):
"""Constructor"""
if isinstance(new_type, pl.datatypes.classes.DataTypeClass):
self.new_type = new_type
elif type(new_type) == str:
elif type(new_type) is str:
polar_type = new_type.capitalize()
if not hasattr(pl.datatypes.classes, polar_type):
raise TypeError(f"Type {new_type} has not correspondence in 'polars'")
Expand Down
2 changes: 1 addition & 1 deletion src/damast/domains/maritime/ais/data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
ColumnName,
CourseOverGround,
SpeedOverGround,
)
)

_log: Logger = getLogger(__name__)
_log.setLevel(INFO)
Expand Down
2 changes: 1 addition & 1 deletion src/damast/domains/maritime/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
DropMissingOrNan,
FilterWithin,
RemoveValueRows,
)
)
from damast.data_handling.transformers.augmenters import AddLocalIndex
from damast.domains.maritime.ais import vessel_types
from damast.domains.maritime.transformers import AddVesselType, ComputeClosestAnchorage
Expand Down
2 changes: 1 addition & 1 deletion src/damast/domains/maritime/transformers/augmenters.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def load_data(cls,
try:
return XDataFrame.open(path=filename, sep=sep)
except FileNotFoundError:
raise RuntimeError(f"{cls}: Vessel type information not accessible. File {vessel_type_csv} not found")
raise RuntimeError(f"{cls}: Vessel type information not accessible. File {filename} not found")

@damast.core.describe("Compute distance from dataset to closest anchorage")
@damast.core.input({"x": {"representation_type": float, "unit": damast.core.units.units.deg},
Expand Down
35 changes: 35 additions & 0 deletions src/damast/ml/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import importlib
import logging
import os

logger = logging.getLogger(__name__)

# To ensure that autoloading works as expected
# from damast.ml import keras
def backend_available(backend: str) -> bool:
try:
importlib.import_module(backend)
return True
except ImportError:
return False

def autodiscover_backend(priority: list[str] = ["torch", "tensorflow", "jax"]):
for backend in priority:
if backend_available(backend):
logger.warning(f"Autoloading backend: {backend} - to explicity select backend set env variable KERAS_BACKEND")
os.environ['KERAS_BACKEND'] = backend
break
else:
logger.info(f"Ignoring backend: {backend}, library is not available")

if "KERAS_BACKEND" in os.environ:
backend = os.environ['KERAS_BACKEND']
if not backend_available(backend):
raise RuntimeError(
"Keras backend: select backend '{backend}' is not available. "
"Please install the required package(s)."
)
else:
autodiscover_backend()

import keras # noqa
Loading
Loading