From a177d3535086990d66d337e0a8855f5d30a41ef5 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Thu, 26 Feb 2026 16:07:37 -0500 Subject: [PATCH 1/2] Folder name matching --- .github/copilot-instructions.md | 80 +++++++++++++++++++ datasets/ecmwf-forecast/dataset.yaml | 6 +- pctasks/dataset/pctasks/dataset/models.py | 1 + .../dataset/pctasks/dataset/splits/task.py | 1 + 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 000000000..cceb6c8ce --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,80 @@ +# Project Guidelines — Planetary Computer Tasks (pctasks) + +## Architecture + +Monorepo of 12 Python namespace packages under `pctasks/`, each independently installable via `hatchling`. Dependency order: `core → cli → task → client → ingest → ingest_task → dataset → run → notify → router → server → dev`. Additional code in `pctasks_funcs/` (Azure Functions), `datasets/` (~60 dataset definitions), and `deployment/` (Terraform/Helm). + +**Core data flow**: blob storage assets → splits (by prefix) → chunks (file listing CSVs) → `Collection.create_item()` produces pystac Items → ingest into PgSTAC via pypgstac. + +Key abstractions: +- **`PCBaseModel`** (`pctasks/core/pctasks/core/models/base.py`): Pydantic v2 base with YAML serde, `by_alias=True`, `exclude_none=True`. +- **`Task[T, U]`** (`pctasks/task/pctasks/task/task.py`): Generic abstract task, define `_input_model`/`_output_model` and `run(input, context)`. +- **`Collection`** (`pctasks/dataset/pctasks/dataset/collection.py`): Abstract base for datasets; implement `create_item(cls, asset_uri, storage_factory) -> List[pystac.Item] | WaitTaskResult` as a classmethod. +- **`StorageFactory`**: Creates `Storage` objects from `blob://<account>/<container>/<path>` URIs.
+- **Settings**: `pydantic-settings` with double-underscore env vars (`PCTASKS__COSMOSDB__URL`, `PCTASKS_RUN__TASK_RUNNER_TYPE`). + +## Code Style + +- **Formatter**: black (default settings) + isort. Run `scripts/format`. +- **Linter**: flake8 with `max-line-length = 120`, ignores `E203, W503, E731, E722` (see `.flake8`). +- **Type checking**: mypy with `disallow_untyped_defs = True` — all functions must have type annotations (see `mypy.ini`). +- **No docstrings** unless explicitly requested. +- All models extend `PCBaseModel` (Pydantic v2). Use `model_validate`, `model_dump`, `field_validator`, `model_validator`. + +## Build and Test + +```bash +scripts/install # Create venv with uv, install all packages in editable mode +scripts/server # Start local dev stack (Azurite, PgSTAC, server, STAC API) +scripts/test # Full lint + test suite (runs in Docker via docker-compose.console.yml) +scripts/test --subpackage core # Test one package +scripts/test --test-only # Skip lint +scripts/test --lint-only # Skip tests +scripts/format # Auto-format all packages (black + isort) +``` + +Per-package test commands (from within `pctasks/<package>/`): +```bash +pytest tests/ # Run package tests +mypy --config-file ../../mypy.ini pctasks # Type-check +``` + +## Project Conventions + +### Dataset directory pattern (`datasets/<dataset>/`) +``` +dataset.yaml # Required: dataset definition (id, image, collections, asset_storage, chunks) +<module>.py # Required: Collection subclass with create_item classmethod +collection/template.json # STAC collection Jinja2 template +test_<module>.py # Tests: import collection class, call create_item with blob URIs +requirements.txt # Optional: extra pip deps +Dockerfile # Optional: custom image (otherwise uses pctasks-task-base) +``` + +Dataset YAML references code via `${{ local.path(./<module>.py) }}` and class via `<module>:<ClassName>` (no `.py`). + +### Templating syntax in YAML +- `${{ args.<name> }}` — CLI arguments +- `${{ secrets.<name>
}}` — KeyVault/dev-secrets.yaml +- `${{ local.path(<path>) }}` — Path relative to YAML file +- `${{ local.file(<path>) }}` — Inline file contents + +### CLI +Click-based with plugin discovery via `pctasks.commands` entry point group. Each package registers subcommands in its `pyproject.toml`. + +### Testing patterns +- pytest with `asyncio_mode = auto` (see `pytest.ini`) +- Dataset tests are co-located: `datasets/<dataset>/test_<module>.py` +- Package tests: `pctasks/<package>/tests/` +- Integration tests: `tests/` +- Use `StorageFactory()` in tests for blob access; `SimpleWorkflowExecutor` for local workflow execution. + +## Integration Points + +- **Azure Blob Storage**: All asset URIs use `blob://<account>/<container>/<path>` scheme. +- **Cosmos DB**: Workflow/run state persistence; emulated locally via docker-compose. +- **PgSTAC**: STAC item storage (PostgreSQL + PostGIS), port 5499 locally. +- **Azurite**: Local Azure Storage emulator, ports 10000-10002. +- **Argo Workflows**: Production workflow orchestration on Kubernetes. +- **Azure Batch**: Production task execution. +- **Kind cluster**: Local Kubernetes for Argo testing (`scripts/cluster setup`).
diff --git a/datasets/ecmwf-forecast/dataset.yaml b/datasets/ecmwf-forecast/dataset.yaml index 294878396..ab75fb629 100644 --- a/datasets/ecmwf-forecast/dataset.yaml +++ b/datasets/ecmwf-forecast/dataset.yaml @@ -1,8 +1,9 @@ id: ecmwf_forecast -image: ${{ args.registry }}/pctasks-ecmwf-forecast:2026.01.12 +image: ${{ args.registry }}/pctasks-ecmwf-forecast:2026.02.26 args: - registry + - year_prefix code: src: ${{ local.path(./ecmwf_forecast.py) }} @@ -17,6 +18,9 @@ collections: asset_storage: - uri: blob://ai4edataeuwest/ecmwf/ chunks: + splits: + - depth: 1 + folder_matches: ^${{ args.year_prefix }} options: # currently excluding "aifs", in favor of "ifs" # Could put that in a different collection, or modify diff --git a/pctasks/dataset/pctasks/dataset/models.py b/pctasks/dataset/pctasks/dataset/models.py index a8838ba04..f7a0bc56f 100644 --- a/pctasks/dataset/pctasks/dataset/models.py +++ b/pctasks/dataset/pctasks/dataset/models.py @@ -42,6 +42,7 @@ class SplitDefinition(PCBaseModel): prefix: Optional[str] = None depth: int + folder_matches: Optional[str] = None class ChunkOptions(PCBaseModel): diff --git a/pctasks/dataset/pctasks/dataset/splits/task.py b/pctasks/dataset/pctasks/dataset/splits/task.py index 35425c4e8..e2451dee7 100644 --- a/pctasks/dataset/pctasks/dataset/splits/task.py +++ b/pctasks/dataset/pctasks/dataset/splits/task.py @@ -50,6 +50,7 @@ def run( max_depth=split_config.depth, min_depth=split_config.depth, name_starts_with=split_prefix, + folder_matches=split_config.folder_matches, ): print(".", end="", flush=True) # Avoid walking through the same prefix twice From 16522cf3fef25efb7f7a83b42c57a58733405af5 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Thu, 26 Feb 2026 16:16:36 -0500 Subject: [PATCH 2/2] Use hyphen --- datasets/ecmwf-forecast/dataset.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/ecmwf-forecast/dataset.yaml b/datasets/ecmwf-forecast/dataset.yaml index ab75fb629..347027790 100644 --- 
a/datasets/ecmwf-forecast/dataset.yaml +++ b/datasets/ecmwf-forecast/dataset.yaml @@ -3,7 +3,7 @@ image: ${{ args.registry }}/pctasks-ecmwf-forecast:2026.02.26 args: - registry - - year_prefix + - year-prefix code: src: ${{ local.path(./ecmwf_forecast.py) }} @@ -20,7 +20,7 @@ collections: chunks: splits: - depth: 1 - folder_matches: ^${{ args.year_prefix }} + folder_matches: ^${{ args.year-prefix }} options: # currently excluding "aifs", in favor of "ifs" # Could put that in a different collection, or modify