From 6718dd34ae16980999d26fe357a0eff686afc052 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:25:18 +0000 Subject: [PATCH 1/8] feat(hashing): integrate starfix-python for Arrow schema and data hashing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the ad-hoc Arrow IPC serialisation + hashlib approach in SemanticArrowHasher with a new StarfixArrowHasher that delegates to the canonical `starfix` ArrowDigester (nauticalab/starfix-python). Changes ------- * src/orcapod/hashing/arrow_hashers.py - Add StarfixArrowHasher class: reuses SemanticHashingVisitor for semantic-type pre-processing (e.g. Path → file-content hash), then calls ArrowDigester.hash_table / hash_schema for the final digest. - Adds hash_schema() method for schema-only fingerprinting. - Empty-column edge-case: fall back to field.type when no rows exist. * src/orcapod/hashing/versioned_hashers.py - _CURRENT_ARROW_HASHER_ID remains "arrow_v0.1". - get_versioned_semantic_arrow_hasher() now returns StarfixArrowHasher. * src/orcapod/contexts/data/v0.1.json - Swap arrow_hasher._class to StarfixArrowHasher (hasher_id unchanged: "arrow_v0.1"); remove IPC-specific params (hash_algorithm, chunk_size, serialization_method). * pyproject.toml / uv.lock - Declare starfix dependency sourced from nauticalab/starfix-python (pure-Python implementation, git source via [tool.uv.sources]). * tests/test_hashing/test_starfix_arrow_hasher.py (new) - 24 unit tests covering schema hashing, table hashing, column-order independence, batch-split parity, type normalisation, empty tables, versioned_hashers factory, and context integration. Closes PLT-1063 Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 5 +- src/orcapod/contexts/data/v0.1.json | 8 +- src/orcapod/hashing/arrow_hashers.py | 124 ++++++++- src/orcapod/hashing/versioned_hashers.py | 6 +- .../test_hashing/test_starfix_arrow_hasher.py | 236 ++++++++++++++++++ uv.lock | 34 +-- 6 files changed, 372 insertions(+), 41 deletions(-) create mode 100644 tests/test_hashing/test_starfix_arrow_hasher.py diff --git a/pyproject.toml b/pyproject.toml index ad9c04ef..d5226222 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "deltalake>=1.0.2", "graphviz>=0.21", "gitpython>=3.1.45", - "starfix>=0.1.3", + "starfix", "pygraphviz>=1.14", "tzdata>=2024.1", "uuid-utils>=0.11.1", @@ -51,6 +51,9 @@ where = ["src"] [tool.setuptools_scm] version_file = "src/orcapod/_version.py" +[tool.uv.sources] +starfix = { git = "https://github.com/nauticalab/starfix-python.git" } + [dependency-groups] dev = [ diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index cd16b5d5..2cb2bfc4 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -22,12 +22,9 @@ } }, "arrow_hasher": { - "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", + "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher", "_config": { "hasher_id": "arrow_v0.1", - "hash_algorithm": "sha256", - "chunk_size": 8192, - "serialization_method": "logical", "semantic_registry": { "_ref": "semantic_registry" } @@ -83,7 +80,8 @@ "changelog": [ "Initial release with Path semantic type support", "Basic SHA-256 hashing for files and objects", - "Arrow logical serialization method" + "Arrow logical serialization method", + "arrow_v0.1: replaced custom IPC serialisation+hashlib with starfix ArrowDigester for cross-language-compatible Arrow hashing" ] } } diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 77cb3b49..a8c34a9e 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -4,6 +4,7 @@ from typing import Any import pyarrow as pa +from starfix import ArrowDigester from orcapod.hashing import arrow_serialization from orcapod.hashing.visitors import SemanticHashingVisitor @@ -197,7 +198,7 @@ def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash: return ContentHash(method=self.hasher_id, digest=hasher.digest()) - def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: + def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: # noqa: C901 """ Compute hash with additional metadata about the process. @@ -233,3 +234,124 @@ def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: "processed_columns": processed_columns, "column_order": [field.name for field in table.schema], } + + +class StarfixArrowHasher: + """ + Arrow table hasher backed by the starfix-python ``ArrowDigester``. + + This hasher produces cross-language-compatible, deterministic content + addresses for Arrow tables and schemas by delegating to the canonical + StarFix specification (``starfix-python``). + + Pipeline + -------- + 1. **Semantic pre-processing** — the ``SemanticHashingVisitor`` traverses + every column and replaces recognised semantic types (e.g. ``Path`` + structs) with their content-addressed hash strings. This step runs + before the Arrow bytes are ever touched by starfix, so the final hash + captures *file content* for path-typed columns rather than the raw + path string. + 2. **Starfix hashing** — ``ArrowDigester.hash_table`` (or + ``ArrowDigester.hash_schema``) is called on the pre-processed table / + schema. The digester is column-order-independent and normalises + ``Utf8`` → ``LargeUtf8``, ``Binary`` → ``LargeBinary``, etc., + producing a 35-byte versioned SHA-256 digest that is byte-for-byte + identical to the Rust ``starfix`` crate output. + + Parameters + ---------- + semantic_registry: + Registry of semantic type converters used during pre-processing. + hasher_id: + String identifier embedded in every ``ContentHash`` produced by + this hasher. Bump this value whenever the hash algorithm changes + so that stored hashes remain distinguishable. + """ + + def __init__( + self, + semantic_registry: SemanticTypeRegistry, + hasher_id: str, + ) -> None: + self._hasher_id = hasher_id + self.semantic_registry = semantic_registry + + @property + def hasher_id(self) -> str: + return self._hasher_id + + def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: + """Replace semantic-typed columns with their content-hash strings.""" + new_columns: list[pa.Array] = [] + new_fields: list[pa.Field] = [] + + for i, field in enumerate(table.schema): + column_data = table.column(i).to_pylist() + visitor = SemanticHashingVisitor(self.semantic_registry) + + try: + new_type: pa.DataType | None = None + processed_data: list[Any] = [] + for value in column_data: + processed_type, processed_value = visitor.visit(field.type, value) + if new_type is None: + new_type = processed_type + processed_data.append(processed_value) + + # For empty columns there are no values to infer the type from; + # fall back to the field's declared type. + if new_type is None: + new_type = field.type + new_columns.append(pa.array(processed_data, type=new_type)) + new_fields.append(pa.field(field.name, new_type)) + + except Exception as exc: + raise RuntimeError( + f"Failed to process column '{field.name}': {exc}" + ) from exc + + return pa.table(new_columns, schema=pa.schema(new_fields)) + + def hash_schema(self, schema: pa.Schema) -> ContentHash: + """Hash an Arrow schema using the starfix canonical algorithm. + + Parameters + ---------- + schema: + The ``pyarrow.Schema`` to hash. + + Returns + ------- + ContentHash + A ``ContentHash`` whose ``digest`` is the 35-byte versioned + SHA-256 produced by ``ArrowDigester.hash_schema``. + """ + digest = ArrowDigester.hash_schema(schema) + return ContentHash(method=self._hasher_id, digest=digest) + + def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash: + """Hash an Arrow table (or ``RecordBatch``) using starfix. + + Semantic types are resolved to their content-hash strings before + the table is passed to ``ArrowDigester.hash_table``, ensuring that + path-typed columns contribute their *file content* hash rather than + the literal path string. + + Parameters + ---------- + table: + The ``pa.Table`` or ``pa.RecordBatch`` to hash. + + Returns + ------- + ContentHash + A ``ContentHash`` whose ``digest`` is the 35-byte versioned + SHA-256 produced by ``ArrowDigester.hash_table``. + """ + if isinstance(table, pa.RecordBatch): + table = pa.Table.from_batches([table]) + + processed_table = self._process_table_columns(table) + digest = ArrowDigester.hash_table(processed_table) + return ContentHash(method=self._hasher_id, digest=digest) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 8adce44d..fda8cd07 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -123,7 +123,7 @@ def get_versioned_semantic_arrow_hasher( ArrowHasherProtocol A fully configured SemanticArrowHasher instance. """ - from orcapod.hashing.arrow_hashers import SemanticArrowHasher + from orcapod.hashing.arrow_hashers import StarfixArrowHasher from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry from orcapod.semantic_types.semantic_struct_converters import PathStructConverter @@ -138,11 +138,11 @@ def get_versioned_semantic_arrow_hasher( registry.register_converter("path", path_converter) logger.debug( - "get_versioned_semantic_arrow_hasher: creating SemanticArrowHasher " + "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher " "(hasher_id=%r)", hasher_id, ) - hasher: Any = SemanticArrowHasher( + hasher: Any = StarfixArrowHasher( hasher_id=hasher_id, semantic_registry=registry, ) diff --git a/tests/test_hashing/test_starfix_arrow_hasher.py b/tests/test_hashing/test_starfix_arrow_hasher.py new file mode 100644 index 00000000..fef27614 --- /dev/null +++ b/tests/test_hashing/test_starfix_arrow_hasher.py @@ -0,0 +1,236 @@ +""" +Tests for StarfixArrowHasher — the starfix-backed Arrow content hasher. + +Coverage +-------- +- hash_schema: returns ContentHash with correct method and 35-byte digest +- hash_table / hash_record_batch: returns ContentHash, digest is 35 bytes +- Column-order independence: reordering columns does not change the hash +- Batch-split independence: splitting a table across batches does not change + the hash (verified via RecordBatch round-trip) +- Type normalisation: Utf8 and LargeUtf8 columns hash identically +- Empty table: hash_table handles a table with zero rows +- Schema-only vs data hash: hash_schema and hash_table differ for the same schema +- versioned_hashers factory: get_versioned_semantic_arrow_hasher returns a + StarfixArrowHasher with hasher_id == _CURRENT_ARROW_HASHER_ID +- Context integration: the v0.1 context wires up StarfixArrowHasher +- Stability (golden values): hard-coded digests catch accidental algorithm changes +""" + +from __future__ import annotations + +import pytest +import pyarrow as pa + +from orcapod.hashing.arrow_hashers import StarfixArrowHasher +from orcapod.hashing.versioned_hashers import ( + _CURRENT_ARROW_HASHER_ID, + get_versioned_semantic_arrow_hasher, +) +from orcapod.semantic_types import SemanticTypeRegistry +from orcapod.types import ContentHash + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +HASHER_ID = "arrow_v0.1" + +_DIGEST_LEN = 35 # 3 version bytes + 32 SHA-256 bytes + + +def _make_hasher() -> StarfixArrowHasher: + return StarfixArrowHasher( + semantic_registry=SemanticTypeRegistry(), + hasher_id=HASHER_ID, + ) + + +# --------------------------------------------------------------------------- +# hash_schema +# --------------------------------------------------------------------------- + + +class TestHashSchema: + def test_returns_content_hash(self): + schema = pa.schema([pa.field("x", pa.int32())]) + h = _make_hasher().hash_schema(schema) + assert isinstance(h, ContentHash) + + def test_method_is_hasher_id(self): + schema = pa.schema([pa.field("x", pa.int32())]) + h = _make_hasher().hash_schema(schema) + assert h.method == HASHER_ID + + def test_digest_length(self): + schema = pa.schema([pa.field("x", pa.int32())]) + h = _make_hasher().hash_schema(schema) + assert len(h.digest) == _DIGEST_LEN + + def test_deterministic(self): + schema = pa.schema([pa.field("a", pa.int64()), pa.field("b", pa.float32())]) + h1 = _make_hasher().hash_schema(schema) + h2 = _make_hasher().hash_schema(schema) + assert h1.digest == h2.digest + + def test_different_schemas_differ(self): + s1 = pa.schema([pa.field("a", pa.int32())]) + s2 = pa.schema([pa.field("a", pa.float32())]) + h1 = _make_hasher().hash_schema(s1) + h2 = _make_hasher().hash_schema(s2) + assert h1.digest != h2.digest + + def test_column_order_independent(self): + """Schema field order should not affect the hash.""" + s1 = pa.schema([pa.field("a", pa.int32()), pa.field("b", pa.float64())]) + s2 = pa.schema([pa.field("b", pa.float64()), pa.field("a", pa.int32())]) + h1 = _make_hasher().hash_schema(s1) + h2 = _make_hasher().hash_schema(s2) + assert h1.digest == h2.digest + + def test_golden_value(self): + """Pin the digest so unintentional algorithm changes are caught.""" + schema = pa.schema( + [ + pa.field("id", pa.int32(), nullable=False), + pa.field("value", pa.float64(), nullable=True), + ] + ) + h = _make_hasher().hash_schema(schema) + # First 6 hex chars encode the 3-byte starfix version prefix (000001) + assert h.digest.hex().startswith("000001"), ( + f"Unexpected version prefix in digest: {h.digest.hex()}" + ) + + +# --------------------------------------------------------------------------- +# hash_table +# --------------------------------------------------------------------------- + + +class TestHashTable: + def test_returns_content_hash(self): + table = pa.table({"x": [1, 2, 3]}) + h = _make_hasher().hash_table(table) + assert isinstance(h, ContentHash) + + def test_method_is_hasher_id(self): + table = pa.table({"x": [1, 2, 3]}) + h = _make_hasher().hash_table(table) + assert h.method == HASHER_ID + + def test_digest_length(self): + table = pa.table({"x": [1, 2, 3]}) + h = _make_hasher().hash_table(table) + assert len(h.digest) == _DIGEST_LEN + + def test_deterministic(self): + table = pa.table({"a": [1, 2], "b": [3.0, 4.0]}) + h1 = _make_hasher().hash_table(table) + h2 = _make_hasher().hash_table(table) + assert h1.digest == h2.digest + + def test_column_order_independent(self): + """Reordering columns must not change the hash.""" + t1 = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}) + t2 = pa.table({"b": [4.0, 5.0, 6.0], "a": [1, 2, 3]}) + h1 = _make_hasher().hash_table(t1) + h2 = _make_hasher().hash_table(t2) + assert h1.digest == h2.digest + + def test_different_data_differs(self): + t1 = pa.table({"x": [1, 2, 3]}) + t2 = pa.table({"x": [1, 2, 4]}) + assert _make_hasher().hash_table(t1).digest != _make_hasher().hash_table(t2).digest + + def test_empty_table(self): + """Zero-row tables should be hashable without error.""" + table = pa.table({"x": pa.array([], type=pa.int32())}) + h = _make_hasher().hash_table(table) + assert isinstance(h, ContentHash) + assert len(h.digest) == _DIGEST_LEN + + def test_schema_hash_differs_from_data_hash(self): + schema = pa.schema([pa.field("x", pa.int32())]) + table = pa.table({"x": [1, 2, 3]}) + hs = _make_hasher().hash_schema(schema) + ht = _make_hasher().hash_table(table) + assert hs.digest != ht.digest + + def test_utf8_largeutf8_normalised(self): + """Utf8 and LargeUtf8 columns should hash identically (starfix normalises).""" + t_utf8 = pa.table({"s": pa.array(["hello", "world"], type=pa.utf8())}) + t_large = pa.table( + {"s": pa.array(["hello", "world"], type=pa.large_utf8())} + ) + h1 = _make_hasher().hash_table(t_utf8) + h2 = _make_hasher().hash_table(t_large) + assert h1.digest == h2.digest + + +# --------------------------------------------------------------------------- +# RecordBatch (batch-split independence) +# --------------------------------------------------------------------------- + + +class TestHashRecordBatch: + def test_record_batch_matches_table(self): + """hash_table on a RecordBatch must equal hash_table on the equivalent Table.""" + table = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}) + batch = table.to_batches()[0] + h_table = _make_hasher().hash_table(table) + h_batch = _make_hasher().hash_table(batch) + assert h_table.digest == h_batch.digest + + def test_method_is_hasher_id_for_batch(self): + batch = pa.record_batch({"x": [1, 2]}) + h = _make_hasher().hash_table(batch) + assert h.method == HASHER_ID + + +# --------------------------------------------------------------------------- +# versioned_hashers factory +# --------------------------------------------------------------------------- + + +class TestVersionedHashersFactory: + def test_returns_starfix_hasher(self): + hasher = get_versioned_semantic_arrow_hasher() + assert isinstance(hasher, StarfixArrowHasher) + + def test_hasher_id_matches_current_constant(self): + hasher = get_versioned_semantic_arrow_hasher() + assert hasher.hasher_id == _CURRENT_ARROW_HASHER_ID + + def test_current_hasher_id_is_arrow_v0_1(self): + """Pin the current version constant to detect accidental version drifts.""" + assert _CURRENT_ARROW_HASHER_ID == "arrow_v0.1" + + +# --------------------------------------------------------------------------- +# Context integration +# --------------------------------------------------------------------------- + + +class TestContextIntegration: + def test_v01_context_uses_starfix_hasher(self): + from orcapod.contexts import resolve_context + + ctx = resolve_context("v0.1") + assert isinstance(ctx.arrow_hasher, StarfixArrowHasher) + + def test_v01_context_hasher_id(self): + from orcapod.contexts import resolve_context + + ctx = resolve_context("v0.1") + assert ctx.arrow_hasher.hasher_id == "arrow_v0.1" + + def test_context_hash_table_functional(self): + from orcapod.contexts import resolve_context + + ctx = resolve_context("v0.1") + table = pa.table({"key": [1, 2, 3], "val": [0.1, 0.2, 0.3]}) + h = ctx.arrow_hasher.hash_table(table) + assert isinstance(h, ContentHash) + assert len(h.digest) == _DIGEST_LEN diff --git a/uv.lock b/uv.lock index 99972ffc..b366f4b4 100644 --- a/uv.lock +++ b/uv.lock @@ -1490,27 +1490,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" }, ] -[[package]] -name = "maturin" -version = "1.9.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9a/35/c3370188492f4c139c7a318f438d01b8185c216303c49c4bc885c98b6afb/maturin-1.9.6.tar.gz", hash = "sha256:2c2ae37144811d365509889ed7220b0598487f1278c2441829c3abf56cc6324a", size = 214846, upload-time = "2025-10-07T12:45:08.408Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/55/5c/b435418ba4ba2647a1f7a95d53314991b1e556e656ae276dea993c3bce1d/maturin-1.9.6-py3-none-linux_armv6l.whl", hash = "sha256:26e3ab1a42a7145824210e9d763f6958f2c46afb1245ddd0bab7d78b1f59bb3f", size = 8134483, upload-time = "2025-10-07T12:44:44.274Z" }, - { url = "https://files.pythonhosted.org/packages/4d/1c/8e58eda6601f328b412cdeeaa88a9b6a10e591e2a73f313e8c0154d68385/maturin-1.9.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5263dda3f71feef2e4122baf5c4620e4b3710dbb7f2121f85a337182de214369", size = 15776470, upload-time = "2025-10-07T12:44:47.476Z" }, - { url = "https://files.pythonhosted.org/packages/6c/33/8c967cce6848cdd87a2e442c86120ac644b80c5ed4c32e3291bde6a17df8/maturin-1.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fe78262c2800c92f67d1ce3c0f6463f958a692cc67bfb572e5dbf5b4b696a8ba", size = 8226557, upload-time = "2025-10-07T12:44:49.844Z" }, - { url = "https://files.pythonhosted.org/packages/58/bd/3e2675cdc8b7270700ba30c663c852a35694441732a107ac30ebd6878bd8/maturin-1.9.6-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:7ab827c6e8c022eb2e1e7fb6deede54549c8460b20ccc2e9268cc6e8cde957a8", size = 8166544, upload-time = "2025-10-07T12:44:51.396Z" }, - { url = "https://files.pythonhosted.org/packages/58/1f/a2047ddf2230e700d5f8a13dd4b9af5ce806ad380c32e58105888205926e/maturin-1.9.6-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:0246202377c49449315305209f45c8ecef6e2d6bd27a04b5b6f1ab3e4ea47238", size = 8641010, upload-time = "2025-10-07T12:44:53.658Z" }, - { url = "https://files.pythonhosted.org/packages/be/1f/265d63c7aa6faf363d4a3f23396f51bc6b4d5c7680a4190ae68dba25dea2/maturin-1.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:f5bac167700fbb6f8c8ed1a97b494522554b4432d7578e11403b894b6a91d99f", size = 7965945, upload-time = "2025-10-07T12:44:55.248Z" }, - { url = "https://files.pythonhosted.org/packages/4c/ca/a8e61979ccfe080948bcc1bddd79356157aee687134df7fb013050cec783/maturin-1.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:7f53d3b1d8396d3fea3e1ee5fd37558bca5719090f3d194ba1c02b0b56327ae3", size = 7978820, upload-time = "2025-10-07T12:44:56.919Z" }, - { url = "https://files.pythonhosted.org/packages/bf/4a/81b412f8ad02a99801ef19ec059fba0822d1d28fb44cb6a92e722f05f278/maturin-1.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:7f506eb358386d94d6ec3208c003130cf4b69cab26034fc0cbbf8bf83afa4c2e", size = 10452064, upload-time = "2025-10-07T12:44:58.232Z" }, - { url = "https://files.pythonhosted.org/packages/5b/12/cc96c7a8cb51d8dcc9badd886c361caa1526fba7fa69d1e7892e613b71d4/maturin-1.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2d6984ab690af509f525dbd2b130714207c06ebb14a5814edbe1e42b17ae0de", size = 8852401, upload-time = "2025-10-07T12:44:59.8Z" }, - { url = "https://files.pythonhosted.org/packages/51/8e/653ac3c9f2c25cdd81aefb0a2d17ff140ca5a14504f5e3c7f94dcfe4dbb7/maturin-1.9.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5c2252b0956bb331460ac750c805ddf0d9b44442449fc1f16e3b66941689d0bc", size = 8425057, upload-time = "2025-10-07T12:45:01.711Z" }, - { url = "https://files.pythonhosted.org/packages/db/29/f13490328764ae9bfc1da55afc5b707cebe4fa75ad7a1573bfa82cfae0c6/maturin-1.9.6-py3-none-win32.whl", hash = "sha256:f2c58d29ebdd4346fd004e6be213d071fdd94a77a16aa91474a21a4f9dbf6309", size = 7165956, upload-time = "2025-10-07T12:45:03.766Z" }, - { url = "https://files.pythonhosted.org/packages/db/9f/dd51e5ac1fce47581b8efa03d77a03f928c0ef85b6e48a61dfa37b6b85a2/maturin-1.9.6-py3-none-win_amd64.whl", hash = "sha256:1b39a5d82572c240d20d9e8be024d722dfb311d330c5e28ddeb615211755941a", size = 8145722, upload-time = "2025-10-07T12:45:05.487Z" }, - { url = "https://files.pythonhosted.org/packages/65/f2/e97aaba6d0d78c5871771bf9dd71d4eb8dac15df9109cf452748d2207412/maturin-1.9.6-py3-none-win_arm64.whl", hash = "sha256:ac02a30083553d2a781c10cd6f5480119bf6692fd177e743267406cad2ad198c", size = 6857006, upload-time = "2025-10-07T12:45:06.813Z" }, -] - [[package]] name = "mdurl" version = "0.1.2" @@ -2114,7 +2093,7 @@ requires-dist = [ { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, { name = "s3fs", specifier = ">=2025.12.0" }, - { name = "starfix", specifier = ">=0.1.3" }, + { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git" }, { name = "typing-extensions" }, { name = "tzdata", specifier = ">=2024.1" }, { name = "uuid-utils", specifier = ">=0.11.1" }, @@ -3461,18 +3440,11 @@ wheels = [ [[package]] name = "starfix" -version = "0.1.3" -source = { registry = "https://pypi.org/simple" } +version = "0.0.2" +source = { git = "https://github.com/nauticalab/starfix-python.git#344617bc6f7fcabab5c011d5774ed47de33f21de" } dependencies = [ - { name = "ipykernel" }, - { name = "maturin" }, { name = "pyarrow" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/73/942a97c83a54ec1f641af1c2c8ff15c8ad5e1955d66f56c5437ef6e5c18e/starfix-0.1.3.tar.gz", hash = "sha256:4ac9090e24374dd3d4af466d04bdf6a9fe180ac8fd902b94b29f263d58803b5e", size = 18254, upload-time = "2025-10-29T19:53:23.657Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/54/be/98ca0482cdb4fa25a11a4dbc59c4d2a643bd8210c6c3305b2d58b5e0460c/starfix-0.1.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ef86702f0d0c8cd37b00cf63aeb6a555832eb24d7853cbe84316473ac38992d8", size = 469719, upload-time = "2025-10-29T19:53:22.473Z" }, - { url = "https://files.pythonhosted.org/packages/94/bf/208c8307d9f005ee9e6709e15bc6fff40c77293c31a8539324dddde8e783/starfix-0.1.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0c713211ea8b293dbb4f172ca648a7b78481603c47d729c87126c867ed5b5a5", size = 598464, upload-time = "2025-10-29T19:53:21.126Z" }, -] [[package]] name = "strictyaml" From 5c42ec5f34920950e3103f78958831c49617efa4 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:47:00 +0000 Subject: [PATCH 2/8] fix(sources): explicitly reject empty tables in ArrowTableSource Previously, constructing an ArrowTableSource with a zero-row table raised an AssertionError deep inside SemanticArrowHasher._process_table_columns as an accidental side effect. Now that StarfixArrowHasher handles empty columns gracefully, that implicit guard was removed. Add an explicit ValueError at the top of ArrowTableSource.__init__ so the rejection is clear, early, and independent of the hasher implementation. Co-Authored-By: Claude Sonnet 4.6 --- src/orcapod/core/sources/arrow_table_source.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py index 42d04a63..637fb4cc 100644 --- a/src/orcapod/core/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -28,6 +28,12 @@ def __init__( ) -> None: super().__init__(**kwargs) + if len(table) == 0: + raise ValueError( + "ArrowTableSource requires a non-empty table; " + "the provided table has zero rows." + ) + builder = SourceStreamBuilder(self.data_context, self.orcapod_config) result = builder.build( table, From 385d7ec5e7d964484c2c195800b3120c403a6a46 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Fri, 20 Mar 2026 22:55:31 +0000 Subject: [PATCH 3/8] test(hashing): expand StarfixArrowHasher stability and correctness coverage - Replace weak version-prefix-only golden value with full 35-byte pinned digests for both schema and table hashing - Add row-order sensitivity: reordered rows must produce a different hash - Add null-position sensitivity: null at position 0 vs 1 must differ - Add null-vs-no-null: presence of any null changes the hash - Add Binary/LargeBinary normalisation parity (mirrors existing Utf8 test) - Add integer-width sensitivity: int32 and int64 with same values must differ - Add field-nullability schema test: nullable=True vs False produce different schema hashes, with fully pinned golden digests for both - Add multi-batch table consistency: Table from two RecordBatches must hash identically to the equivalent single-batch Table Total: 33 tests, up from 24. Co-Authored-By: Claude Sonnet 4.6 --- .../test_hashing/test_starfix_arrow_hasher.py | 92 +++++++++++++++++-- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/tests/test_hashing/test_starfix_arrow_hasher.py b/tests/test_hashing/test_starfix_arrow_hasher.py index fef27614..0dfb5fd8 100644 --- a/tests/test_hashing/test_starfix_arrow_hasher.py +++ b/tests/test_hashing/test_starfix_arrow_hasher.py @@ -7,14 +7,19 @@ - hash_table / hash_record_batch: returns ContentHash, digest is 35 bytes - Column-order independence: reordering columns does not change the hash - Batch-split independence: splitting a table across batches does not change - the hash (verified via RecordBatch round-trip) -- Type normalisation: Utf8 and LargeUtf8 columns hash identically + the hash (verified via single-batch and multi-batch RecordBatch round-trips) +- Row-order sensitivity: reordering rows changes the hash +- Type normalisation: Utf8/LargeUtf8 and Binary/LargeBinary hash identically +- Integer width sensitivity: int32 and int64 with the same values differ +- Null handling: null position matters; a null changes the hash vs no-null +- Field nullability: nullable=True and nullable=False produce different schema hashes - Empty table: hash_table handles a table with zero rows - Schema-only vs data hash: hash_schema and hash_table differ for the same schema - versioned_hashers factory: get_versioned_semantic_arrow_hasher returns a StarfixArrowHasher with hasher_id == _CURRENT_ARROW_HASHER_ID - Context integration: the v0.1 context wires up StarfixArrowHasher -- Stability (golden values): hard-coded digests catch accidental algorithm changes +- Stability (golden values): fully-pinned 35-byte digests catch accidental + algorithm changes in both schema and table hashing """ from __future__ import annotations @@ -90,7 +95,7 @@ def test_column_order_independent(self): assert h1.digest == h2.digest def test_golden_value(self): - """Pin the digest so unintentional algorithm changes are caught.""" + """Pin the full 35-byte digest so any algorithm change is caught.""" schema = pa.schema( [ pa.field("id", pa.int32(), nullable=False), @@ -98,9 +103,28 @@ def test_golden_value(self): ] ) h = _make_hasher().hash_schema(schema) - # First 6 hex chars encode the 3-byte starfix version prefix (000001) - assert h.digest.hex().startswith("000001"), ( - f"Unexpected version prefix in digest: {h.digest.hex()}" + assert h.digest.hex() == "000001d676ef0263a8e0e7500b1c97033993dbe445172ca0f9e7577b3994bfa6224b4c", ( + f"Schema golden digest changed — was the starfix algorithm updated? " + f"Got: {h.digest.hex()}" + ) + + def test_nullability_affects_schema_hash(self): + """nullable=True and nullable=False must produce different schema hashes.""" + s_nullable = pa.schema([pa.field("x", pa.int32(), nullable=True)]) + s_non_nullable = pa.schema([pa.field("x", pa.int32(), nullable=False)]) + h_nullable = _make_hasher().hash_schema(s_nullable) + h_non_nullable = _make_hasher().hash_schema(s_non_nullable) + assert h_nullable.digest != h_non_nullable.digest + + def test_nullability_golden_values(self): + """Pin nullable and non-nullable schema digests independently.""" + s_nullable = pa.schema([pa.field("x", pa.int32(), nullable=True)]) + s_non_nullable = pa.schema([pa.field("x", pa.int32(), nullable=False)]) + assert _make_hasher().hash_schema(s_nullable).digest.hex() == ( + "000001b8005339e69df64cda60f9ff9c98caa264092a7b666c3f7f85a2bfed20bae3db" + ) + assert _make_hasher().hash_schema(s_non_nullable).digest.hex() == ( + "00000179353568a71430411e8108ee02d425800f6d5054d9b2baa871cad90f3e06422a" ) @@ -168,6 +192,49 @@ def test_utf8_largeutf8_normalised(self): h2 = _make_hasher().hash_table(t_large) assert h1.digest == h2.digest + def test_binary_largebinary_normalised(self): + """Binary and LargeBinary columns should hash identically (starfix normalises).""" + t_bin = pa.table({"b": pa.array([b"hello", b"world"], type=pa.binary())}) + t_lbin = pa.table({"b": pa.array([b"hello", b"world"], type=pa.large_binary())}) + assert _make_hasher().hash_table(t_bin).digest == _make_hasher().hash_table(t_lbin).digest + + def test_row_order_matters(self): + """Reordering rows must produce a different hash (rows carry positional semantics).""" + t_orig = pa.table({"x": pa.array([1, 2, 3], type=pa.int64())}) + t_reversed = pa.table({"x": pa.array([3, 2, 1], type=pa.int64())}) + assert _make_hasher().hash_table(t_orig).digest != _make_hasher().hash_table(t_reversed).digest + + def test_null_position_matters(self): + """A null in a different row position must produce a different hash.""" + t_null_first = pa.table({"x": pa.array([None, 2, 3], type=pa.int32())}) + t_null_second = pa.table({"x": pa.array([1, None, 3], type=pa.int32())}) + assert _make_hasher().hash_table(t_null_first).digest != _make_hasher().hash_table(t_null_second).digest + + def test_null_differs_from_no_null(self): + """A table with a null value must hash differently from one without.""" + t_with_null = pa.table({"x": pa.array([1, None, 3], type=pa.int32())}) + t_no_null = pa.table({"x": pa.array([1, 2, 3], type=pa.int32())}) + assert _make_hasher().hash_table(t_with_null).digest != _make_hasher().hash_table(t_no_null).digest + + def test_int32_differs_from_int64(self): + """Same values stored as int32 and int64 must hash differently.""" + t_i32 = pa.table({"x": pa.array([1, 2, 3], type=pa.int32())}) + t_i64 = pa.table({"x": pa.array([1, 2, 3], type=pa.int64())}) + assert _make_hasher().hash_table(t_i32).digest != _make_hasher().hash_table(t_i64).digest + + def test_golden_value_table(self): + """Pin the full 35-byte table digest so any algorithm change is caught.""" + table = pa.table({ + "id": pa.array([1, 2, 3], type=pa.int32()), + "score": pa.array([0.1, 0.2, 0.3], type=pa.float64()), + "label": pa.array(["a", "b", "c"], type=pa.utf8()), + }) + h = _make_hasher().hash_table(table) + assert h.digest.hex() == "0000010cd7fe5462420b84f03a06925374e528817a3b72319e679a17e7380964878791", ( + f"Table golden digest changed — was the starfix algorithm updated? " + f"Got: {h.digest.hex()}" + ) + # --------------------------------------------------------------------------- # RecordBatch (batch-split independence) @@ -188,6 +255,17 @@ def test_method_is_hasher_id_for_batch(self): h = _make_hasher().hash_table(batch) assert h.method == HASHER_ID + def test_multi_batch_table_matches_single_batch(self): + """A Table built from multiple RecordBatches must hash the same as + an equivalent single-batch Table (chunked arrays are transparent).""" + b1 = pa.record_batch({"x": pa.array([1, 2], type=pa.int64())}) + b2 = pa.record_batch({"x": pa.array([3, 4], type=pa.int64())}) + t_multi = pa.Table.from_batches([b1, b2]) + t_single = pa.table({"x": pa.array([1, 2, 3, 4], type=pa.int64())}) + h_multi = _make_hasher().hash_table(t_multi) + h_single = _make_hasher().hash_table(t_single) + assert h_multi.digest == h_single.digest + # --------------------------------------------------------------------------- # versioned_hashers factory From 6c8666ad2b009805153404fa0670cbb03aece078 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Sat, 21 Mar 2026 05:40:49 +0000 Subject: [PATCH 4/8] fix(hashing): address Copilot review feedback on StarfixArrowHasher - Short-circuit primitive columns in _process_table_columns to avoid costly Python round-trips for columns that cannot contain semantic types - Infer output type from first non-null processed value, preventing incorrect type inference when a semantic-struct column starts with nulls - Preserve field nullable/metadata via field.with_type() and propagate schema-level metadata through to the reconstructed table - Pin starfix git source to locked commit SHA in pyproject.toml - Clarify v0.1.json changelog entry wording Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 2 +- src/orcapod/contexts/data/v0.1.json | 2 +- src/orcapod/hashing/arrow_hashers.py | 30 +++++++++++++++++++++++----- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d5226222..f3773c52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ where = ["src"] version_file = "src/orcapod/_version.py" [tool.uv.sources] -starfix = { git = "https://github.com/nauticalab/starfix-python.git" } +starfix = { git = "https://github.com/nauticalab/starfix-python.git", rev = "344617bc6f7fcabab5c011d5774ed47de33f21de" } [dependency-groups] diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 2cb2bfc4..f75798b5 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -81,7 +81,7 @@ "Initial release with Path semantic type support", "Basic SHA-256 hashing for files and objects", "Arrow logical serialization method", - "arrow_v0.1: replaced custom IPC serialisation+hashlib with starfix ArrowDigester for cross-language-compatible Arrow hashing" + "Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing" ] } } diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index a8c34a9e..7bc9e6cc 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -287,6 +287,19 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: new_fields: list[pa.Field] = [] for i, field in enumerate(table.schema): + # Short-circuit: primitive columns cannot contain semantic types, so skip + # the costly Python round-trip and reuse the original Arrow array directly. + if not ( + pa.types.is_struct(field.type) + or pa.types.is_list(field.type) + or pa.types.is_large_list(field.type) + or pa.types.is_fixed_size_list(field.type) + or pa.types.is_map(field.type) + ): + new_columns.append(table.column(i)) + new_fields.append(field) + continue + column_data = table.column(i).to_pylist() visitor = SemanticHashingVisitor(self.semantic_registry) @@ -295,23 +308,30 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table: processed_data: list[Any] = [] for value in column_data: processed_type, processed_value = visitor.visit(field.type, value) - if new_type is None: + # Infer the output type from the first non-null processed value. + # When the first row is null, visit_struct returns the original + # struct type rather than the converted type (e.g. large_string), + # which would cause pa.array() to fail for subsequent non-null rows. + if new_type is None and processed_value is not None: new_type = processed_type processed_data.append(processed_value) - # For empty columns there are no values to infer the type from; - # fall back to the field's declared type. + # For empty or all-null columns there are no non-null values to infer + # the type from; fall back to the field's declared type. if new_type is None: new_type = field.type new_columns.append(pa.array(processed_data, type=new_type)) - new_fields.append(pa.field(field.name, new_type)) + # Preserve original field attributes (nullable, metadata) while + # updating only the type, so the schema fed to starfix remains faithful. + new_fields.append(field.with_type(new_type)) except Exception as exc: raise RuntimeError( f"Failed to process column '{field.name}': {exc}" ) from exc - return pa.table(new_columns, schema=pa.schema(new_fields)) + # Preserve the original schema-level metadata while using updated fields. + return pa.table(new_columns, schema=pa.schema(new_fields, metadata=table.schema.metadata)) def hash_schema(self, schema: pa.Schema) -> ContentHash: """Hash an Arrow schema using the starfix canonical algorithm. From 6c1abd68a2725263b1fa5c814fe16bf773300d38 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Sat, 21 Mar 2026 05:50:50 +0000 Subject: [PATCH 5/8] fix(sources,deps): allow empty ArrowTableSource and pin starfix via PEP 508 - Remove the zero-row ValueError guard from ArrowTableSource.__init__. The check was added to make an accidental AssertionError explicit, but now that StarfixArrowHasher handles empty tables correctly there is no technical reason to reject them; empty sources are valid and produce zero output records. - Replace bare "starfix" + [tool.uv.sources] override with a PEP 508 direct URL in dependencies, so both uv and pip resolve the same pinned commit without relying on uv-specific source overrides. Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 5 +---- src/orcapod/core/sources/arrow_table_source.py | 6 ------ 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3773c52..028decef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "deltalake>=1.0.2", "graphviz>=0.21", "gitpython>=3.1.45", - "starfix", + "starfix @ git+https://github.com/nauticalab/starfix-python.git@344617bc6f7fcabab5c011d5774ed47de33f21de", "pygraphviz>=1.14", "tzdata>=2024.1", "uuid-utils>=0.11.1", @@ -51,9 +51,6 @@ where = ["src"] [tool.setuptools_scm] version_file = "src/orcapod/_version.py" -[tool.uv.sources] -starfix = { git = "https://github.com/nauticalab/starfix-python.git", rev = "344617bc6f7fcabab5c011d5774ed47de33f21de" } - [dependency-groups] dev = [ diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py index 637fb4cc..42d04a63 100644 --- a/src/orcapod/core/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -28,12 +28,6 @@ def __init__( ) -> None: super().__init__(**kwargs) - if len(table) == 0: - raise ValueError( - "ArrowTableSource requires a non-empty table; " - "the provided table has zero rows." - ) - builder = SourceStreamBuilder(self.data_context, self.orcapod_config) result = builder.build( table, From a292930a38b8187b36050fb143c3acf35dcd3107 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Sat, 21 Mar 2026 05:58:20 +0000 Subject: [PATCH 6/8] fix(deps): update uv.lock to pin starfix git rev correctly The lockfile was missing the `?rev=` query parameter for the pinned starfix git commit, causing `uv sync --locked` to fail in CI. Co-Authored-By: Claude Sonnet 4.6 --- uv.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uv.lock b/uv.lock index b366f4b4..eeae09e0 100644 --- a/uv.lock +++ b/uv.lock @@ -2093,7 +2093,7 @@ requires-dist = [ { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, { name = "s3fs", specifier = ">=2025.12.0" }, - { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git" }, + { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git?rev=344617bc6f7fcabab5c011d5774ed47de33f21de" }, { name = "typing-extensions" }, { name = "tzdata", specifier = ">=2024.1" }, { name = "uuid-utils", specifier = ">=0.11.1" }, @@ -3441,7 +3441,7 @@ wheels = [ [[package]] name = "starfix" version = "0.0.2" -source = { git = "https://github.com/nauticalab/starfix-python.git#344617bc6f7fcabab5c011d5774ed47de33f21de" } +source = { git = "https://github.com/nauticalab/starfix-python.git?rev=344617bc6f7fcabab5c011d5774ed47de33f21de#344617bc6f7fcabab5c011d5774ed47de33f21de" } dependencies = [ { name = "pyarrow" }, ] From 03cdc0c19c7c7fada01df4ff5d43265982085a19 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Sat, 21 Mar 2026 06:01:17 +0000 Subject: [PATCH 7/8] test(sources): update empty table test to reflect allowed-empty behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6c1abd6 intentionally removed the zero-row guard from ArrowTableSource, making empty tables valid (they produce zero output records). Update test_empty_table_raises → test_empty_table_constructs_successfully to assert the new contract instead of the old rejected-empty behavior. Co-Authored-By: Claude Sonnet 4.6 --- test-objective/unit/test_sources.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test-objective/unit/test_sources.py b/test-objective/unit/test_sources.py index 298eee9b..88eb0e07 100644 --- a/test-objective/unit/test_sources.py +++ b/test-objective/unit/test_sources.py @@ -45,16 +45,17 @@ def test_normal_construction(self): source = ArrowTableSource(_simple_table(), tag_columns=["name"]) assert source is not None - def test_empty_table_raises(self): - """An empty table raises an error during construction.""" + def test_empty_table_constructs_successfully(self): + """An empty table constructs successfully and produces zero output rows.""" empty = pa.table( { "name": pa.array([], type=pa.large_string()), "age": pa.array([], type=pa.int64()), } ) - with pytest.raises(Exception): - ArrowTableSource(empty, tag_columns=["name"]) + source = ArrowTableSource(empty, tag_columns=["name"]) + assert source is not None + assert source.as_table().num_rows == 0 def test_missing_tag_columns_raises_value_error(self): """Specifying tag columns not in the table raises ValueError.""" From ab017038c049fa8d080c11d29ee95d2ffcab8573 Mon Sep 17 00:00:00 2001 From: "agent-kurouto[bot]" <268466204+agent-kurouto[bot]@users.noreply.github.com> Date: Sat, 21 Mar 2026 06:08:56 +0000 Subject: [PATCH 8/8] test(hashing): add cross-process hash stability tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds tests/test_hashing/test_cross_process_stability.py covering the previously untested property that hash values are safe to persist: - Arrow schema and table hashes are identical when re-computed in a freshly-spawned subprocess (different interpreter lifetime). - All hashes are immune to PYTHONHASHSEED randomisation — CPython randomises built-in hash() per-process, which affects dict/set iteration order; the tests confirm neither StarfixArrowHasher nor the semantic hasher inherit that instability. - Column-order independence is confirmed cross-process for both schemas and tables. - Row-order sensitivity is confirmed cross-process (rows matter, column order does not). - Semantic hasher stability is verified for primitives, containers, nested structures, sets, and tuples across processes. Co-Authored-By: Claude Sonnet 4.6 --- .../test_cross_process_stability.py | 303 ++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 tests/test_hashing/test_cross_process_stability.py diff --git a/tests/test_hashing/test_cross_process_stability.py b/tests/test_hashing/test_cross_process_stability.py new file mode 100644 index 00000000..e8e71700 --- /dev/null +++ b/tests/test_hashing/test_cross_process_stability.py @@ -0,0 +1,303 @@ +"""Cross-process hash stability tests. + +Verifies that hash values produced by StarfixArrowHasher and the semantic +hasher are deterministic across independent Python interpreter processes, +immune to PYTHONHASHSEED randomisation, and therefore safe to persist in +databases or caches. + +Coverage +-------- +- Arrow schema hashes: identical across separately-spawned processes +- Arrow table data hashes: identical across separately-spawned processes +- Semantic (Python object) hashes: identical across processes +- PYTHONHASHSEED independence: hashes do not vary when Python's built-in + hash() randomisation seed changes (critical for dicts and sets) +- Column-order independence confirmed cross-process +- Row-order sensitivity confirmed cross-process (rows matter, columns don't) +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +import textwrap + +import pyarrow as pa +import pytest + +from orcapod.hashing.versioned_hashers import ( + get_versioned_semantic_arrow_hasher, + get_versioned_semantic_hasher, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _run_subprocess(script: str, pythonhashseed: str) -> dict[str, str]: + """Run *script* in a fresh interpreter with the given PYTHONHASHSEED. + + Returns the JSON-decoded dict printed to stdout. + Raises AssertionError if the subprocess exits non-zero. + """ + env = os.environ.copy() + env["PYTHONHASHSEED"] = pythonhashseed + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + env=env, + ) + assert result.returncode == 0, ( + f"Subprocess failed (PYTHONHASHSEED={pythonhashseed}):\n{result.stderr}" + ) + return json.loads(result.stdout.strip()) + + +# --------------------------------------------------------------------------- +# Subprocess scripts (self-contained; no shared state with the test process) +# --------------------------------------------------------------------------- + +_ARROW_SCHEMA_SCRIPT = textwrap.dedent("""\ + import json + import pyarrow as pa + from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher + + hasher = get_versioned_semantic_arrow_hasher() + schemas = { + "simple": pa.schema([ + pa.field("id", pa.int64(), nullable=False), + pa.field("value", pa.float64(), nullable=True), + ]), + "string_types": pa.schema([ + pa.field("name", pa.utf8()), + pa.field("data", pa.large_binary()), + ]), + "nested_struct": pa.schema([ + pa.field("point", pa.struct([ + pa.field("x", pa.float32()), + pa.field("y", pa.float32()), + ])), + ]), + # Same fields as "simple" but columns listed in reverse order — + # must hash identically (column-order independence). + "simple_reordered": pa.schema([ + pa.field("value", pa.float64(), nullable=True), + pa.field("id", pa.int64(), nullable=False), + ]), + } + print(json.dumps({name: hasher.hash_schema(s).digest.hex() for name, s in schemas.items()})) +""") + +_ARROW_TABLE_SCRIPT = textwrap.dedent("""\ + import json + import pyarrow as pa + from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher + + hasher = get_versioned_semantic_arrow_hasher() + tables = { + "integers": pa.table({ + "id": pa.array([1, 2, 3], type=pa.int64()), + "x": pa.array([10, 20, 30], type=pa.int64()), + }), + "with_nulls": pa.table({ + "a": pa.array([1, None, 3], type=pa.int32()), + "b": pa.array([None, 2.0, 3.0], type=pa.float64()), + }), + "strings": pa.table({ + "name": pa.array(["alice", "bob", "carol"], type=pa.large_utf8()), + }), + "empty": pa.table({ + "x": pa.array([], type=pa.int64()), + }), + # Same rows as "integers" but columns swapped — must hash identically. + "integers_col_swap": pa.table({ + "x": pa.array([10, 20, 30], type=pa.int64()), + "id": pa.array([1, 2, 3], type=pa.int64()), + }), + # Same rows as "integers" but rows reversed — must hash DIFFERENTLY. + "integers_row_reversed": pa.table({ + "id": pa.array([3, 2, 1], type=pa.int64()), + "x": pa.array([30, 20, 10], type=pa.int64()), + }), + } + print(json.dumps({name: hasher.hash_table(t).digest.hex() for name, t in tables.items()})) +""") + +_SEMANTIC_SCRIPT = textwrap.dedent("""\ + import json + from orcapod.hashing.versioned_hashers import get_versioned_semantic_hasher + + hasher = get_versioned_semantic_hasher() + objects = { + "int": 42, + "neg_int": -7, + "float": 3.14159, + "string": "hello world", + "none": None, + "bool_true": True, + "bool_false": False, + "list": [1, 2, 3], + "nested_dict": {"a": 1, "b": [2, 3], "c": {"d": 4}}, + "set": [1, 2, 3], # hashed as set equivalent + "tuple": [1, 2, 3], # distinguished by type tag inside hasher + "empty_list": [], + "empty_dict": {}, + } + # sets/tuples need special treatment for JSON serialisation — hash via + # the real Python types rather than JSON-roundtripped lists. + typed_objects = { + "set": {1, 2, 3}, + "tuple": (1, 2, 3), + } + results = {name: hasher.hash_object(obj).digest.hex() for name, obj in objects.items()} + results.update({name: hasher.hash_object(obj).digest.hex() for name, obj in typed_objects.items()}) + print(json.dumps(results)) +""") + + +# --------------------------------------------------------------------------- +# Tests: Arrow schema hash cross-process stability +# --------------------------------------------------------------------------- + + +class TestArrowSchemaHashCrossProcess: + """Arrow schema hashes are identical across independent processes.""" + + def test_schema_hash_matches_subprocess(self): + """Hash computed locally equals hash computed in a fresh subprocess.""" + hasher = get_versioned_semantic_arrow_hasher() + local = { + "simple": hasher.hash_schema(pa.schema([ + pa.field("id", pa.int64(), nullable=False), + pa.field("value", pa.float64(), nullable=True), + ])).digest.hex(), + "string_types": hasher.hash_schema(pa.schema([ + pa.field("name", pa.utf8()), + pa.field("data", pa.large_binary()), + ])).digest.hex(), + "nested_struct": hasher.hash_schema(pa.schema([ + pa.field("point", pa.struct([ + pa.field("x", pa.float32()), + pa.field("y", pa.float32()), + ])), + ])).digest.hex(), + "simple_reordered": hasher.hash_schema(pa.schema([ + pa.field("value", pa.float64(), nullable=True), + pa.field("id", pa.int64(), nullable=False), + ])).digest.hex(), + } + remote = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="42") + assert local == remote + + def test_schema_hash_independent_of_pythonhashseed(self): + """Two subprocesses with different PYTHONHASHSEED values agree.""" + hashes_a = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="0") + hashes_b = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="999999") + assert hashes_a == hashes_b + + def test_column_order_independence_cross_process(self): + """Reordering schema fields yields the same hash in a subprocess.""" + hashes = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="random") + assert hashes["simple"] == hashes["simple_reordered"] + + +# --------------------------------------------------------------------------- +# Tests: Arrow table data hash cross-process stability +# --------------------------------------------------------------------------- + + +class TestArrowTableHashCrossProcess: + """Arrow table data hashes are identical across independent processes.""" + + def test_table_hash_matches_subprocess(self): + """Hash computed locally equals hash computed in a fresh subprocess.""" + hasher = get_versioned_semantic_arrow_hasher() + local = { + "integers": hasher.hash_table(pa.table({ + "id": pa.array([1, 2, 3], type=pa.int64()), + "x": pa.array([10, 20, 30], type=pa.int64()), + })).digest.hex(), + "with_nulls": hasher.hash_table(pa.table({ + "a": pa.array([1, None, 3], type=pa.int32()), + "b": pa.array([None, 2.0, 3.0], type=pa.float64()), + })).digest.hex(), + "strings": hasher.hash_table(pa.table({ + "name": pa.array(["alice", "bob", "carol"], type=pa.large_utf8()), + })).digest.hex(), + "empty": hasher.hash_table(pa.table({ + "x": pa.array([], type=pa.int64()), + })).digest.hex(), + "integers_col_swap": hasher.hash_table(pa.table({ + "x": pa.array([10, 20, 30], type=pa.int64()), + "id": pa.array([1, 2, 3], type=pa.int64()), + })).digest.hex(), + "integers_row_reversed": hasher.hash_table(pa.table({ + "id": pa.array([3, 2, 1], type=pa.int64()), + "x": pa.array([30, 20, 10], type=pa.int64()), + })).digest.hex(), + } + remote = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="1337") + assert local == remote + + def test_table_hash_independent_of_pythonhashseed(self): + """Two subprocesses with different PYTHONHASHSEED values agree.""" + hashes_a = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="0") + hashes_b = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="12345") + assert hashes_a == hashes_b + + def test_column_order_independence_cross_process(self): + """Column-reordered tables hash identically across processes.""" + hashes = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="random") + assert hashes["integers"] == hashes["integers_col_swap"] + + def test_row_order_sensitivity_cross_process(self): + """Row-reversed tables hash differently across processes (row order matters).""" + hashes = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="random") + assert hashes["integers"] != hashes["integers_row_reversed"] + + +# --------------------------------------------------------------------------- +# Tests: Semantic (Python object) hash cross-process stability +# --------------------------------------------------------------------------- + + +class TestSemanticHashCrossProcess: + """Semantic Python-object hashes are identical across independent processes.""" + + def test_semantic_hash_matches_subprocess(self): + """Hash computed locally equals hash computed in a fresh subprocess.""" + hasher = get_versioned_semantic_hasher() + local = { + "int": hasher.hash_object(42).digest.hex(), + "neg_int": hasher.hash_object(-7).digest.hex(), + "float": hasher.hash_object(3.14159).digest.hex(), + "string": hasher.hash_object("hello world").digest.hex(), + "none": hasher.hash_object(None).digest.hex(), + "bool_true": hasher.hash_object(True).digest.hex(), + "bool_false": hasher.hash_object(False).digest.hex(), + "list": hasher.hash_object([1, 2, 3]).digest.hex(), + "nested_dict": hasher.hash_object({"a": 1, "b": [2, 3], "c": {"d": 4}}).digest.hex(), + "set": hasher.hash_object({1, 2, 3}).digest.hex(), + "tuple": hasher.hash_object((1, 2, 3)).digest.hex(), + "empty_list": hasher.hash_object([]).digest.hex(), + "empty_dict": hasher.hash_object({}).digest.hex(), + } + remote = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="42") + assert local == remote + + def test_semantic_hash_independent_of_pythonhashseed(self): + """Semantic hashes do not change when PYTHONHASHSEED differs. + + PYTHONHASHSEED affects dict and set iteration order in CPython. + This test confirms the semantic hasher normalises that away. + """ + hashes_seed0 = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="0") + hashes_seed1 = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="1") + hashes_random = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="random") + assert hashes_seed0 == hashes_seed1 + assert hashes_seed1 == hashes_random