From 6718dd34ae16980999d26fe357a0eff686afc052 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Fri, 20 Mar 2026 22:25:18 +0000
Subject: [PATCH 1/8] feat(hashing): integrate starfix-python for Arrow schema
 and data hashing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the ad-hoc Arrow IPC serialisation + hashlib approach in
SemanticArrowHasher with a new StarfixArrowHasher that delegates to
the canonical `starfix` ArrowDigester (nauticalab/starfix-python).

Changes
-------
* src/orcapod/hashing/arrow_hashers.py
  - Add StarfixArrowHasher class: reuses SemanticHashingVisitor for
    semantic-type pre-processing (e.g. Path → file-content hash), then
    calls ArrowDigester.hash_table / hash_schema for the final digest.
  - Adds hash_schema() method for schema-only fingerprinting.
  - Empty-column edge-case: fall back to field.type when no rows exist.

* src/orcapod/hashing/versioned_hashers.py
  - _CURRENT_ARROW_HASHER_ID remains "arrow_v0.1".
  - get_versioned_semantic_arrow_hasher() now returns StarfixArrowHasher.

* src/orcapod/contexts/data/v0.1.json
  - Swap arrow_hasher._class to StarfixArrowHasher (hasher_id unchanged:
    "arrow_v0.1"); remove IPC-specific params (hash_algorithm, chunk_size,
    serialization_method).

* pyproject.toml / uv.lock
  - Declare starfix dependency sourced from nauticalab/starfix-python
    (pure-Python implementation, git source via [tool.uv.sources]).

* tests/test_hashing/test_starfix_arrow_hasher.py  (new)
  - 24 unit tests covering schema hashing, table hashing, column-order
    independence, batch-split parity, type normalisation, empty tables,
    versioned_hashers factory, and context integration.

Closes PLT-1063

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 pyproject.toml                                |   5 +-
 src/orcapod/contexts/data/v0.1.json           |   8 +-
 src/orcapod/hashing/arrow_hashers.py          | 124 ++++++++-
 src/orcapod/hashing/versioned_hashers.py      |   6 +-
 .../test_hashing/test_starfix_arrow_hasher.py | 236 ++++++++++++++++++
 uv.lock                                       |  34 +--
 6 files changed, 372 insertions(+), 41 deletions(-)
 create mode 100644 tests/test_hashing/test_starfix_arrow_hasher.py

diff --git a/pyproject.toml b/pyproject.toml
index ad9c04ef..d5226222 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ dependencies = [
     "deltalake>=1.0.2",
     "graphviz>=0.21",
     "gitpython>=3.1.45",
-    "starfix>=0.1.3",
+    "starfix",
     "pygraphviz>=1.14",
     "tzdata>=2024.1",
     "uuid-utils>=0.11.1",
@@ -51,6 +51,9 @@ where = ["src"]
 [tool.setuptools_scm]
 version_file = "src/orcapod/_version.py"
 
+[tool.uv.sources]
+starfix = { git = "https://github.com/nauticalab/starfix-python.git" }
+
 
 [dependency-groups]
 dev = [
diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json
index cd16b5d5..2cb2bfc4 100644
--- a/src/orcapod/contexts/data/v0.1.json
+++ b/src/orcapod/contexts/data/v0.1.json
@@ -22,12 +22,9 @@
         }
     },
     "arrow_hasher": {
-        "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher",
+        "_class": "orcapod.hashing.arrow_hashers.StarfixArrowHasher",
         "_config": {
             "hasher_id": "arrow_v0.1",
-            "hash_algorithm": "sha256",
-            "chunk_size": 8192,
-            "serialization_method": "logical",
             "semantic_registry": {
                 "_ref": "semantic_registry"
             }
@@ -83,7 +80,8 @@
         "changelog": [
             "Initial release with Path semantic type support",
             "Basic SHA-256 hashing for files and objects",
-            "Arrow logical serialization method"
+            "Arrow logical serialization method",
+            "arrow_v0.1: replaced custom IPC serialisation+hashlib with starfix ArrowDigester for cross-language-compatible Arrow hashing"
         ]
     }
 }
diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py
index 77cb3b49..a8c34a9e 100644
--- a/src/orcapod/hashing/arrow_hashers.py
+++ b/src/orcapod/hashing/arrow_hashers.py
@@ -4,6 +4,7 @@
 from typing import Any
 
 import pyarrow as pa
+from starfix import ArrowDigester
 
 from orcapod.hashing import arrow_serialization
 from orcapod.hashing.visitors import SemanticHashingVisitor
@@ -197,7 +198,7 @@ def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash:
 
         return ContentHash(method=self.hasher_id, digest=hasher.digest())
 
-    def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]:
+    def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]:  # noqa: C901
         """
         Compute hash with additional metadata about the process.
 
@@ -233,3 +234,124 @@ def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]:
             "processed_columns": processed_columns,
             "column_order": [field.name for field in table.schema],
         }
+
+
+class StarfixArrowHasher:
+    """
+    Arrow table hasher backed by the starfix-python ``ArrowDigester``.
+
+    This hasher produces cross-language-compatible, deterministic content
+    addresses for Arrow tables and schemas by delegating to the canonical
+    StarFix specification (``starfix-python``).
+
+    Pipeline
+    --------
+    1. **Semantic pre-processing** — the ``SemanticHashingVisitor`` traverses
+       every column and replaces recognised semantic types (e.g. ``Path``
+       structs) with their content-addressed hash strings.  This step runs
+       before the Arrow bytes are ever touched by starfix, so the final hash
+       captures *file content* for path-typed columns rather than the raw
+       path string.
+    2. **Starfix hashing** — ``ArrowDigester.hash_table`` (or
+       ``ArrowDigester.hash_schema``) is called on the pre-processed table /
+       schema.  The digester is column-order-independent and normalises
+       ``Utf8`` → ``LargeUtf8``, ``Binary`` → ``LargeBinary``, etc.,
+       producing a 35-byte versioned SHA-256 digest that is byte-for-byte
+       identical to the Rust ``starfix`` crate output.
+
+    Parameters
+    ----------
+    semantic_registry:
+        Registry of semantic type converters used during pre-processing.
+    hasher_id:
+        String identifier embedded in every ``ContentHash`` produced by
+        this hasher.  Bump this value whenever the hash algorithm changes
+        so that stored hashes remain distinguishable.
+    """
+
+    def __init__(
+        self,
+        semantic_registry: SemanticTypeRegistry,
+        hasher_id: str,
+    ) -> None:
+        self._hasher_id = hasher_id
+        self.semantic_registry = semantic_registry
+
+    @property
+    def hasher_id(self) -> str:
+        return self._hasher_id
+
+    def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table:
+        """Replace semantic-typed columns with their content-hash strings."""
+        new_columns: list[pa.Array] = []
+        new_fields: list[pa.Field] = []
+
+        for i, field in enumerate(table.schema):
+            column_data = table.column(i).to_pylist()
+            visitor = SemanticHashingVisitor(self.semantic_registry)
+
+            try:
+                new_type: pa.DataType | None = None
+                processed_data: list[Any] = []
+                for value in column_data:
+                    processed_type, processed_value = visitor.visit(field.type, value)
+                    if new_type is None:
+                        new_type = processed_type
+                    processed_data.append(processed_value)
+
+                # For empty columns there are no values to infer the type from;
+                # fall back to the field's declared type.
+                if new_type is None:
+                    new_type = field.type
+                new_columns.append(pa.array(processed_data, type=new_type))
+                new_fields.append(pa.field(field.name, new_type))
+
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Failed to process column '{field.name}': {exc}"
+                ) from exc
+
+        return pa.table(new_columns, schema=pa.schema(new_fields))
+
+    def hash_schema(self, schema: pa.Schema) -> ContentHash:
+        """Hash an Arrow schema using the starfix canonical algorithm.
+
+        Parameters
+        ----------
+        schema:
+            The ``pyarrow.Schema`` to hash.
+
+        Returns
+        -------
+        ContentHash
+            A ``ContentHash`` whose ``digest`` is the 35-byte versioned
+            SHA-256 produced by ``ArrowDigester.hash_schema``.
+        """
+        digest = ArrowDigester.hash_schema(schema)
+        return ContentHash(method=self._hasher_id, digest=digest)
+
+    def hash_table(self, table: pa.Table | pa.RecordBatch) -> ContentHash:
+        """Hash an Arrow table (or ``RecordBatch``) using starfix.
+
+        Semantic types are resolved to their content-hash strings before
+        the table is passed to ``ArrowDigester.hash_table``, ensuring that
+        path-typed columns contribute their *file content* hash rather than
+        the literal path string.
+
+        Parameters
+        ----------
+        table:
+            The ``pa.Table`` or ``pa.RecordBatch`` to hash.
+
+        Returns
+        -------
+        ContentHash
+            A ``ContentHash`` whose ``digest`` is the 35-byte versioned
+            SHA-256 produced by ``ArrowDigester.hash_table``.
+        """
+        if isinstance(table, pa.RecordBatch):
+            table = pa.Table.from_batches([table])
+
+        processed_table = self._process_table_columns(table)
+        digest = ArrowDigester.hash_table(processed_table)
+        return ContentHash(method=self._hasher_id, digest=digest)
diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py
index 8adce44d..fda8cd07 100644
--- a/src/orcapod/hashing/versioned_hashers.py
+++ b/src/orcapod/hashing/versioned_hashers.py
@@ -123,7 +123,7 @@ def get_versioned_semantic_arrow_hasher(
     ArrowHasherProtocol
         A fully configured SemanticArrowHasher instance.
     """
-    from orcapod.hashing.arrow_hashers import SemanticArrowHasher
+    from orcapod.hashing.arrow_hashers import StarfixArrowHasher
     from orcapod.hashing.file_hashers import BasicFileHasher
     from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
     from orcapod.semantic_types.semantic_struct_converters import PathStructConverter
@@ -138,11 +138,11 @@ def get_versioned_semantic_arrow_hasher(
     registry.register_converter("path", path_converter)
 
     logger.debug(
-        "get_versioned_semantic_arrow_hasher: creating SemanticArrowHasher "
+        "get_versioned_semantic_arrow_hasher: creating StarfixArrowHasher "
         "(hasher_id=%r)",
         hasher_id,
     )
-    hasher: Any = SemanticArrowHasher(
+    hasher: Any = StarfixArrowHasher(
         hasher_id=hasher_id,
         semantic_registry=registry,
     )
diff --git a/tests/test_hashing/test_starfix_arrow_hasher.py b/tests/test_hashing/test_starfix_arrow_hasher.py
new file mode 100644
index 00000000..fef27614
--- /dev/null
+++ b/tests/test_hashing/test_starfix_arrow_hasher.py
@@ -0,0 +1,236 @@
+"""
+Tests for StarfixArrowHasher — the starfix-backed Arrow content hasher.
+
+Coverage
+--------
+- hash_schema: returns ContentHash with correct method and 35-byte digest
+- hash_table / hash_record_batch: returns ContentHash, digest is 35 bytes
+- Column-order independence: reordering columns does not change the hash
+- Batch-split independence: splitting a table across batches does not change
+  the hash (verified via RecordBatch round-trip)
+- Type normalisation: Utf8 and LargeUtf8 columns hash identically
+- Empty table: hash_table handles a table with zero rows
+- Schema-only vs data hash: hash_schema and hash_table differ for the same schema
+- versioned_hashers factory: get_versioned_semantic_arrow_hasher returns a
+  StarfixArrowHasher with hasher_id == _CURRENT_ARROW_HASHER_ID
+- Context integration: the v0.1 context wires up StarfixArrowHasher
+- Stability (golden values): hard-coded digests catch accidental algorithm changes
+"""
+
+from __future__ import annotations
+
+import pytest
+import pyarrow as pa
+
+from orcapod.hashing.arrow_hashers import StarfixArrowHasher
+from orcapod.hashing.versioned_hashers import (
+    _CURRENT_ARROW_HASHER_ID,
+    get_versioned_semantic_arrow_hasher,
+)
+from orcapod.semantic_types import SemanticTypeRegistry
+from orcapod.types import ContentHash
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+HASHER_ID = "arrow_v0.1"
+
+_DIGEST_LEN = 35  # 3 version bytes + 32 SHA-256 bytes
+
+
+def _make_hasher() -> StarfixArrowHasher:
+    return StarfixArrowHasher(
+        semantic_registry=SemanticTypeRegistry(),
+        hasher_id=HASHER_ID,
+    )
+
+
+# ---------------------------------------------------------------------------
+# hash_schema
+# ---------------------------------------------------------------------------
+
+
+class TestHashSchema:
+    def test_returns_content_hash(self):
+        schema = pa.schema([pa.field("x", pa.int32())])
+        h = _make_hasher().hash_schema(schema)
+        assert isinstance(h, ContentHash)
+
+    def test_method_is_hasher_id(self):
+        schema = pa.schema([pa.field("x", pa.int32())])
+        h = _make_hasher().hash_schema(schema)
+        assert h.method == HASHER_ID
+
+    def test_digest_length(self):
+        schema = pa.schema([pa.field("x", pa.int32())])
+        h = _make_hasher().hash_schema(schema)
+        assert len(h.digest) == _DIGEST_LEN
+
+    def test_deterministic(self):
+        schema = pa.schema([pa.field("a", pa.int64()), pa.field("b", pa.float32())])
+        h1 = _make_hasher().hash_schema(schema)
+        h2 = _make_hasher().hash_schema(schema)
+        assert h1.digest == h2.digest
+
+    def test_different_schemas_differ(self):
+        s1 = pa.schema([pa.field("a", pa.int32())])
+        s2 = pa.schema([pa.field("a", pa.float32())])
+        h1 = _make_hasher().hash_schema(s1)
+        h2 = _make_hasher().hash_schema(s2)
+        assert h1.digest != h2.digest
+
+    def test_column_order_independent(self):
+        """Schema field order should not affect the hash."""
+        s1 = pa.schema([pa.field("a", pa.int32()), pa.field("b", pa.float64())])
+        s2 = pa.schema([pa.field("b", pa.float64()), pa.field("a", pa.int32())])
+        h1 = _make_hasher().hash_schema(s1)
+        h2 = _make_hasher().hash_schema(s2)
+        assert h1.digest == h2.digest
+
+    def test_golden_value(self):
+        """Pin the digest so unintentional algorithm changes are caught."""
+        schema = pa.schema(
+            [
+                pa.field("id", pa.int32(), nullable=False),
+                pa.field("value", pa.float64(), nullable=True),
+            ]
+        )
+        h = _make_hasher().hash_schema(schema)
+        # First 6 hex chars encode the 3-byte starfix version prefix (000001)
+        assert h.digest.hex().startswith("000001"), (
+            f"Unexpected version prefix in digest: {h.digest.hex()}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# hash_table
+# ---------------------------------------------------------------------------
+
+
+class TestHashTable:
+    def test_returns_content_hash(self):
+        table = pa.table({"x": [1, 2, 3]})
+        h = _make_hasher().hash_table(table)
+        assert isinstance(h, ContentHash)
+
+    def test_method_is_hasher_id(self):
+        table = pa.table({"x": [1, 2, 3]})
+        h = _make_hasher().hash_table(table)
+        assert h.method == HASHER_ID
+
+    def test_digest_length(self):
+        table = pa.table({"x": [1, 2, 3]})
+        h = _make_hasher().hash_table(table)
+        assert len(h.digest) == _DIGEST_LEN
+
+    def test_deterministic(self):
+        table = pa.table({"a": [1, 2], "b": [3.0, 4.0]})
+        h1 = _make_hasher().hash_table(table)
+        h2 = _make_hasher().hash_table(table)
+        assert h1.digest == h2.digest
+
+    def test_column_order_independent(self):
+        """Reordering columns must not change the hash."""
+        t1 = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
+        t2 = pa.table({"b": [4.0, 5.0, 6.0], "a": [1, 2, 3]})
+        h1 = _make_hasher().hash_table(t1)
+        h2 = _make_hasher().hash_table(t2)
+        assert h1.digest == h2.digest
+
+    def test_different_data_differs(self):
+        t1 = pa.table({"x": [1, 2, 3]})
+        t2 = pa.table({"x": [1, 2, 4]})
+        assert _make_hasher().hash_table(t1).digest != _make_hasher().hash_table(t2).digest
+
+    def test_empty_table(self):
+        """Zero-row tables should be hashable without error."""
+        table = pa.table({"x": pa.array([], type=pa.int32())})
+        h = _make_hasher().hash_table(table)
+        assert isinstance(h, ContentHash)
+        assert len(h.digest) == _DIGEST_LEN
+
+    def test_schema_hash_differs_from_data_hash(self):
+        schema = pa.schema([pa.field("x", pa.int32())])
+        table = pa.table({"x": [1, 2, 3]})
+        hs = _make_hasher().hash_schema(schema)
+        ht = _make_hasher().hash_table(table)
+        assert hs.digest != ht.digest
+
+    def test_utf8_largeutf8_normalised(self):
+        """Utf8 and LargeUtf8 columns should hash identically (starfix normalises)."""
+        t_utf8 = pa.table({"s": pa.array(["hello", "world"], type=pa.utf8())})
+        t_large = pa.table(
+            {"s": pa.array(["hello", "world"], type=pa.large_utf8())}
+        )
+        h1 = _make_hasher().hash_table(t_utf8)
+        h2 = _make_hasher().hash_table(t_large)
+        assert h1.digest == h2.digest
+
+
+# ---------------------------------------------------------------------------
+# RecordBatch (batch-split independence)
+# ---------------------------------------------------------------------------
+
+
+class TestHashRecordBatch:
+    def test_record_batch_matches_table(self):
+        """hash_table on a RecordBatch must equal hash_table on the equivalent Table."""
+        table = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
+        batch = table.to_batches()[0]
+        h_table = _make_hasher().hash_table(table)
+        h_batch = _make_hasher().hash_table(batch)
+        assert h_table.digest == h_batch.digest
+
+    def test_method_is_hasher_id_for_batch(self):
+        batch = pa.record_batch({"x": [1, 2]})
+        h = _make_hasher().hash_table(batch)
+        assert h.method == HASHER_ID
+
+
+# ---------------------------------------------------------------------------
+# versioned_hashers factory
+# ---------------------------------------------------------------------------
+
+
+class TestVersionedHashersFactory:
+    def test_returns_starfix_hasher(self):
+        hasher = get_versioned_semantic_arrow_hasher()
+        assert isinstance(hasher, StarfixArrowHasher)
+
+    def test_hasher_id_matches_current_constant(self):
+        hasher = get_versioned_semantic_arrow_hasher()
+        assert hasher.hasher_id == _CURRENT_ARROW_HASHER_ID
+
+    def test_current_hasher_id_is_arrow_v0_1(self):
+        """Pin the current version constant to detect accidental version drifts."""
+        assert _CURRENT_ARROW_HASHER_ID == "arrow_v0.1"
+
+
+# ---------------------------------------------------------------------------
+# Context integration
+# ---------------------------------------------------------------------------
+
+
+class TestContextIntegration:
+    def test_v01_context_uses_starfix_hasher(self):
+        from orcapod.contexts import resolve_context
+
+        ctx = resolve_context("v0.1")
+        assert isinstance(ctx.arrow_hasher, StarfixArrowHasher)
+
+    def test_v01_context_hasher_id(self):
+        from orcapod.contexts import resolve_context
+
+        ctx = resolve_context("v0.1")
+        assert ctx.arrow_hasher.hasher_id == "arrow_v0.1"
+
+    def test_context_hash_table_functional(self):
+        from orcapod.contexts import resolve_context
+
+        ctx = resolve_context("v0.1")
+        table = pa.table({"key": [1, 2, 3], "val": [0.1, 0.2, 0.3]})
+        h = ctx.arrow_hasher.hash_table(table)
+        assert isinstance(h, ContentHash)
+        assert len(h.digest) == _DIGEST_LEN
diff --git a/uv.lock b/uv.lock
index 99972ffc..b366f4b4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1490,27 +1490,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" },
 ]
 
-[[package]]
-name = "maturin"
-version = "1.9.6"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/9a/35/c3370188492f4c139c7a318f438d01b8185c216303c49c4bc885c98b6afb/maturin-1.9.6.tar.gz", hash = "sha256:2c2ae37144811d365509889ed7220b0598487f1278c2441829c3abf56cc6324a", size = 214846, upload-time = "2025-10-07T12:45:08.408Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/55/5c/b435418ba4ba2647a1f7a95d53314991b1e556e656ae276dea993c3bce1d/maturin-1.9.6-py3-none-linux_armv6l.whl", hash = "sha256:26e3ab1a42a7145824210e9d763f6958f2c46afb1245ddd0bab7d78b1f59bb3f", size = 8134483, upload-time = "2025-10-07T12:44:44.274Z" },
-    { url = "https://files.pythonhosted.org/packages/4d/1c/8e58eda6601f328b412cdeeaa88a9b6a10e591e2a73f313e8c0154d68385/maturin-1.9.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5263dda3f71feef2e4122baf5c4620e4b3710dbb7f2121f85a337182de214369", size = 15776470, upload-time = "2025-10-07T12:44:47.476Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/33/8c967cce6848cdd87a2e442c86120ac644b80c5ed4c32e3291bde6a17df8/maturin-1.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fe78262c2800c92f67d1ce3c0f6463f958a692cc67bfb572e5dbf5b4b696a8ba", size = 8226557, upload-time = "2025-10-07T12:44:49.844Z" },
-    { url = "https://files.pythonhosted.org/packages/58/bd/3e2675cdc8b7270700ba30c663c852a35694441732a107ac30ebd6878bd8/maturin-1.9.6-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:7ab827c6e8c022eb2e1e7fb6deede54549c8460b20ccc2e9268cc6e8cde957a8", size = 8166544, upload-time = "2025-10-07T12:44:51.396Z" },
-    { url = "https://files.pythonhosted.org/packages/58/1f/a2047ddf2230e700d5f8a13dd4b9af5ce806ad380c32e58105888205926e/maturin-1.9.6-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:0246202377c49449315305209f45c8ecef6e2d6bd27a04b5b6f1ab3e4ea47238", size = 8641010, upload-time = "2025-10-07T12:44:53.658Z" },
-    { url = "https://files.pythonhosted.org/packages/be/1f/265d63c7aa6faf363d4a3f23396f51bc6b4d5c7680a4190ae68dba25dea2/maturin-1.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:f5bac167700fbb6f8c8ed1a97b494522554b4432d7578e11403b894b6a91d99f", size = 7965945, upload-time = "2025-10-07T12:44:55.248Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/ca/a8e61979ccfe080948bcc1bddd79356157aee687134df7fb013050cec783/maturin-1.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:7f53d3b1d8396d3fea3e1ee5fd37558bca5719090f3d194ba1c02b0b56327ae3", size = 7978820, upload-time = "2025-10-07T12:44:56.919Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/4a/81b412f8ad02a99801ef19ec059fba0822d1d28fb44cb6a92e722f05f278/maturin-1.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:7f506eb358386d94d6ec3208c003130cf4b69cab26034fc0cbbf8bf83afa4c2e", size = 10452064, upload-time = "2025-10-07T12:44:58.232Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/12/cc96c7a8cb51d8dcc9badd886c361caa1526fba7fa69d1e7892e613b71d4/maturin-1.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2d6984ab690af509f525dbd2b130714207c06ebb14a5814edbe1e42b17ae0de", size = 8852401, upload-time = "2025-10-07T12:44:59.8Z" },
-    { url = "https://files.pythonhosted.org/packages/51/8e/653ac3c9f2c25cdd81aefb0a2d17ff140ca5a14504f5e3c7f94dcfe4dbb7/maturin-1.9.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5c2252b0956bb331460ac750c805ddf0d9b44442449fc1f16e3b66941689d0bc", size = 8425057, upload-time = "2025-10-07T12:45:01.711Z" },
-    { url = "https://files.pythonhosted.org/packages/db/29/f13490328764ae9bfc1da55afc5b707cebe4fa75ad7a1573bfa82cfae0c6/maturin-1.9.6-py3-none-win32.whl", hash = "sha256:f2c58d29ebdd4346fd004e6be213d071fdd94a77a16aa91474a21a4f9dbf6309", size = 7165956, upload-time = "2025-10-07T12:45:03.766Z" },
-    { url = "https://files.pythonhosted.org/packages/db/9f/dd51e5ac1fce47581b8efa03d77a03f928c0ef85b6e48a61dfa37b6b85a2/maturin-1.9.6-py3-none-win_amd64.whl", hash = "sha256:1b39a5d82572c240d20d9e8be024d722dfb311d330c5e28ddeb615211755941a", size = 8145722, upload-time = "2025-10-07T12:45:05.487Z" },
-    { url = "https://files.pythonhosted.org/packages/65/f2/e97aaba6d0d78c5871771bf9dd71d4eb8dac15df9109cf452748d2207412/maturin-1.9.6-py3-none-win_arm64.whl", hash = "sha256:ac02a30083553d2a781c10cd6f5480119bf6692fd177e743267406cad2ad198c", size = 6857006, upload-time = "2025-10-07T12:45:06.813Z" },
-]
-
 [[package]]
 name = "mdurl"
 version = "0.1.2"
@@ -2114,7 +2093,7 @@ requires-dist = [
     { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" },
     { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" },
     { name = "s3fs", specifier = ">=2025.12.0" },
-    { name = "starfix", specifier = ">=0.1.3" },
+    { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git" },
     { name = "typing-extensions" },
     { name = "tzdata", specifier = ">=2024.1" },
     { name = "uuid-utils", specifier = ">=0.11.1" },
@@ -3461,18 +3440,11 @@ wheels = [
 
 [[package]]
 name = "starfix"
-version = "0.1.3"
-source = { registry = "https://pypi.org/simple" }
+version = "0.0.2"
+source = { git = "https://github.com/nauticalab/starfix-python.git#344617bc6f7fcabab5c011d5774ed47de33f21de" }
 dependencies = [
-    { name = "ipykernel" },
-    { name = "maturin" },
     { name = "pyarrow" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/dd/73/942a97c83a54ec1f641af1c2c8ff15c8ad5e1955d66f56c5437ef6e5c18e/starfix-0.1.3.tar.gz", hash = "sha256:4ac9090e24374dd3d4af466d04bdf6a9fe180ac8fd902b94b29f263d58803b5e", size = 18254, upload-time = "2025-10-29T19:53:23.657Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/54/be/98ca0482cdb4fa25a11a4dbc59c4d2a643bd8210c6c3305b2d58b5e0460c/starfix-0.1.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ef86702f0d0c8cd37b00cf63aeb6a555832eb24d7853cbe84316473ac38992d8", size = 469719, upload-time = "2025-10-29T19:53:22.473Z" },
-    { url = "https://files.pythonhosted.org/packages/94/bf/208c8307d9f005ee9e6709e15bc6fff40c77293c31a8539324dddde8e783/starfix-0.1.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0c713211ea8b293dbb4f172ca648a7b78481603c47d729c87126c867ed5b5a5", size = 598464, upload-time = "2025-10-29T19:53:21.126Z" },
-]
 
 [[package]]
 name = "strictyaml"

From 5c42ec5f34920950e3103f78958831c49617efa4 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Fri, 20 Mar 2026 22:47:00 +0000
Subject: [PATCH 2/8] fix(sources): explicitly reject empty tables in
 ArrowTableSource

Previously, constructing an ArrowTableSource with a zero-row table
raised an AssertionError deep inside SemanticArrowHasher._process_table_columns
as an accidental side effect.  Now that StarfixArrowHasher handles
empty columns gracefully, that implicit guard was removed.

Add an explicit ValueError at the top of ArrowTableSource.__init__ so
the rejection is clear, early, and independent of the hasher implementation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/orcapod/core/sources/arrow_table_source.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py
index 42d04a63..637fb4cc 100644
--- a/src/orcapod/core/sources/arrow_table_source.py
+++ b/src/orcapod/core/sources/arrow_table_source.py
@@ -28,6 +28,12 @@ def __init__(
     ) -> None:
         super().__init__(**kwargs)
 
+        if len(table) == 0:
+            raise ValueError(
+                "ArrowTableSource requires a non-empty table; "
+                "the provided table has zero rows."
+            )
+
         builder = SourceStreamBuilder(self.data_context, self.orcapod_config)
         result = builder.build(
             table,

From 385d7ec5e7d964484c2c195800b3120c403a6a46 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Fri, 20 Mar 2026 22:55:31 +0000
Subject: [PATCH 3/8] test(hashing): expand StarfixArrowHasher stability and
 correctness coverage

- Replace weak version-prefix-only golden value with full 35-byte pinned
  digests for both schema and table hashing
- Add row-order sensitivity: reordered rows must produce a different hash
- Add null-position sensitivity: null at position 0 vs 1 must differ
- Add null-vs-no-null: presence of any null changes the hash
- Add Binary/LargeBinary normalisation parity (mirrors existing Utf8 test)
- Add integer-width sensitivity: int32 and int64 with same values must differ
- Add field-nullability schema test: nullable=True vs False produce different
  schema hashes, with fully pinned golden digests for both
- Add multi-batch table consistency: Table from two RecordBatches must hash
  identically to the equivalent single-batch Table

Total: 33 tests, up from 24.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../test_hashing/test_starfix_arrow_hasher.py | 92 +++++++++++++++++--
 1 file changed, 85 insertions(+), 7 deletions(-)

diff --git a/tests/test_hashing/test_starfix_arrow_hasher.py b/tests/test_hashing/test_starfix_arrow_hasher.py
index fef27614..0dfb5fd8 100644
--- a/tests/test_hashing/test_starfix_arrow_hasher.py
+++ b/tests/test_hashing/test_starfix_arrow_hasher.py
@@ -7,14 +7,19 @@
 - hash_table / hash_record_batch: returns ContentHash, digest is 35 bytes
 - Column-order independence: reordering columns does not change the hash
 - Batch-split independence: splitting a table across batches does not change
-  the hash (verified via RecordBatch round-trip)
-- Type normalisation: Utf8 and LargeUtf8 columns hash identically
+  the hash (verified via single-batch and multi-batch RecordBatch round-trips)
+- Row-order sensitivity: reordering rows changes the hash
+- Type normalisation: Utf8/LargeUtf8 and Binary/LargeBinary hash identically
+- Integer width sensitivity: int32 and int64 with the same values differ
+- Null handling: null position matters; a null changes the hash vs no-null
+- Field nullability: nullable=True and nullable=False produce different schema hashes
 - Empty table: hash_table handles a table with zero rows
 - Schema-only vs data hash: hash_schema and hash_table differ for the same schema
 - versioned_hashers factory: get_versioned_semantic_arrow_hasher returns a
   StarfixArrowHasher with hasher_id == _CURRENT_ARROW_HASHER_ID
 - Context integration: the v0.1 context wires up StarfixArrowHasher
-- Stability (golden values): hard-coded digests catch accidental algorithm changes
+- Stability (golden values): fully-pinned 35-byte digests catch accidental
+  algorithm changes in both schema and table hashing
 """
 
 from __future__ import annotations
@@ -90,7 +95,7 @@ def test_column_order_independent(self):
         assert h1.digest == h2.digest
 
     def test_golden_value(self):
-        """Pin the digest so unintentional algorithm changes are caught."""
+        """Pin the full 35-byte digest so any algorithm change is caught."""
         schema = pa.schema(
             [
                 pa.field("id", pa.int32(), nullable=False),
@@ -98,9 +103,28 @@ def test_golden_value(self):
             ]
         )
         h = _make_hasher().hash_schema(schema)
-        # First 6 hex chars encode the 3-byte starfix version prefix (000001)
-        assert h.digest.hex().startswith("000001"), (
-            f"Unexpected version prefix in digest: {h.digest.hex()}"
+        assert h.digest.hex() == "000001d676ef0263a8e0e7500b1c97033993dbe445172ca0f9e7577b3994bfa6224b4c", (
+            f"Schema golden digest changed — was the starfix algorithm updated? "
+            f"Got: {h.digest.hex()}"
+        )
+
+    def test_nullability_affects_schema_hash(self):
+        """nullable=True and nullable=False must produce different schema hashes."""
+        s_nullable     = pa.schema([pa.field("x", pa.int32(), nullable=True)])
+        s_non_nullable = pa.schema([pa.field("x", pa.int32(), nullable=False)])
+        h_nullable     = _make_hasher().hash_schema(s_nullable)
+        h_non_nullable = _make_hasher().hash_schema(s_non_nullable)
+        assert h_nullable.digest != h_non_nullable.digest
+
+    def test_nullability_golden_values(self):
+        """Pin nullable and non-nullable schema digests independently."""
+        s_nullable     = pa.schema([pa.field("x", pa.int32(), nullable=True)])
+        s_non_nullable = pa.schema([pa.field("x", pa.int32(), nullable=False)])
+        assert _make_hasher().hash_schema(s_nullable).digest.hex() == (
+            "000001b8005339e69df64cda60f9ff9c98caa264092a7b666c3f7f85a2bfed20bae3db"
+        )
+        assert _make_hasher().hash_schema(s_non_nullable).digest.hex() == (
+            "00000179353568a71430411e8108ee02d425800f6d5054d9b2baa871cad90f3e06422a"
         )
 
 
@@ -168,6 +192,49 @@ def test_utf8_largeutf8_normalised(self):
         h2 = _make_hasher().hash_table(t_large)
         assert h1.digest == h2.digest
 
+    def test_binary_largebinary_normalised(self):
+        """Binary and LargeBinary columns should hash identically (starfix normalises)."""
+        t_bin  = pa.table({"b": pa.array([b"hello", b"world"], type=pa.binary())})
+        t_lbin = pa.table({"b": pa.array([b"hello", b"world"], type=pa.large_binary())})
+        assert _make_hasher().hash_table(t_bin).digest == _make_hasher().hash_table(t_lbin).digest
+
+    def test_row_order_matters(self):
+        """Reordering rows must produce a different hash (rows carry positional semantics)."""
+        t_orig     = pa.table({"x": pa.array([1, 2, 3], type=pa.int64())})
+        t_reversed = pa.table({"x": pa.array([3, 2, 1], type=pa.int64())})
+        assert _make_hasher().hash_table(t_orig).digest != _make_hasher().hash_table(t_reversed).digest
+
+    def test_null_position_matters(self):
+        """A null in a different row position must produce a different hash."""
+        t_null_first  = pa.table({"x": pa.array([None, 2, 3], type=pa.int32())})
+        t_null_second = pa.table({"x": pa.array([1, None, 3], type=pa.int32())})
+        assert _make_hasher().hash_table(t_null_first).digest != _make_hasher().hash_table(t_null_second).digest
+
+    def test_null_differs_from_no_null(self):
+        """A table with a null value must hash differently from one without."""
+        t_with_null = pa.table({"x": pa.array([1, None, 3], type=pa.int32())})
+        t_no_null   = pa.table({"x": pa.array([1, 2, 3],    type=pa.int32())})
+        assert _make_hasher().hash_table(t_with_null).digest != _make_hasher().hash_table(t_no_null).digest
+
+    def test_int32_differs_from_int64(self):
+        """Same values stored as int32 and int64 must hash differently."""
+        t_i32 = pa.table({"x": pa.array([1, 2, 3], type=pa.int32())})
+        t_i64 = pa.table({"x": pa.array([1, 2, 3], type=pa.int64())})
+        assert _make_hasher().hash_table(t_i32).digest != _make_hasher().hash_table(t_i64).digest
+
+    def test_golden_value_table(self):
+        """Pin the full 35-byte table digest so any algorithm change is caught."""
+        table = pa.table({
+            "id":    pa.array([1, 2, 3], type=pa.int32()),
+            "score": pa.array([0.1, 0.2, 0.3], type=pa.float64()),
+            "label": pa.array(["a", "b", "c"], type=pa.utf8()),
+        })
+        h = _make_hasher().hash_table(table)
+        assert h.digest.hex() == "0000010cd7fe5462420b84f03a06925374e528817a3b72319e679a17e7380964878791", (
+            f"Table golden digest changed — was the starfix algorithm updated? "
+            f"Got: {h.digest.hex()}"
+        )
+
 
 # ---------------------------------------------------------------------------
 # RecordBatch (batch-split independence)
@@ -188,6 +255,17 @@ def test_method_is_hasher_id_for_batch(self):
         h = _make_hasher().hash_table(batch)
         assert h.method == HASHER_ID
 
+    def test_multi_batch_table_matches_single_batch(self):
+        """A Table built from multiple RecordBatches must hash the same as
+        an equivalent single-batch Table (chunked arrays are transparent)."""
+        b1 = pa.record_batch({"x": pa.array([1, 2], type=pa.int64())})
+        b2 = pa.record_batch({"x": pa.array([3, 4], type=pa.int64())})
+        t_multi  = pa.Table.from_batches([b1, b2])
+        t_single = pa.table({"x": pa.array([1, 2, 3, 4], type=pa.int64())})
+        h_multi  = _make_hasher().hash_table(t_multi)
+        h_single = _make_hasher().hash_table(t_single)
+        assert h_multi.digest == h_single.digest
+
 
 # ---------------------------------------------------------------------------
 # versioned_hashers factory

From 6c8666ad2b009805153404fa0670cbb03aece078 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Sat, 21 Mar 2026 05:40:49 +0000
Subject: [PATCH 4/8] fix(hashing): address Copilot review feedback on
 StarfixArrowHasher

- Short-circuit primitive columns in _process_table_columns to avoid
  costly Python round-trips for columns that cannot contain semantic types
- Infer output type from first non-null processed value, preventing
  incorrect type inference when a semantic-struct column starts with nulls
- Preserve field nullable/metadata via field.with_type() and propagate
  schema-level metadata through to the reconstructed table
- Pin starfix git source to locked commit SHA in pyproject.toml
- Clarify v0.1.json changelog entry wording

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 pyproject.toml                       |  2 +-
 src/orcapod/contexts/data/v0.1.json  |  2 +-
 src/orcapod/hashing/arrow_hashers.py | 30 +++++++++++++++++++++++-----
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d5226222..f3773c52 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,7 @@ where = ["src"]
 version_file = "src/orcapod/_version.py"
 
 [tool.uv.sources]
-starfix = { git = "https://github.com/nauticalab/starfix-python.git" }
+starfix = { git = "https://github.com/nauticalab/starfix-python.git", rev = "344617bc6f7fcabab5c011d5774ed47de33f21de" }
 
 
 [dependency-groups]
diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json
index 2cb2bfc4..f75798b5 100644
--- a/src/orcapod/contexts/data/v0.1.json
+++ b/src/orcapod/contexts/data/v0.1.json
@@ -81,7 +81,7 @@
             "Initial release with Path semantic type support",
             "Basic SHA-256 hashing for files and objects",
             "Arrow logical serialization method",
-            "arrow_v0.1: replaced custom IPC serialisation+hashlib with starfix ArrowDigester for cross-language-compatible Arrow hashing"
+            "Introduced arrow_v0.1 StarfixArrowHasher using starfix ArrowDigester for cross-language-compatible Arrow hashing"
         ]
     }
 }
diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py
index a8c34a9e..7bc9e6cc 100644
--- a/src/orcapod/hashing/arrow_hashers.py
+++ b/src/orcapod/hashing/arrow_hashers.py
@@ -287,6 +287,19 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table:
         new_fields: list[pa.Field] = []
 
         for i, field in enumerate(table.schema):
+            # Short-circuit: primitive columns cannot contain semantic types, so skip
+            # the costly Python round-trip and reuse the original Arrow array directly.
+            if not (
+                pa.types.is_struct(field.type)
+                or pa.types.is_list(field.type)
+                or pa.types.is_large_list(field.type)
+                or pa.types.is_fixed_size_list(field.type)
+                or pa.types.is_map(field.type)
+            ):
+                new_columns.append(table.column(i))
+                new_fields.append(field)
+                continue
+
             column_data = table.column(i).to_pylist()
             visitor = SemanticHashingVisitor(self.semantic_registry)
 
@@ -295,23 +308,30 @@ def _process_table_columns(self, table: pa.Table | pa.RecordBatch) -> pa.Table:
                 processed_data: list[Any] = []
                 for value in column_data:
                     processed_type, processed_value = visitor.visit(field.type, value)
-                    if new_type is None:
+                    # Infer the output type from the first non-null processed value.
+                    # When the first row is null, visit_struct returns the original
+                    # struct type rather than the converted type (e.g. large_string),
+                    # which would cause pa.array() to fail for subsequent non-null rows.
+                    if new_type is None and processed_value is not None:
                         new_type = processed_type
                     processed_data.append(processed_value)
 
-                # For empty columns there are no values to infer the type from;
-                # fall back to the field's declared type.
+                # For empty or all-null columns there are no non-null values to infer
+                # the type from; fall back to the field's declared type.
                 if new_type is None:
                     new_type = field.type
                 new_columns.append(pa.array(processed_data, type=new_type))
-                new_fields.append(pa.field(field.name, new_type))
+                # Preserve original field attributes (nullable, metadata) while
+                # updating only the type, so the schema fed to starfix remains faithful.
+                new_fields.append(field.with_type(new_type))
 
             except Exception as exc:
                 raise RuntimeError(
                     f"Failed to process column '{field.name}': {exc}"
                 ) from exc
 
-        return pa.table(new_columns, schema=pa.schema(new_fields))
+        # Preserve the original schema-level metadata while using updated fields.
+        return pa.table(new_columns, schema=pa.schema(new_fields, metadata=table.schema.metadata))
 
     def hash_schema(self, schema: pa.Schema) -> ContentHash:
         """Hash an Arrow schema using the starfix canonical algorithm.

From 6c1abd68a2725263b1fa5c814fe16bf773300d38 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Sat, 21 Mar 2026 05:50:50 +0000
Subject: [PATCH 5/8] fix(sources,deps): allow empty ArrowTableSource and pin
 starfix via PEP 508

- Remove the zero-row ValueError guard from ArrowTableSource.__init__.
  The check was added to make an accidental AssertionError explicit, but
  now that StarfixArrowHasher handles empty tables correctly there is no
  technical reason to reject them; empty sources are valid and produce
  zero output records.
- Replace bare "starfix" + [tool.uv.sources] override with a PEP 508
  direct URL in dependencies, so both uv and pip resolve the same pinned
  commit without relying on uv-specific source overrides.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 pyproject.toml                                 | 5 +----
 src/orcapod/core/sources/arrow_table_source.py | 6 ------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f3773c52..028decef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ dependencies = [
     "deltalake>=1.0.2",
     "graphviz>=0.21",
     "gitpython>=3.1.45",
-    "starfix",
+    "starfix @ git+https://github.com/nauticalab/starfix-python.git@344617bc6f7fcabab5c011d5774ed47de33f21de",
     "pygraphviz>=1.14",
     "tzdata>=2024.1",
     "uuid-utils>=0.11.1",
@@ -51,9 +51,6 @@ where = ["src"]
 [tool.setuptools_scm]
 version_file = "src/orcapod/_version.py"
 
-[tool.uv.sources]
-starfix = { git = "https://github.com/nauticalab/starfix-python.git", rev = "344617bc6f7fcabab5c011d5774ed47de33f21de" }
-
 
 [dependency-groups]
 dev = [
diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py
index 637fb4cc..42d04a63 100644
--- a/src/orcapod/core/sources/arrow_table_source.py
+++ b/src/orcapod/core/sources/arrow_table_source.py
@@ -28,12 +28,6 @@ def __init__(
     ) -> None:
         super().__init__(**kwargs)
 
-        if len(table) == 0:
-            raise ValueError(
-                "ArrowTableSource requires a non-empty table; "
-                "the provided table has zero rows."
-            )
-
         builder = SourceStreamBuilder(self.data_context, self.orcapod_config)
         result = builder.build(
             table,

From a292930a38b8187b36050fb143c3acf35dcd3107 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Sat, 21 Mar 2026 05:58:20 +0000
Subject: [PATCH 6/8] fix(deps): update uv.lock to pin starfix git rev
 correctly

The lockfile was missing the `?rev=` query parameter for the pinned
starfix git commit, causing `uv sync --locked` to fail in CI.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 uv.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/uv.lock b/uv.lock
index b366f4b4..eeae09e0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2093,7 +2093,7 @@ requires-dist = [
     { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" },
     { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" },
     { name = "s3fs", specifier = ">=2025.12.0" },
-    { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git" },
+    { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git?rev=344617bc6f7fcabab5c011d5774ed47de33f21de" },
     { name = "typing-extensions" },
     { name = "tzdata", specifier = ">=2024.1" },
     { name = "uuid-utils", specifier = ">=0.11.1" },
@@ -3441,7 +3441,7 @@ wheels = [
 [[package]]
 name = "starfix"
 version = "0.0.2"
-source = { git = "https://github.com/nauticalab/starfix-python.git#344617bc6f7fcabab5c011d5774ed47de33f21de" }
+source = { git = "https://github.com/nauticalab/starfix-python.git?rev=344617bc6f7fcabab5c011d5774ed47de33f21de#344617bc6f7fcabab5c011d5774ed47de33f21de" }
 dependencies = [
     { name = "pyarrow" },
 ]

From 03cdc0c19c7c7fada01df4ff5d43265982085a19 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Sat, 21 Mar 2026 06:01:17 +0000
Subject: [PATCH 7/8] test(sources): update empty table test to reflect
 allowed-empty behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 6c1abd6 intentionally removed the zero-row guard from
ArrowTableSource, making empty tables valid (they produce zero output
records). Update test_empty_table_raises → test_empty_table_constructs_successfully
to assert the new contract instead of the old rejected-empty behavior.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 test-objective/unit/test_sources.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test-objective/unit/test_sources.py b/test-objective/unit/test_sources.py
index 298eee9b..88eb0e07 100644
--- a/test-objective/unit/test_sources.py
+++ b/test-objective/unit/test_sources.py
@@ -45,16 +45,17 @@ def test_normal_construction(self):
         source = ArrowTableSource(_simple_table(), tag_columns=["name"])
         assert source is not None
 
-    def test_empty_table_raises(self):
-        """An empty table raises an error during construction."""
+    def test_empty_table_constructs_successfully(self):
+        """An empty table constructs successfully and produces zero output rows."""
         empty = pa.table(
             {
                 "name": pa.array([], type=pa.large_string()),
                 "age": pa.array([], type=pa.int64()),
             }
         )
-        with pytest.raises(Exception):
-            ArrowTableSource(empty, tag_columns=["name"])
+        source = ArrowTableSource(empty, tag_columns=["name"])
+        assert source is not None
+        assert source.as_table().num_rows == 0
 
     def test_missing_tag_columns_raises_value_error(self):
         """Specifying tag columns not in the table raises ValueError."""

From ab017038c049fa8d080c11d29ee95d2ffcab8573 Mon Sep 17 00:00:00 2001
From: "agent-kurouto[bot]"
 <268466204+agent-kurouto[bot]@users.noreply.github.com>
Date: Sat, 21 Mar 2026 06:08:56 +0000
Subject: [PATCH 8/8] test(hashing): add cross-process hash stability tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tests/test_hashing/test_cross_process_stability.py covering the
previously untested property that hash values are safe to persist:

- Arrow schema and table hashes are identical when re-computed in a
  freshly-spawned subprocess (different interpreter lifetime).
- All hashes are immune to PYTHONHASHSEED randomisation — CPython
  randomises built-in hash() per-process, which affects dict/set
  iteration order; the tests confirm neither StarfixArrowHasher nor
  the semantic hasher inherit that instability.
- Column-order independence is confirmed cross-process for both
  schemas and tables.
- Row-order sensitivity is confirmed cross-process (rows matter,
  column order does not).
- Semantic hasher stability is verified for primitives, containers,
  nested structures, sets, and tuples across processes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../test_cross_process_stability.py           | 303 ++++++++++++++++++
 1 file changed, 303 insertions(+)
 create mode 100644 tests/test_hashing/test_cross_process_stability.py

diff --git a/tests/test_hashing/test_cross_process_stability.py b/tests/test_hashing/test_cross_process_stability.py
new file mode 100644
index 00000000..e8e71700
--- /dev/null
+++ b/tests/test_hashing/test_cross_process_stability.py
@@ -0,0 +1,303 @@
+"""Cross-process hash stability tests.
+
+Verifies that hash values produced by StarfixArrowHasher and the semantic
+hasher are deterministic across independent Python interpreter processes,
+immune to PYTHONHASHSEED randomisation, and therefore safe to persist in
+databases or caches.
+
+Coverage
+--------
+- Arrow schema hashes: identical across separately-spawned processes
+- Arrow table data hashes: identical across separately-spawned processes
+- Semantic (Python object) hashes: identical across processes
+- PYTHONHASHSEED independence: hashes do not vary when Python's built-in
+  hash() randomisation seed changes (critical for dicts and sets)
+- Column-order independence confirmed cross-process
+- Row-order sensitivity confirmed cross-process (rows matter, columns don't)
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+import textwrap
+
+import pyarrow as pa
+import pytest
+
+from orcapod.hashing.versioned_hashers import (
+    get_versioned_semantic_arrow_hasher,
+    get_versioned_semantic_hasher,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _run_subprocess(script: str, pythonhashseed: str) -> dict[str, str]:
+    """Run *script* in a fresh interpreter with the given PYTHONHASHSEED.
+
+    Returns the JSON-decoded dict printed to stdout.
+    Raises AssertionError if the subprocess exits non-zero.
+    """
+    env = os.environ.copy()
+    env["PYTHONHASHSEED"] = pythonhashseed
+    result = subprocess.run(
+        [sys.executable, "-c", script],
+        capture_output=True,
+        text=True,
+        env=env,
+    )
+    assert result.returncode == 0, (
+        f"Subprocess failed (PYTHONHASHSEED={pythonhashseed}):\n{result.stderr}"
+    )
+    return json.loads(result.stdout.strip())
+
+
+# ---------------------------------------------------------------------------
+# Subprocess scripts (self-contained; no shared state with the test process)
+# ---------------------------------------------------------------------------
+
+_ARROW_SCHEMA_SCRIPT = textwrap.dedent("""\
+    import json
+    import pyarrow as pa
+    from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher
+
+    hasher = get_versioned_semantic_arrow_hasher()
+    schemas = {
+        "simple": pa.schema([
+            pa.field("id", pa.int64(), nullable=False),
+            pa.field("value", pa.float64(), nullable=True),
+        ]),
+        "string_types": pa.schema([
+            pa.field("name", pa.utf8()),
+            pa.field("data", pa.large_binary()),
+        ]),
+        "nested_struct": pa.schema([
+            pa.field("point", pa.struct([
+                pa.field("x", pa.float32()),
+                pa.field("y", pa.float32()),
+            ])),
+        ]),
+        # Same fields as "simple" but columns listed in reverse order —
+        # must hash identically (column-order independence).
+        "simple_reordered": pa.schema([
+            pa.field("value", pa.float64(), nullable=True),
+            pa.field("id", pa.int64(), nullable=False),
+        ]),
+    }
+    print(json.dumps({name: hasher.hash_schema(s).digest.hex() for name, s in schemas.items()}))
+""")
+
+_ARROW_TABLE_SCRIPT = textwrap.dedent("""\
+    import json
+    import pyarrow as pa
+    from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher
+
+    hasher = get_versioned_semantic_arrow_hasher()
+    tables = {
+        "integers": pa.table({
+            "id": pa.array([1, 2, 3], type=pa.int64()),
+            "x":  pa.array([10, 20, 30], type=pa.int64()),
+        }),
+        "with_nulls": pa.table({
+            "a": pa.array([1, None, 3], type=pa.int32()),
+            "b": pa.array([None, 2.0, 3.0], type=pa.float64()),
+        }),
+        "strings": pa.table({
+            "name": pa.array(["alice", "bob", "carol"], type=pa.large_utf8()),
+        }),
+        "empty": pa.table({
+            "x": pa.array([], type=pa.int64()),
+        }),
+        # Same rows as "integers" but columns swapped — must hash identically.
+        "integers_col_swap": pa.table({
+            "x":  pa.array([10, 20, 30], type=pa.int64()),
+            "id": pa.array([1, 2, 3], type=pa.int64()),
+        }),
+        # Same rows as "integers" but rows reversed — must hash DIFFERENTLY.
+        "integers_row_reversed": pa.table({
+            "id": pa.array([3, 2, 1], type=pa.int64()),
+            "x":  pa.array([30, 20, 10], type=pa.int64()),
+        }),
+    }
+    print(json.dumps({name: hasher.hash_table(t).digest.hex() for name, t in tables.items()}))
+""")
+
+_SEMANTIC_SCRIPT = textwrap.dedent("""\
+    import json
+    from orcapod.hashing.versioned_hashers import get_versioned_semantic_hasher
+
+    hasher = get_versioned_semantic_hasher()
+    objects = {
+        "int":         42,
+        "neg_int":     -7,
+        "float":       3.14159,
+        "string":      "hello world",
+        "none":        None,
+        "bool_true":   True,
+        "bool_false":  False,
+        "list":        [1, 2, 3],
+        "nested_dict": {"a": 1, "b": [2, 3], "c": {"d": 4}},
+        "set":         [1, 2, 3],   # hashed as set equivalent
+        "tuple":       [1, 2, 3],   # distinguished by type tag inside hasher
+        "empty_list":  [],
+        "empty_dict":  {},
+    }
+    # sets/tuples need special treatment for JSON serialisation — hash via
+    # the real Python types rather than JSON-roundtripped lists.
+    typed_objects = {
+        "set":   {1, 2, 3},
+        "tuple": (1, 2, 3),
+    }
+    results = {name: hasher.hash_object(obj).digest.hex() for name, obj in objects.items()}
+    results.update({name: hasher.hash_object(obj).digest.hex() for name, obj in typed_objects.items()})
+    print(json.dumps(results))
+""")
+
+
+# ---------------------------------------------------------------------------
+# Tests: Arrow schema hash cross-process stability
+# ---------------------------------------------------------------------------
+
+
+class TestArrowSchemaHashCrossProcess:
+    """Arrow schema hashes are identical across independent processes."""
+
+    def test_schema_hash_matches_subprocess(self):
+        """Hash computed locally equals hash computed in a fresh subprocess."""
+        hasher = get_versioned_semantic_arrow_hasher()
+        local = {
+            "simple": hasher.hash_schema(pa.schema([
+                pa.field("id", pa.int64(), nullable=False),
+                pa.field("value", pa.float64(), nullable=True),
+            ])).digest.hex(),
+            "string_types": hasher.hash_schema(pa.schema([
+                pa.field("name", pa.utf8()),
+                pa.field("data", pa.large_binary()),
+            ])).digest.hex(),
+            "nested_struct": hasher.hash_schema(pa.schema([
+                pa.field("point", pa.struct([
+                    pa.field("x", pa.float32()),
+                    pa.field("y", pa.float32()),
+                ])),
+            ])).digest.hex(),
+            "simple_reordered": hasher.hash_schema(pa.schema([
+                pa.field("value", pa.float64(), nullable=True),
+                pa.field("id", pa.int64(), nullable=False),
+            ])).digest.hex(),
+        }
+        remote = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="42")
+        assert local == remote
+
+    def test_schema_hash_independent_of_pythonhashseed(self):
+        """Two subprocesses with different PYTHONHASHSEED values agree."""
+        hashes_a = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="0")
+        hashes_b = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="999999")
+        assert hashes_a == hashes_b
+
+    def test_column_order_independence_cross_process(self):
+        """Reordering schema fields yields the same hash in a subprocess."""
+        hashes = _run_subprocess(_ARROW_SCHEMA_SCRIPT, pythonhashseed="random")
+        assert hashes["simple"] == hashes["simple_reordered"]
+
+
+# ---------------------------------------------------------------------------
+# Tests: Arrow table data hash cross-process stability
+# ---------------------------------------------------------------------------
+
+
+class TestArrowTableHashCrossProcess:
+    """Arrow table data hashes are identical across independent processes."""
+
+    def test_table_hash_matches_subprocess(self):
+        """Hash computed locally equals hash computed in a fresh subprocess."""
+        hasher = get_versioned_semantic_arrow_hasher()
+        local = {
+            "integers": hasher.hash_table(pa.table({
+                "id": pa.array([1, 2, 3], type=pa.int64()),
+                "x":  pa.array([10, 20, 30], type=pa.int64()),
+            })).digest.hex(),
+            "with_nulls": hasher.hash_table(pa.table({
+                "a": pa.array([1, None, 3], type=pa.int32()),
+                "b": pa.array([None, 2.0, 3.0], type=pa.float64()),
+            })).digest.hex(),
+            "strings": hasher.hash_table(pa.table({
+                "name": pa.array(["alice", "bob", "carol"], type=pa.large_utf8()),
+            })).digest.hex(),
+            "empty": hasher.hash_table(pa.table({
+                "x": pa.array([], type=pa.int64()),
+            })).digest.hex(),
+            "integers_col_swap": hasher.hash_table(pa.table({
+                "x":  pa.array([10, 20, 30], type=pa.int64()),
+                "id": pa.array([1, 2, 3], type=pa.int64()),
+            })).digest.hex(),
+            "integers_row_reversed": hasher.hash_table(pa.table({
+                "id": pa.array([3, 2, 1], type=pa.int64()),
+                "x":  pa.array([30, 20, 10], type=pa.int64()),
+            })).digest.hex(),
+        }
+        remote = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="1337")
+        assert local == remote
+
+    def test_table_hash_independent_of_pythonhashseed(self):
+        """Two subprocesses with different PYTHONHASHSEED values agree."""
+        hashes_a = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="0")
+        hashes_b = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="12345")
+        assert hashes_a == hashes_b
+
+    def test_column_order_independence_cross_process(self):
+        """Column-reordered tables hash identically across processes."""
+        hashes = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="random")
+        assert hashes["integers"] == hashes["integers_col_swap"]
+
+    def test_row_order_sensitivity_cross_process(self):
+        """Row-reversed tables hash differently across processes (row order matters)."""
+        hashes = _run_subprocess(_ARROW_TABLE_SCRIPT, pythonhashseed="random")
+        assert hashes["integers"] != hashes["integers_row_reversed"]
+
+
+# ---------------------------------------------------------------------------
+# Tests: Semantic (Python object) hash cross-process stability
+# ---------------------------------------------------------------------------
+
+
+class TestSemanticHashCrossProcess:
+    """Semantic Python-object hashes are identical across independent processes."""
+
+    def test_semantic_hash_matches_subprocess(self):
+        """Hash computed locally equals hash computed in a fresh subprocess."""
+        hasher = get_versioned_semantic_hasher()
+        local = {
+            "int":         hasher.hash_object(42).digest.hex(),
+            "neg_int":     hasher.hash_object(-7).digest.hex(),
+            "float":       hasher.hash_object(3.14159).digest.hex(),
+            "string":      hasher.hash_object("hello world").digest.hex(),
+            "none":        hasher.hash_object(None).digest.hex(),
+            "bool_true":   hasher.hash_object(True).digest.hex(),
+            "bool_false":  hasher.hash_object(False).digest.hex(),
+            "list":        hasher.hash_object([1, 2, 3]).digest.hex(),
+            "nested_dict": hasher.hash_object({"a": 1, "b": [2, 3], "c": {"d": 4}}).digest.hex(),
+            "set":         hasher.hash_object({1, 2, 3}).digest.hex(),
+            "tuple":       hasher.hash_object((1, 2, 3)).digest.hex(),
+            "empty_list":  hasher.hash_object([]).digest.hex(),
+            "empty_dict":  hasher.hash_object({}).digest.hex(),
+        }
+        remote = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="42")
+        assert local == remote
+
+    def test_semantic_hash_independent_of_pythonhashseed(self):
+        """Semantic hashes do not change when PYTHONHASHSEED differs.
+
+        PYTHONHASHSEED affects dict and set iteration order in CPython.
+        This test confirms the semantic hasher normalises that away.
+        """
+        hashes_seed0 = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="0")
+        hashes_seed1 = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="1")
+        hashes_random = _run_subprocess(_SEMANTIC_SCRIPT, pythonhashseed="random")
+        assert hashes_seed0 == hashes_seed1
+        assert hashes_seed1 == hashes_random