Skip to content

Commit ec77b47

Browse files
Merge pull request #1472 from datajoint/feat/1458-renderable-protocol
feat(#1458): SparkAdapter Codec Protocol
2 parents c6c4681 + ae8cdf1 commit ec77b47

3 files changed

Lines changed: 200 additions & 0 deletions

File tree

src/datajoint/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
"get_codec",
5252
"ObjectRef",
5353
"NpyRef",
54+
# SparkAdapter Codec Protocol
55+
"SparkAdapter",
5456
# Storage Adapter API
5557
"StorageAdapter",
5658
"get_storage_adapter",
@@ -87,6 +89,7 @@
8789
from .instance import Instance, _ConfigProxy, _get_singleton_connection, _global_config, _check_thread_safe
8890
from .logging import logger
8991
from .objectref import ObjectRef
92+
from .spark import SparkAdapter
9093
from .storage_adapter import StorageAdapter, get_storage_adapter
9194
from .schemas import _Schema, VirtualModule, list_schemas, virtual_schema
9295
from .autopopulate import AutoPopulate

src/datajoint/spark.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
SparkAdapter Codec Protocol.
3+
4+
Opt-in contract for codecs that adapt their decoded values to Spark-native
5+
types — primitives, lists, dicts, and nested combinations.
6+
7+
Codecs implement this method when they want their column eligible for
8+
downstream typed-query systems (Spark SQL, Delta Sharing, BI tools).
9+
Generic codecs like ``<blob@>`` and ``<hash@>`` deliberately do not
10+
implement it: their decoded values can be arbitrary Python objects with
11+
no fixed Spark-native shape.
12+
13+
The contract is intentionally a Protocol rather than an abstract method
14+
on :class:`datajoint.Codec`:
15+
16+
- Generic codecs need no acknowledgement (no ``NotImplementedError`` stubs).
17+
- Existing plugin codecs continue to work unchanged.
18+
- Codec authors opt in by adding the method on their own release cadence.
19+
- Consumers detect support structurally via ``isinstance(codec, SparkAdapter)``.
20+
21+
See ``datajoint-docs/src/reference/specs/spark-adapter.md`` for the
22+
normative specification (signature, return-value shape constraints,
23+
worked codec examples).
24+
"""
25+
26+
from __future__ import annotations
27+
28+
from typing import Any, Protocol, runtime_checkable
29+
30+
31+
@runtime_checkable
32+
class SparkAdapter(Protocol):
33+
"""
34+
A codec that adapts its decoded values to Spark-native types.
35+
36+
Opt-in. Codecs implementing this method declare that their decoded
37+
values can be expressed as primitives, lists, or dicts of the same —
38+
i.e., shapes that map cleanly to Spark's ``StructType`` /
39+
``ArrayType`` / ``MapType``.
40+
41+
Consumers (e.g., a Databricks silver-layer publish pipeline) check
42+
``isinstance(codec, SparkAdapter)`` per column to determine eligibility.
43+
44+
Allowed return-value shapes:
45+
46+
- Primitives: ``bool``, ``int``, ``float``, ``str``, ``bytes``,
47+
``None``, ``datetime.date``, ``datetime.datetime``.
48+
- ``list[T]`` where ``T`` is any allowed shape (→ Spark ``ArrayType``).
49+
- ``dict[str, T]`` where ``T`` is any allowed shape (→ Spark
50+
``StructType`` or ``MapType``, consumer-decided).
51+
52+
NumPy arrays must be converted to lists; no tuples, sets, or custom
53+
objects in the return value.
54+
55+
Examples
56+
--------
57+
A 1D float-array codec (shipped as a plugin, not in datajoint-python)::
58+
59+
class FloatArrayCodec(dj.Codec):
60+
name = "float_array"
61+
62+
def encode(self, value, *, key=None, store_name=None): ...
63+
def decode(self, stored, *, key=None) -> np.ndarray: ...
64+
65+
def to_spark(self, decoded: np.ndarray, *, key=None) -> list[float]:
66+
return decoded.tolist() # → Spark ARRAY<DOUBLE>
67+
68+
Eligibility check::
69+
70+
from datajoint import SparkAdapter
71+
isinstance(FloatArrayCodec(), SparkAdapter) # True
72+
"""
73+
74+
def to_spark(self, decoded: Any, *, key: dict | None = None) -> Any:
75+
"""
76+
Adapt a decoded codec value to a Spark-native shape.
77+
78+
Parameters
79+
----------
80+
decoded : Any
81+
The Python value produced by the codec's ``decode()``.
82+
key : dict, optional
83+
Optional context dict — same shape as ``Codec.encode``'s
84+
``key`` parameter. Most codecs ignore it.
85+
86+
Returns
87+
-------
88+
Any
89+
A value composed entirely of allowed Spark-native shapes
90+
(see class docstring).
91+
"""
92+
...

tests/unit/test_spark.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
"""
2+
Unit tests for the SparkAdapter Codec Protocol (#1458).
3+
4+
The Protocol is a structural-typing contract — codecs opt in by
5+
implementing ``to_spark`` and consumers detect support via
6+
``isinstance(codec, SparkAdapter)``. These tests cover the detection
7+
behavior, not specific rendering implementations (which live downstream).
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import datajoint as dj
13+
from datajoint.spark import SparkAdapter
14+
15+
16+
class _SparkAdapterCodec:
17+
"""A minimal codec-like object that opts into the protocol."""
18+
19+
name = "fake_spark_adapter"
20+
21+
def to_spark(self, decoded, *, key=None):
22+
return list(decoded) if hasattr(decoded, "__iter__") else decoded
23+
24+
25+
class _OpaqueCodec:
26+
"""A minimal codec-like object that does NOT opt into the protocol."""
27+
28+
name = "fake_opaque"
29+
30+
def encode(self, value, *, key=None, store_name=None):
31+
return bytes(value)
32+
33+
def decode(self, stored, *, key=None):
34+
return stored
35+
36+
37+
def test_protocol_detects_opt_in():
38+
"""A class implementing ``to_spark`` is detected as a SparkAdapter."""
39+
assert isinstance(_SparkAdapterCodec(), SparkAdapter)
40+
41+
42+
def test_protocol_rejects_non_opt_in():
43+
"""A class without ``to_spark`` is not detected as a SparkAdapter."""
44+
assert not isinstance(_OpaqueCodec(), SparkAdapter)
45+
46+
47+
def test_protocol_exported_at_top_level():
48+
"""``dj.SparkAdapter`` is accessible at the top level."""
49+
assert dj.SparkAdapter is SparkAdapter
50+
51+
52+
def test_protocol_is_runtime_checkable():
53+
"""The Protocol is decorated with @runtime_checkable (the test fixtures
54+
above rely on this)."""
55+
# Direct assertion: classes lacking runtime_checkable would raise TypeError
56+
# on isinstance(). The previous tests would error rather than fail.
57+
try:
58+
isinstance(object(), SparkAdapter)
59+
except TypeError:
60+
raise AssertionError("SparkAdapter must be @runtime_checkable")
61+
62+
63+
def test_blob_codec_is_not_spark_adapter():
64+
"""The built-in <blob@> codec is intentionally non-adapting per the spec."""
65+
from datajoint.builtin_codecs.blob import BlobCodec
66+
67+
assert not isinstance(BlobCodec(), SparkAdapter)
68+
69+
70+
def test_hash_codec_is_not_spark_adapter():
71+
"""The built-in <hash@> codec is intentionally non-adapting per the spec."""
72+
from datajoint.builtin_codecs.hash import HashCodec
73+
74+
assert not isinstance(HashCodec(), SparkAdapter)
75+
76+
77+
def test_to_spark_invocation_passes_through():
78+
"""A codec implementing the method can be invoked and returns its result."""
79+
codec = _SparkAdapterCodec()
80+
assert codec.to_spark([1, 2, 3]) == [1, 2, 3]
81+
assert codec.to_spark(42) == 42
82+
83+
84+
def test_to_spark_method_accepts_key_kwarg():
85+
"""The method signature accepts the optional ``key`` keyword argument."""
86+
codec = _SparkAdapterCodec()
87+
# Should not raise
88+
codec.to_spark([1, 2, 3], key={"some_pk": 1})
89+
90+
91+
def test_subclass_adding_to_spark_becomes_adapter():
92+
"""A subclass of an opaque codec that adds the method becomes a SparkAdapter."""
93+
94+
class _OpaqueBase:
95+
name = "base"
96+
97+
def encode(self, value, *, key=None, store_name=None):
98+
return b""
99+
100+
class _TypedSubclass(_OpaqueBase):
101+
def to_spark(self, decoded, *, key=None):
102+
return decoded
103+
104+
assert not isinstance(_OpaqueBase(), SparkAdapter)
105+
assert isinstance(_TypedSubclass(), SparkAdapter)

0 commit comments

Comments
 (0)