Skip to content

Commit 52ff2ea

Browse files
Merge branch 'master' into feat/1423-diagram-trace
2 parents 03dd0a7 + ec77b47 commit 52ff2ea

11 files changed

Lines changed: 819 additions & 0 deletions

File tree

src/datajoint/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,15 @@
5151
"get_codec",
5252
"ObjectRef",
5353
"NpyRef",
54+
# SparkAdapter Codec Protocol
55+
"SparkAdapter",
5456
# Storage Adapter API
5557
"StorageAdapter",
5658
"get_storage_adapter",
5759
# Other
5860
"errors",
5961
"migrate",
62+
"deploy",
6063
"DataJointError",
6164
"ThreadSafetyError",
6265
"logger",
@@ -69,6 +72,7 @@
6972
# =============================================================================
7073
from . import errors
7174
from . import migrate
75+
from . import deploy
7276
from .codecs import (
7377
Codec,
7478
get_codec,
@@ -85,6 +89,7 @@
8589
from .instance import Instance, _ConfigProxy, _get_singleton_connection, _global_config, _check_thread_safe
8690
from .logging import logger
8791
from .objectref import ObjectRef
92+
from .spark import SparkAdapter
8893
from .storage_adapter import StorageAdapter, get_storage_adapter
8994
from .schemas import _Schema, VirtualModule, list_schemas, virtual_schema
9095
from .autopopulate import AutoPopulate

src/datajoint/adapters/postgres.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,31 @@ def enum_type_ddl(self, type_name: str, values: list[str]) -> str | None:
12801280
quoted_values = ", ".join(f"'{v}'" for v in values)
12811281
return f"CREATE TYPE {self.quote_identifier(type_name)} AS ENUM ({quoted_values})"
12821282

1283+
def replica_identity_ddl(self, full_table_name: str, mode: str) -> str:
1284+
"""
1285+
Generate ALTER TABLE ... REPLICA IDENTITY statement.
1286+
1287+
Controls how much of the old row PostgreSQL writes to WAL on UPDATE/DELETE.
1288+
``"default"`` logs only primary-key columns; ``"full"`` logs the entire row.
1289+
Required by some CDC tools (e.g. Databricks Lakehouse Sync) that need the
1290+
full pre-image to drive Slowly-Changing-Dimension history.
1291+
1292+
The ALTER is metadata-only, instant, and idempotent — re-applying the same
1293+
mode is a no-op at the storage layer.
1294+
1295+
Examples
1296+
--------
1297+
>>> adapter.replica_identity_ddl('"schema"."table"', 'full')
1298+
'ALTER TABLE "schema"."table" REPLICA IDENTITY FULL'
1299+
>>> adapter.replica_identity_ddl('"schema"."table"', 'default')
1300+
'ALTER TABLE "schema"."table" REPLICA IDENTITY DEFAULT'
1301+
"""
1302+
if mode not in ("default", "full"):
1303+
from ..errors import DataJointError
1304+
1305+
raise DataJointError(f"Unsupported replica_identity mode: {mode!r}. Expected 'default' or 'full'.")
1306+
return f"ALTER TABLE {full_table_name} REPLICA IDENTITY {mode.upper()}"
1307+
12831308
def get_pending_enum_ddl(self, schema_name: str) -> list[str]:
12841309
"""
12851310
Get DDL statements for pending enum types and clear the pending list.

src/datajoint/condition.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,20 @@ def assert_join_compatibility(
268268
lineage2 = expr2.heading[name].lineage
269269
# Semantic match requires both lineages to be non-None and equal
270270
if lineage1 is None or lineage2 is None or lineage1 != lineage2:
271+
if lineage1 is None or lineage2 is None:
272+
# Missing lineage usually means stale ~lineage rows that survived
273+
# an upgrade or a partial declare. Decoration in 2.3+ refreshes
274+
# lineage automatically, so this typically indicates a schema
275+
# that has not been re-decorated since the upgrade.
276+
raise DataJointError(
277+
f"Cannot join on attribute `{name}`: lineage missing on "
278+
f"one side ({lineage1} vs {lineage2}). This usually "
279+
f"indicates a stale `~lineage` entry from an older "
280+
f"DataJoint version or an incomplete declare. Run "
281+
f"`schema.rebuild_lineage()` to recompute lineage from "
282+
f"current FK definitions. If the lineages are genuinely "
283+
f"different, use `.proj()` to rename one of the attributes."
284+
)
271285
raise DataJointError(
272286
f"Cannot join on attribute `{name}`: "
273287
f"different lineages ({lineage1} vs {lineage2}). "

src/datajoint/deploy.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
"""
2+
Deployment-time operations for configuring an existing DataJoint pipeline.
3+
4+
This module hosts idempotent operational helpers — things you run as part of a
5+
deploy hook to configure a schema for its environment, distinct from
6+
:mod:`datajoint.migrate` which handles one-shot schema/state evolution.
7+
8+
The boundary between the two:
9+
10+
- :mod:`datajoint.migrate` — fix legacy state, evolve a schema definition,
11+
retroactive corrections. Cadence: one-shot. Examples: ``migrate_columns``,
12+
``add_job_metadata_columns``, ``rebuild_lineage``.
13+
- :mod:`datajoint.deploy` — configure an environment for a consumer's
14+
requirements (CDC tools, replication, role grants, performance tuning).
15+
Cadence: re-runnable, idempotent. Examples: :func:`set_replica_identity`.
16+
17+
Functions in this module should be safe to call repeatedly from a deploy hook
18+
without accumulating side effects.
19+
"""
20+
21+
from __future__ import annotations
22+
23+
from typing import TYPE_CHECKING, Any, Literal, Union
24+
25+
from .errors import DataJointError
26+
27+
if TYPE_CHECKING:
28+
from .schemas import _Schema
29+
from .table import Table
30+
31+
TargetType = Union["_Schema", type["Table"], "Table"]
32+
33+
34+
def set_replica_identity(
35+
target: "TargetType",
36+
mode: Literal["default", "full"] = "full",
37+
dry_run: bool = True,
38+
) -> dict:
39+
"""
40+
Apply ``ALTER TABLE ... REPLICA IDENTITY <mode>`` to a schema or table on PostgreSQL.
41+
42+
``REPLICA IDENTITY`` controls how much of the **old row** PostgreSQL writes to
43+
the write-ahead log on UPDATE/DELETE. Under ``DEFAULT``, only primary-key
44+
columns appear in WAL; under ``FULL``, the entire old row does.
45+
46+
Why this exists
47+
---------------
48+
Some change-data-capture (CDC) consumers require the full row pre-image to
49+
drive their downstream models. The canonical example is **Databricks
50+
Lakehouse Sync**: tables without ``REPLICA IDENTITY FULL`` are silently
51+
skipped by the sync — no error, just missing data downstream. Other CDC
52+
tools (Debezium, ClickHouse ClickPipes, Azure CDC) work fine with
53+
``DEFAULT`` when tables have a primary key; only Databricks mandates
54+
``FULL``.
55+
56+
This helper is the **operational** way to apply the setting. It is not a
57+
migration: there's no legacy state being fixed; the setting is simply a
58+
property of the deployment environment, and a fresh declare in a new
59+
environment may need it re-applied. It is idempotent — re-applying the
60+
same mode is a no-op at the storage layer — so it is safe to call from a
61+
deploy hook on every release.
62+
63+
Cost
64+
----
65+
The ALTER itself is metadata-only and instant, but requires a brief
66+
``AccessExclusiveLock`` on each table — it will block behind in-flight
67+
writes/reads on a busy table. Run during a quiet window on actively-
68+
ingested tables.
69+
70+
The ongoing cost is in WAL volume after the change: UPDATE/DELETE on
71+
tables with FULL log the entire old row, which can be sizable on tables
72+
with TOASTed bytea columns. For DataJoint's typical insert-append
73+
workload, this cost is negligible. The notable scenario is bulk
74+
``delete()`` on tables with ``<blob>`` columns — a transient WAL burst
75+
proportional to the deleted-row payload size.
76+
77+
Partial-failure semantics
78+
-------------------------
79+
If ``connection.query(ddl)`` raises on table N of M, the first N-1
80+
tables are already modified at the storage layer but the exception
81+
propagates without returning the partial summary. The operation is
82+
idempotent, so re-running brings the remaining tables into compliance.
83+
84+
Compliance considerations
85+
-------------------------
86+
Under ``DEFAULT``, only primary-key values appear in WAL. Under ``FULL``,
87+
entire rows do — including any PHI/PII/sensitive columns. For self-hosted
88+
PostgreSQL with unrestricted WAL access this is a real consideration; for
89+
managed PostgreSQL with logical replication confined to a specific
90+
subscriber (Lakebase, RDS), WAL stays inside the managed environment's
91+
security boundary. Apply intentionally.
92+
93+
Parameters
94+
----------
95+
target : Schema or Table
96+
A :class:`datajoint.Schema` (all user tables) or a
97+
:class:`datajoint.Table` class/instance (just that table).
98+
mode : str, default ``"full"``
99+
``"default"`` (PK only, minimal WAL) or ``"full"`` (entire row).
100+
dry_run : bool, default ``True``
101+
If True, collect the DDL statements but do not execute. Set to False
102+
to actually apply.
103+
104+
Returns
105+
-------
106+
dict
107+
- ``tables_analyzed`` (int): number of tables considered.
108+
- ``tables_modified`` (int): number of tables on which the ALTER ran.
109+
Always 0 when ``dry_run=True``.
110+
- ``ddl`` (list[str]): the DDL statements that were (or would be) executed.
111+
112+
Raises
113+
------
114+
DataJointError
115+
If the target's backend is not PostgreSQL, or if ``mode`` is not one of
116+
``"default"`` / ``"full"``.
117+
118+
Examples
119+
--------
120+
>>> from datajoint.deploy import set_replica_identity
121+
>>> # Preview
122+
>>> set_replica_identity(my_schema, mode="full", dry_run=True)
123+
{'tables_analyzed': 12, 'tables_modified': 0, 'ddl': ['ALTER TABLE "ms"."t1" REPLICA IDENTITY FULL', ...]}
124+
>>> # Apply
125+
>>> set_replica_identity(my_schema, mode="full", dry_run=False)
126+
{'tables_analyzed': 12, 'tables_modified': 12, 'ddl': [...]}
127+
>>> # Single table
128+
>>> set_replica_identity(MyTable, mode="full", dry_run=False)
129+
130+
See Also
131+
--------
132+
PostgreSQL: `Logical Replication — Replica Identity
133+
<https://www.postgresql.org/docs/current/logical-replication-publication.html>`_.
134+
Databricks: `Lakehouse Sync
135+
<https://docs.databricks.com/aws/en/oltp/projects/lakehouse-sync>`_.
136+
"""
137+
mode_normalized = mode.lower() if isinstance(mode, str) else mode
138+
if mode_normalized not in ("default", "full"):
139+
raise DataJointError(f"mode must be 'default' or 'full'; got {mode!r}")
140+
mode = mode_normalized # type: ignore[assignment]
141+
142+
from .schemas import _Schema
143+
from .table import Table
144+
145+
if isinstance(target, _Schema):
146+
connection = target.connection
147+
if connection is None:
148+
raise DataJointError("Schema has no active connection.")
149+
adapter = connection.adapter
150+
if target.database is None:
151+
raise DataJointError("Schema is not activated. Call schema.activate(...) before set_replica_identity().")
152+
tables = [adapter.make_full_table_name(target.database, t) for t in target.list_tables()]
153+
elif isinstance(target, type) and issubclass(target, Table):
154+
instance = target()
155+
connection = instance.connection
156+
if connection is None:
157+
raise DataJointError(f"Table {target.__name__} has no active connection.")
158+
adapter = connection.adapter
159+
tables = [instance.full_table_name]
160+
elif isinstance(target, Table):
161+
connection = target.connection
162+
if connection is None:
163+
raise DataJointError(f"Table {type(target).__name__} has no active connection.")
164+
adapter = connection.adapter
165+
tables = [target.full_table_name]
166+
else:
167+
raise DataJointError(f"target must be a Schema or Table class/instance; got {type(target).__name__}")
168+
169+
if not hasattr(adapter, "replica_identity_ddl"):
170+
raise DataJointError(
171+
f"set_replica_identity is PostgreSQL-only; the {adapter.backend} adapter does not support REPLICA IDENTITY."
172+
)
173+
174+
result: dict[str, Any] = {
175+
"tables_analyzed": len(tables),
176+
"tables_modified": 0,
177+
"ddl": [],
178+
}
179+
for full_name in tables:
180+
ddl = adapter.replica_identity_ddl(full_name, mode) # type: ignore[attr-defined]
181+
result["ddl"].append(ddl)
182+
if not dry_run:
183+
connection.query(ddl)
184+
result["tables_modified"] += 1
185+
return result

src/datajoint/schemas.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,23 @@ def _decorate_table(self, table_class: type, context: dict[str, Any], assert_dec
303303
if not is_declared and not assert_declared and create_tables:
304304
instance.declare(context)
305305
self.connection.dependencies.clear()
306+
elif is_declared and create_tables:
307+
# Table already exists — declare() didn't run, so _populate_lineage
308+
# didn't either. Scan the already-loaded heading for the symptom
309+
# of stale/missing lineage rows (#1454): any PK attribute with
310+
# lineage=None indicates the ~lineage table is missing rows for
311+
# this table. Only then trigger a refresh — no extra DB queries
312+
# on healthy schemas, automatic repair when the bug is present.
313+
#
314+
# Note: stale-but-non-None rows (DJ version skew that wrote a
315+
# different string format) are not auto-detected here; users hit
316+
# the tailored "rebuild_lineage" error message on first join.
317+
try:
318+
pk_lineages = [instance.heading[attr].lineage for attr in instance.primary_key]
319+
except Exception:
320+
pk_lineages = []
321+
if pk_lineages and any(lineage is None for lineage in pk_lineages):
322+
instance._refresh_lineage(context)
306323
is_declared = is_declared or instance.is_declared
307324

308325
# add table definition to the doc string

src/datajoint/spark.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
SparkAdapter Codec Protocol.
3+
4+
Opt-in contract for codecs that adapt their decoded values to Spark-native
5+
types — primitives, lists, dicts, and nested combinations.
6+
7+
Codecs implement this method when they want their column eligible for
8+
downstream typed-query systems (Spark SQL, Delta Sharing, BI tools).
9+
Generic codecs like ``<blob@>`` and ``<hash@>`` deliberately do not
10+
implement it: their decoded values can be arbitrary Python objects with
11+
no fixed Spark-native shape.
12+
13+
The contract is intentionally a Protocol rather than an abstract method
14+
on :class:`datajoint.Codec`:
15+
16+
- Generic codecs need no acknowledgement (no ``NotImplementedError`` stubs).
17+
- Existing plugin codecs continue to work unchanged.
18+
- Codec authors opt in by adding the method on their own release cadence.
19+
- Consumers detect support structurally via ``isinstance(codec, SparkAdapter)``.
20+
21+
See ``datajoint-docs/src/reference/specs/spark-adapter.md`` for the
22+
normative specification (signature, return-value shape constraints,
23+
worked codec examples).
24+
"""
25+
26+
from __future__ import annotations
27+
28+
from typing import Any, Protocol, runtime_checkable
29+
30+
31+
@runtime_checkable
32+
class SparkAdapter(Protocol):
33+
"""
34+
A codec that adapts its decoded values to Spark-native types.
35+
36+
Opt-in. Codecs implementing this method declare that their decoded
37+
values can be expressed as primitives, lists, or dicts of the same —
38+
i.e., shapes that map cleanly to Spark's ``StructType`` /
39+
``ArrayType`` / ``MapType``.
40+
41+
Consumers (e.g., a Databricks silver-layer publish pipeline) check
42+
``isinstance(codec, SparkAdapter)`` per column to determine eligibility.
43+
44+
Allowed return-value shapes:
45+
46+
- Primitives: ``bool``, ``int``, ``float``, ``str``, ``bytes``,
47+
``None``, ``datetime.date``, ``datetime.datetime``.
48+
- ``list[T]`` where ``T`` is any allowed shape (→ Spark ``ArrayType``).
49+
- ``dict[str, T]`` where ``T`` is any allowed shape (→ Spark
50+
``StructType`` or ``MapType``, consumer-decided).
51+
52+
NumPy arrays must be converted to lists; no tuples, sets, or custom
53+
objects in the return value.
54+
55+
Examples
56+
--------
57+
A 1D float-array codec (shipped as a plugin, not in datajoint-python)::
58+
59+
class FloatArrayCodec(dj.Codec):
60+
name = "float_array"
61+
62+
def encode(self, value, *, key=None, store_name=None): ...
63+
def decode(self, stored, *, key=None) -> np.ndarray: ...
64+
65+
def to_spark(self, decoded: np.ndarray, *, key=None) -> list[float]:
66+
return decoded.tolist() # → Spark ARRAY<DOUBLE>
67+
68+
Eligibility check::
69+
70+
from datajoint import SparkAdapter
71+
isinstance(FloatArrayCodec(), SparkAdapter) # True
72+
"""
73+
74+
def to_spark(self, decoded: Any, *, key: dict | None = None) -> Any:
75+
"""
76+
Adapt a decoded codec value to a Spark-native shape.
77+
78+
Parameters
79+
----------
80+
decoded : Any
81+
The Python value produced by the codec's ``decode()``.
82+
key : dict, optional
83+
Optional context dict — same shape as ``Codec.encode``'s
84+
``key`` parameter. Most codecs ignore it.
85+
86+
Returns
87+
-------
88+
Any
89+
A value composed entirely of allowed Spark-native shapes
90+
(see class docstring).
91+
"""
92+
...

0 commit comments

Comments
 (0)